Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from gradio_client import Client | |
| import json | |
| import re | |
| from moviepy.editor import VideoFileClip | |
| from moviepy.audio.AudioClip import AudioClip | |
def extract_audio(video_in):
    """Extract the audio track of a video file and save it as ``audio.wav``.

    Args:
        video_in: Path of the video file to read; it is passed unchanged to
            ``VideoFileClip``.

    Returns:
        The path of the written WAV file. This is always ``'audio.wav'``
        (relative to the current working directory), so successive calls
        overwrite the same file.

    Raises:
        ValueError: If the video clip exposes no audio track.
    """
    output_audio = 'audio.wav'

    # Open the video file. The clip is closed in ``finally`` so whatever
    # reader resources it holds are released even if the export fails.
    video_clip = VideoFileClip(video_in)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            # Fail with an explicit message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in video: {video_in}")

        # Save the audio as a .wav file, using 44100 Hz as the sample rate.
        audio_clip.write_audiofile(output_audio, fps=44100)
    finally:
        video_clip.close()

    print("Audio extraction complete.")
    return output_audio
def _clean_kosmos_caption(full_sentence):
    """Strip the instruction prefix from a caption and trim it to its last full stop."""
    # Expected format: "Describe this image in detail:" followed by optional
    # whitespace and then the actual description.
    pattern = r'^Describe this image in detail:\s*(.*)$'
    match = re.search(pattern, full_sentence)
    if match:
        description = match.group(1)
        print(description)
    else:
        print("Unable to locate valid description.")
        # Fall back to the raw text so the caller still gets a caption
        # instead of crashing on an unbound ``description``.
        description = full_sentence

    # Drop any trailing partial sentence after the last ".".
    last_period_index = description.rfind('.')
    if last_period_index == -1:
        # No period at all: keep the text as-is rather than slicing it down
        # to an empty string.
        return description
    return description[:last_period_index + 1]


def get_caption_from_kosmos(image_in):
    """Request a detailed caption for an image from the Kosmos-2 Space.

    The Space is queried through ``gradio_client``. The second element of its
    result is opened as a JSON file holding a list of entries; the first item
    of each entry is a text fragment, and the fragments are joined with spaces
    to rebuild the sentence.

    Args:
        image_in: Filepath or URL of the image, forwarded to the Space.

    Returns:
        The description with the "Describe this image in detail:" prefix
        removed (when present) and truncated after its last period (when it
        contains one).
    """
    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
    kosmos2_result = kosmos2_client.predict(
        image_in,    # str (filepath or URL to image) in 'Test Image' Image component
        "Detailed",  # str in 'Description Type' Radio component
        fn_index=4
    )
    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    with open(kosmos2_result[1], 'r', encoding="utf-8") as f:
        data = json.load(f)

    full_sentence = ' '.join(sublist[0] for sublist in data)

    truncated_caption = _clean_kosmos_caption(full_sentence)
    print(f"\n—\nIMAGE CAPTION: {truncated_caption}")
    return truncated_caption
def get_caption(image_in):
    """Ask the moondream1 Space to describe a fitting sound effect for an image.

    Args:
        image_in: Filepath of the image, sent as the 'image' Image component.

    Returns:
        Whatever the Space's ``/answer_question`` endpoint returns for the
        fixed question below; the value is also printed.
    """
    # str for the 'Question' Textbox component
    question = "provided the given image caption, generate a one sentence long description of an appropriate sound effect for the context"

    moondream = Client("https://vikhyatk-moondream1.hf.space/")
    answer = moondream.predict(image_in, question, api_name="/answer_question")

    print(answer)
    return answer
def get_audioldm(prompt):
    """Generate a sound effect from a text prompt with the AudioLDM-2 Space.

    Args:
        prompt: Text sent as the first input of the Space endpoint.

    Returns:
        The path returned by ``extract_audio`` for the Space's result.
    """
    audioldm = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")

    # Positional inputs for the endpoint at fn_index=1.
    # NOTE(review): the meaning of the values after the prompt is not visible
    # here (presumably negative prompt, duration and sampling settings) -
    # confirm against the Space's API page.
    request_args = (prompt, "low quality", 10, 3.5, 45, 3)
    generated = audioldm.predict(*request_args, fn_index=1)
    print(generated)

    # The result is handed to extract_audio, so it is presumably a video file
    # path - TODO confirm.
    return extract_audio(generated)
def infer(image_in, chosen_model):
    """Caption an image, then generate a sound effect with the chosen model.

    Args:
        image_in: Image passed to ``get_caption``.
        chosen_model: One of "MAGNet", "AudioLDM-2" or "AudioGen".

    Returns:
        The result of the selected generator for the caption.

    Raises:
        gr.Error: If ``chosen_model`` is not one of the supported names.
            Previously this case silently returned ``None`` after the
            caption request had already been made.
    """
    supported_models = ("MAGNet", "AudioLDM-2", "AudioGen")
    if chosen_model not in supported_models:
        # Validate before calling the remote captioner, and surface the
        # problem in the UI instead of returning an empty output.
        raise gr.Error(f"Unsupported model: {chosen_model!r}")

    caption = get_caption(image_in)

    # NOTE(review): get_magnet and get_audiogen are not defined in the visible
    # part of this file; they are only looked up when their branch is taken.
    if chosen_model == "MAGNet":
        return get_magnet(caption)
    if chosen_model == "AudioLDM-2":
        return get_audioldm(caption)
    return get_audiogen(caption)
# Custom CSS for the page: centers the main column and caps its width at 800px.
css="""
#col-container{
margin: 0 auto;
max-width: 800px;
}
"""

# Build the Gradio UI: image input, model selector and submit button, audio output.
# NOTE(review): the nesting of the layout blocks below was reconstructed from
# a paste that lost its indentation - confirm against the deployed layout.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Page title and short description.
        gr.HTML("""
<h2 style="text-align: center;">
Image to SFX
</h2>
<p style="text-align: center;">
Compare MAGNet, AudioLDM2 and AudioGen sound effects generation from image caption.
</p>
""")
        with gr.Column():
            # The image is passed to the callback as a filepath; "doggy.jpg" is
            # the preloaded default value.
            image_in = gr.Image(sources=["upload"], type="filepath", label="Image input", value="doggy.jpg")
            with gr.Row():
                # Only AudioLDM-2 is offered as a choice here.
                chosen_model = gr.Radio(label="Choose a model", choices=["AudioLDM-2"], value="AudioLDM-2")
                submit_btn = gr.Button("Submit")
        with gr.Column():
            audio_o = gr.Audio(label="Audio output")
    # Clicking the button runs infer(image, model) and sends its result to the
    # audio output component.
    submit_btn.click(
        fn=infer,
        inputs=[image_in, chosen_model],
        outputs=[audio_o],
        concurrency_limit = 4
    )

# Enable the request queue (max_size=10) and start the app with debug=True.
demo.queue(max_size=10).launch(debug=True)