Spaces:

Bton
/

Image2AudioButBetter

Runtime error

App Files Files Community

Image2AudioButBetter / app.py

Bton

Update app.py

953ffb1 verified almost 2 years ago

raw

history blame contribute delete

4.16 kB

	import json
	import re
	import tempfile

	import gradio as gr
	from gradio_client import Client
	from moviepy.audio.AudioClip import AudioClip
	from moviepy.editor import VideoFileClip

	def extract_audio(video_in):
	    """Extract the audio track of a video file into a new .wav file.

	    Args:
	        video_in: Path of the input video file.

	    Returns:
	        Path of the .wav file that was written (sampled at 44100 Hz).

	    Raises:
	        ValueError: If the video has no audio track.
	    """
	    video_clip = VideoFileClip(video_in)
	    try:
	        audio_clip = video_clip.audio
	        if audio_clip is None:
	            raise ValueError(f"No audio track found in {video_in!r}")

	        # Use a unique output path per call: a fixed file name would be
	        # overwritten when several requests are processed concurrently.
	        # The file is intentionally kept (delete=False) so the caller can
	        # read it after this function returns.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	            output_audio = tmp.name

	        # Use 44100 Hz as the sample rate for .wav files
	        audio_clip.write_audiofile(output_audio, fps=44100)
	    finally:
	        # Always release the reader held by the clip, even on failure.
	        video_clip.close()

	    print("Audio extraction complete.")
	    return output_audio

	def get_caption_from_kosmos(image_in):
	    """Request a detailed caption for an image from the Kosmos-2 Space.

	    The second element of the Space's result is opened as a JSON file;
	    the first item of each entry in it is joined with spaces to rebuild
	    the full sentence. The leading "Describe this image in detail:"
	    prompt is stripped and the text is cut after its last period.

	    Args:
	        image_in: Filepath or URL of the image to describe.

	    Returns:
	        The caption, truncated after its last period. If the caption
	        contains no period it is returned untruncated.
	    """
	    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")

	    kosmos2_result = kosmos2_client.predict(
	        image_in,  # str (filepath or URL to image) in 'Test Image' Image component
	        "Detailed",  # str in 'Description Type' Radio component
	        fn_index=4,
	    )

	    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

	    with open(kosmos2_result[1], "r", encoding="utf-8") as f:
	        data = json.load(f)

	    full_sentence = " ".join(sublist[0] for sublist in data)

	    # Match the prompt prefix, optional whitespace, then the whole
	    # remainder of the text (DOTALL so embedded newlines are kept).
	    pattern = r"^Describe this image in detail:\s*(.*)$"
	    match = re.search(pattern, full_sentence, flags=re.DOTALL)
	    if match:
	        description = match.group(1)
	        print(description)
	    else:
	        print("Unable to locate valid description.")
	        # Fall back to the raw text rather than failing on an
	        # unbound variable below.
	        description = full_sentence

	    # Truncate the string up to the last period; when there is no
	    # period, keep the text as-is instead of returning an empty string.
	    last_period_index = description.rfind(".")
	    if last_period_index != -1:
	        truncated_caption = description[: last_period_index + 1]
	    else:
	        truncated_caption = description

	    print(f"\n—\nIMAGE CAPTION: {truncated_caption}")

	    return truncated_caption

	def get_caption(image_in):
	    """Ask the moondream1 Space a fixed question about an image.

	    Args:
	        image_in: Filepath of the image, sent to the Space's 'image'
	            Image component.

	    Returns:
	        Whatever the Space's "/answer_question" endpoint returns; it is
	        also printed.
	    """
	    # Sent to the 'Question' Textbox component of the Space.
	    question = "provided the given image caption, generate a one sentence long description of an appropriate sound effect for the context"
	    moondream = Client("https://vikhyatk-moondream1.hf.space/")
	    answer = moondream.predict(image_in, question, api_name="/answer_question")
	    print(answer)
	    return answer

	def get_audioldm(prompt):
	    """Generate audio for a text prompt through the AudioLDM-2 Space.

	    The Space's result is printed and then handed to ``extract_audio``,
	    whose return value is passed back to the caller.

	    Args:
	        prompt: Text forwarded as the first argument of the Space call.

	    Returns:
	        The value returned by ``extract_audio`` for the Space's result.
	    """
	    audioldm = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
	    # Positional arguments for endpoint fn_index=1, in order.
	    # NOTE(review): the meaning of each constant is defined by the remote
	    # Space and is not visible here - confirm against its API page.
	    generation_args = (prompt, "low quality", 10, 3.5, 45, 3)
	    generated = audioldm.predict(*generation_args, fn_index=1)
	    print(generated)
	    return extract_audio(generated)

	def infer(image_in, chosen_model):
	    """Caption an image, then generate audio with the selected model.

	    Args:
	        image_in: Image passed on to ``get_caption``.
	        chosen_model: One of "MAGNet", "AudioLDM-2" or "AudioGen".

	    Returns:
	        The result of the matching generator function, or ``None`` when
	        ``chosen_model`` is none of the names above.
	    """
	    # The caption is always computed first, whatever model is chosen.
	    caption = get_caption(image_in)

	    # NOTE(review): get_magnet and get_audiogen are not defined in the
	    # visible part of this file - confirm they exist before exposing
	    # those choices. They are only looked up when their branch is taken.
	    if chosen_model == "MAGNet":
	        return get_magnet(caption)
	    if chosen_model == "AudioLDM-2":
	        return get_audioldm(caption)
	    if chosen_model == "AudioGen":
	        return get_audiogen(caption)
	    return None

	# Page-level CSS handed to gr.Blocks below: horizontally centers the
	# element with id "col-container" and caps its width at 800px.
	css="""
	#col-container{
	margin: 0 auto;
	max-width: 800px;
	}
	"""

	# Build the Gradio interface and bind it to the module-level name `demo`.
	# NOTE(review): the indentation of this section was lost, so the exact
	# nesting of the containers below cannot be read from the text - confirm
	# against the original file.
	with gr.Blocks(css=css) as demo:
	# Column carrying the elem_id that the CSS above targets.
	with gr.Column(elem_id="col-container"):
	# Static title and description shown on the page.
	gr.HTML("""
	<h2 style="text-align: center;">
	Image to SFX
	</h2>
	<p style="text-align: center;">
	Compare MAGNet, AudioLDM2 and AudioGen sound effects generation from image caption.
	</p>
	""")

	# Inputs: an uploaded image (passed on as a filepath, pre-filled with
	# "doggy.jpg"), a model selector and the submit button.
	with gr.Column():
	image_in = gr.Image(sources=["upload"], type="filepath", label="Image input", value="doggy.jpg")
	with gr.Row():
	# Only "AudioLDM-2" is offered as a choice, although infer() also
	# has branches for "MAGNet" and "AudioGen".
	chosen_model = gr.Radio(label="Choose a model", choices=["AudioLDM-2"], value="AudioLDM-2")
	submit_btn = gr.Button("Submit")
	# Output: the generated audio.
	with gr.Column():
	audio_o = gr.Audio(label="Audio output")

	# Clicking the button calls infer(image_in, chosen_model) and sends its
	# return value to the audio output component.
	submit_btn.click(
	fn=infer,
	inputs=[image_in, chosen_model],
	outputs=[audio_o],
	concurrency_limit = 4
	)

	# Enable the request queue with max_size=10 and start the app with
	# debug=True. This runs at import time (there is no __main__ guard).
	demo.queue(max_size=10).launch(debug=True)