ggoknar committed
Commit d3d83c1
Parent(s): f34dc34

Fixed STT to TTS and uses streaming TTS
app.py CHANGED
@@ -19,6 +19,7 @@ from scipy.io.wavfile import write
 from pydub import AudioSegment
 import ffmpeg
 
+import io, wave
 import librosa
 import torchaudio
 from TTS.api import TTS
@@ -31,7 +32,6 @@ from TTS.utils.generic_utils import get_user_data_dir
 # Could not make play audio next work seemlesly on current Gradio with autoplay so this is a workaround
 AUDIO_WAIT_MODIFIER = float(os.environ.get("AUDIO_WAIT_MODIFIER", 1))
 
-
 # This will trigger downloading model
 print("Downloading if not downloaded Coqui XTTS V1")
 tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
@@ -68,7 +68,7 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 # will use api to restart space on a unrecoverable error
 api = HfApi(token=HF_TOKEN)
 
-repo_id = "
+repo_id = "coqui/voice-chat-with-lama"
 
 default_system_message = """
 You are Mistral, a large language model trained and provided by Mistral, architecture of you is decoder-based LM. Your voice backend or text to speech TTS backend is provided via Coqui technology. You are right now served on Huggingface spaces.
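Here `repo_id` is pointed at the Space itself so that the existing `api.restart_space(...)` call, used in the error handlers further down, restarts the right repository. A minimal sketch of that recovery pattern, assuming an `HF_TOKEN` with write access to the Space (the `recover_from_cuda_assert` helper is illustrative, not part of the commit):

```python
import os

from huggingface_hub import HfApi

HF_TOKEN = os.environ.get("HF_TOKEN")
api = HfApi(token=HF_TOKEN)
repo_id = "coqui/voice-chat-with-lama"  # the Space itself, as set in this commit


def recover_from_cuda_assert(error: RuntimeError) -> None:
    # A CUDA device-side assert cannot be recovered in-process,
    # so the whole Space is restarted through the Hub API instead.
    if "device-side assert" in str(error):
        api.restart_space(repo_id=repo_id)
    else:
        raise error
```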
@@ -97,12 +97,15 @@ import numpy as np
 from gradio_client import Client
 from huggingface_hub import InferenceClient
 
-
+WHISPER_TIMEOUT = int(os.environ.get("WHISPER_TIMEOUT", 30))
 # This client is down
 # whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
 # Replacement whisper client, it may be time limited
 whisper_client = Client("https://sanchit-gandhi-whisper-jax.hf.space")
-text_client = InferenceClient(
+text_client = InferenceClient(
+    "mistralai/Mistral-7B-Instruct-v0.1",
+    timeout=WHISPER_TIMEOUT,
+)
 
 
 ###### COQUI TTS FUNCTIONS ######
@@ -180,13 +183,17 @@ def generate(
 
 
 def transcribe(wav_path):
-    [old transcribe body (7 lines) not captured in this view]
+    try:
+        # get first element from whisper_jax and strip it to delete begin and end space
+        return whisper_client.predict(
+            wav_path,  # str (filepath or URL to file) in 'inputs' Audio component
+            "transcribe",  # str in 'Task' Radio component
+            False,  # return_timestamps=False for whisper-jax https://gist.github.com/sanchit-gandhi/781dd7003c5b201bfe16d28634c8d4cf#file-whisper_jax_endpoint-py
+            api_name="/predict",
+        )[0].strip()
+    except:
+        gr.Warning("There was a problem with Whisper endpoint, telling a joke for you.")
+        return "There was a problem with my voice, tell me joke"
 
 
 # Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.
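The rewritten `transcribe` forwards the recorded file to the public whisper-jax Space through `gradio_client` and keeps only the text part of the response. A standalone sketch of the same call outside the app (the Space URL and argument order come from the diff; `sample.wav` is a hypothetical local recording):

```python
from gradio_client import Client

# Same public whisper-jax Space used in the commit; it may be rate limited.
whisper_client = Client("https://sanchit-gandhi-whisper-jax.hf.space")


def transcribe(wav_path: str) -> str:
    # The first element of the /predict response is the transcript text.
    result = whisper_client.predict(
        wav_path,      # filepath or URL for the 'inputs' Audio component
        "transcribe",  # 'Task' radio choice
        False,         # return_timestamps
        api_name="/predict",
    )
    return result[0].strip()


if __name__ == "__main__":
    print(transcribe("sample.wav"))  # hypothetical recording
```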
@@ -257,6 +264,73 @@ def get_voice(prompt, language, latent_tuple, suffix="0"):
     return wav_filename
 
 
+def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
+    # This will create a wave header then append the frame input
+    # It should be first on a streaming wav file
+    # Other frames better should not have it (else you will hear some artifacts each chunk start)
+    wav_buf = io.BytesIO()
+    with wave.open(wav_buf, "wb") as vfout:
+        vfout.setnchannels(channels)
+        vfout.setsampwidth(sample_width)
+        vfout.setframerate(sample_rate)
+        vfout.writeframes(frame_input)
+
+    wav_buf.seek(0)
+    return wav_buf.read()
+
+
+def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
+    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
+    try:
+        t0 = time.time()
+        chunks = model.inference_stream(
+            prompt,
+            language,
+            gpt_cond_latent,
+            speaker_embedding,
+        )
+
+        first_chunk = True
+        for i, chunk in enumerate(chunks):
+            if first_chunk:
+                first_chunk_time = time.time() - t0
+                metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
+                first_chunk = False
+            print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+
+            # In case output is required to be multiple voice files
+            # out_file = f'{char}_{i}.wav'
+            # write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
+            # audio = AudioSegment.from_file(out_file)
+            # audio.export(out_file, format='wav')
+            # return out_file
+            # directly return chunk as bytes for streaming
+            chunk = chunk.detach().cpu().numpy().squeeze()
+            chunk = (chunk * 32767).astype(np.int16)
+
+            yield chunk.tobytes()
+
+    except RuntimeError as e:
+        if "device-side assert" in str(e):
+            # cannot do anything on cuda device side error, need tor estart
+            print(
+                f"Exit due to: Unrecoverable exception caused by prompt:{sentence}",
+                flush=True,
+            )
+            gr.Warning("Unhandled Exception encounter, please retry in a minute")
+            print("Cuda device-assert Runtime encountered need restart")
+
+            # HF Space specific.. This error is unrecoverable need to restart space
+            api.restart_space(repo_id=repo_id)
+        else:
+            print("RuntimeError: non device-side assert error:", str(e))
+            gr.Warning("Unhandled Exception encounter, please retry in a minute")
+            return None
+        return None
+    except:
+        return None
+
+
 def get_sentence(history, system_prompt=""):
     history = [] if history is None else history
 
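`get_voice_streaming` yields raw 16-bit PCM bytes, so whoever consumes it must prepend exactly one WAV header (that is what `wave_header_chunk` is for) and then append frames with no further headers. A self-contained sketch of that assembly, with a synthetic sine wave standing in for the XTTS chunks; the helper names and the output filename are illustrative, and sample rate and width match the defaults above:

```python
import io
import wave

import numpy as np

SAMPLE_RATE = 24000   # matches wave_header_chunk's default
SAMPLE_WIDTH = 2      # int16


def wav_header(sample_rate=SAMPLE_RATE, sample_width=SAMPLE_WIDTH, channels=1) -> bytes:
    # Header-only WAV, same trick as wave_header_chunk in the diff.
    buf = io.BytesIO()
    with wave.open(buf, "wb") as w:
        w.setnchannels(channels)
        w.setsampwidth(sample_width)
        w.setframerate(sample_rate)
        w.writeframes(b"")
    buf.seek(0)
    return buf.read()


def fake_pcm_chunks(seconds=1.0, chunk_size=4096):
    # Stand-in for model.inference_stream: int16 PCM bytes, chunk by chunk.
    t = np.linspace(0, seconds, int(seconds * SAMPLE_RATE), endpoint=False)
    pcm = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16).tobytes()
    for start in range(0, len(pcm), chunk_size):
        yield pcm[start : start + chunk_size]


stream = wav_header()      # header exactly once, before the first chunk
for chunk in fake_pcm_chunks():
    stream += chunk        # raw frames only; extra headers would cause clicks

# The header's length fields stay at zero (no frames were written through the
# wave module), but most decoders simply read to end of file, which appears to
# be what the streaming playback relies on here.
with open("streamed.wav", "wb") as f:
    f.write(stream)
```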
@@ -322,55 +396,33 @@ def generate_speech(history):
         try:
             # generate speech using precomputed latents
             # This is not streaming but it will be fast
-            wav = get_voice(
+            # wav = get_voice(sentence,language, latent_map["Female_Voice"], suffix=len(wav_list))
+            audio_stream = get_voice_streaming(
                 sentence, language, latent_map["Female_Voice"], suffix=len(wav_list)
             )
-            [three removed lines not captured in this view]
+            wav_chunks = wave_header_chunk()
+            frame_length = 0
+            for chunk in audio_stream:
+                try:
+                    wav_chunks += chunk
+                    frame_length += len(chunk)
+                except:
+                    # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
+                    continue
+
+            wav_list.append(wav_chunks)
+            yield (gr.Audio.update(value=wav_chunks, autoplay=True), history)
+
+            # Streaming wait time calculation
+            # audio_length = frame_length / sample_width/ frame_rate
+            wait_time = frame_length / 2 / 24000 + 0.5  # plus 500ms
+
+            # for non streaming
+            # wait_time= librosa.get_duration(path=wav)
+
             wait_time = AUDIO_WAIT_MODIFIER * wait_time
             print("Sleeping till audio end")
             time.sleep(wait_time)
-
-            # Replace inside try with below to use streaming, though not perfectly working as each it will multiprocess with mistral generation
-            # And would produce artifacts
-            # giving sentence suffix so we can merge all to single audio at end
-            # On mobile there is no autoplay support due to mobile security!
-            """
-            t_inference = time.time()
-            chunks = model.inference_stream(
-                sentence,
-                language,
-                latent_map["Female_Voice"][0],
-                latent_map["Female_Voice"][2],)
-
-            first_chunk=True
-            wav_chunks=[]
-            for i, chunk in enumerate(chunks):
-                if first_chunk:
-                    first_chunk_time = time.time() - t_inference
-                    print(f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n")
-                    first_chunk=False
-
-                wav_chunks.append(chunk)
-                print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
-
-                out_file = f'{i}.wav'
-                write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
-                audio = AudioSegment.from_file(out_file)
-                audio.export(out_file, format='wav')
-
-                yield (gr.Audio.update(value=out_file,autoplay=True) , history)
-                #chunk sleep else next sentence may come in fast
-                wait_time= librosa.get_duration(path=out_file)
-                time.sleep(wait_time)
-
-            wav = torch.cat(wav_chunks, dim=0)
-            filename= f"output_{len(wav_list)}.wav"
-            torchaudio.save(filename, wav.squeeze().unsqueeze(0).cpu(), 24000)
-            wav_list.append(filename)
-            """
-
         except RuntimeError as e:
             if "device-side assert" in str(e):
                 # cannot do anything on cuda device side error, need tor estart
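The sleep before the next sentence is now computed from the number of streamed bytes instead of a file on disk: 16-bit mono audio at 24 kHz is 2 × 24000 bytes per second, and the diff adds a 500 ms cushion. A small worked version of that arithmetic (the byte total is made up; the constants match the diff):

```python
SAMPLE_RATE = 24000   # Hz, XTTS output rate used throughout the diff
SAMPLE_WIDTH = 2      # bytes per int16 sample


def estimated_wait(frame_length: int, margin: float = 0.5) -> float:
    # audio_length = frame_length / sample_width / frame_rate, plus a safety margin
    return frame_length / SAMPLE_WIDTH / SAMPLE_RATE + margin


# e.g. streamed chunks totalling 96,000 bytes == 2.0 s of audio
print(estimated_wait(96_000))  # -> 2.5 (2.0 s + 0.5 s margin)
```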
@@ -387,6 +439,40 @@ def generate_speech(history):
                 print("RuntimeError: non device-side assert error:", str(e))
                 raise e
 
+# Spoken on autoplay everysencen now produce a concataned one at the one
+# requires pip install ffmpeg-python
+
+# files_to_concat= [ffmpeg.input(w) for w in wav_list]
+# combined_file_name="combined.wav"
+# ffmpeg.concat(*files_to_concat,v=0, a=1).output(combined_file_name).run(overwrite_output=True)
+# final_audio.update(value=combined_file_name, visible=True)
+# yield (combined_file_name, history)
+
+
+css = """
+.bot .chatbot p {
+    overflow: hidden; /* Ensures the content is not revealed until the animation */
+    //border-right: .15em solid orange; /* The typwriter cursor */
+    white-space: nowrap; /* Keeps the content on a single line */
+    margin: 0 auto; /* Gives that scrolling effect as the typing happens */
+    letter-spacing: .15em; /* Adjust as needed */
+    animation:
+        typing 3.5s steps(40, end);
+        blink-caret .75s step-end infinite;
+}
+
+/* The typing effect */
+@keyframes typing {
+    from { width: 0 }
+    to { width: 100% }
+}
+
+/* The typewriter cursor effect */
+@keyframes blink-caret {
+    from, to { border-color: transparent }
+    50% { border-color: orange; }
+}
+"""
 
 with gr.Blocks(title=title) as demo:
     gr.Markdown(DESCRIPTION)
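The commented-out block above sketches how the per-sentence clips collected in `wav_list` could later be merged into one file with ffmpeg-python. A hedged standalone version of that idea (assumes `pip install ffmpeg-python` plus an ffmpeg binary on PATH; the helper name and file names are placeholders):

```python
import ffmpeg  # ffmpeg-python bindings, as noted in the commit's comment


def concat_wavs(wav_paths, out_path="combined.wav"):
    # Audio-only concat (v=0, a=1) of the per-sentence clips into a single file.
    inputs = [ffmpeg.input(p) for p in wav_paths]
    ffmpeg.concat(*inputs, v=0, a=1).output(out_path).run(overwrite_output=True)
    return out_path


# e.g. the per-sentence outputs produced during one chat turn (placeholder names)
# concat_wavs(["output_0.wav", "output_1.wav", "output_2.wav"])
```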
@@ -410,12 +496,14 @@ with gr.Blocks(title=title) as demo:
 
     with gr.Row():
         audio = gr.Audio(
-            [removed line not captured in this view]
+            label="Generated audio response",
             streaming=False,
             autoplay=False,
-            [removed line not captured in this view]
+            interactive=True,
             show_label=True,
         )
+        # TODO add a second audio that plays whole sentences (for mobile especially)
+        # final_audio = gr.Audio(label="Final audio response", streaming=False, autoplay=False, interactive=False,show_label=True, visible=False)
 
 
     clear_btn = gr.ClearButton([chatbot, audio])
@@ -432,7 +520,7 @@ with gr.Blocks(title=title) as demo:
     txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
 
     file_msg = btn.stop_recording(
-        add_file, [chatbot, btn], [chatbot], queue=False
+        add_file, [chatbot, btn], [chatbot, txt], queue=False
    ).then(generate_speech, chatbot, [audio, chatbot])
 
     gr.Markdown(