Text-Extractor / app.py
rajaramesh's picture
Add application file
6dcca02
raw
history blame
3.69 kB
import subprocess
from urllib.parse import parse_qs, urlparse

import gradio as gr
import requests
from langchain_community.document_loaders import UnstructuredURLLoader
from youtube_transcript_api import YouTubeTranscriptApi
def _youtube_video_id(url: str) -> str:
    """Return the video id carried by a YouTube URL, or "" if there is none.

    Recognised shapes:
      * ``https://www.youtube.com/watch?v=<id>&t=30s`` (any extra query params)
      * ``https://youtu.be/<id>?t=30``
      * ``https://www.youtube.com/shorts/<id>``, ``/embed/<id>``, ``/live/<id>``
    """
    parsed = urlparse(url.strip())
    host = (parsed.hostname or "").lower()
    segments = [part for part in parsed.path.split("/") if part]

    if host == "youtu.be":
        return segments[0] if segments else ""

    if host == "youtube.com" or host.endswith(".youtube.com"):
        # Read `v` from the parsed query so trailing parameters such as
        # `&t=30s` or `&list=...` never leak into the id.
        watch_ids = parse_qs(parsed.query).get("v")
        if watch_ids:
            return watch_ids[0]
        if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live"):
            return segments[1]

    return ""


def text_extract(generic_url: str) -> str:
    """
    Extract the text from any website or youtube video.

    Args:
        generic_url: the url of the website or youtube video to extract text from

    Returns:
        str: the video transcript (entries joined by spaces) for a youtube
        video url, the page text with blank lines removed for any other url,
        or a human-readable error message when the input is empty or the
        extraction fails.
    """
    # Treat None the same as an empty / whitespace-only string.
    if not generic_url or not generic_url.strip():
        print("Please provide the information to get started")
        return "Please provide the information to get started"

    generic_url = generic_url.strip()
    try:
        # Reachability probe: an unreachable or malformed url raises here and
        # is reported by the handlers below. A non-200 status is only logged;
        # extraction is still attempted.
        response = requests.get(generic_url, timeout=5)
        if response.status_code == 200:
            print("URL is valid and reachable.")
        else:
            print("Unable to reach")

        video_id = _youtube_video_id(generic_url)
        if video_id:
            transcript = YouTubeTranscriptApi.get_transcript(video_id=video_id)
            return " ".join(entry["text"] for entry in transcript)

        # NOTE(review): ssl_verify=False disables TLS certificate checks for
        # the page download. Kept from the original, but worth confirming it
        # is intentional for a publicly exposed tool.
        loader = UnstructuredURLLoader(
            urls=[generic_url],
            ssl_verify=False,
            headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"},
        )
        docs = loader.load()
        if not docs:
            # An empty result would otherwise surface as the cryptic
            # "list index out of range" from docs[0].
            print("No text could be extracted from the url")
            return "No text could be extracted from the provided url"

        text = "\n".join(doc.page_content for doc in docs)
        cleaned_lines = [line.strip() for line in text.split("\n") if line.strip()]
        return "\n".join(cleaned_lines)
    except requests.exceptions.ConnectionError as e:
        print("Error reaching the URL:", e)
        return "Pls enter valid url we have encounterd ConnectionError\n" + str(e)
    except requests.exceptions.RequestException as e:
        print("Error reaching the URL:", e)
        return "Pls enter valid url we have encounterd RequestException\n" + str(e)
    except Exception as e:
        # Top-level boundary for the UI: report the failure as text rather
        # than letting it propagate into the Gradio handler.
        print(f"Exception:{e}")
        return "We have encounterd the following error\n" + str(e)
def terminal(command: str) -> str:
    """Return a fixed notice explaining that this terminal is a dummy.

    No command is ever executed; the argument is accepted only so the
    function has the shape of a terminal tool.

    Args:
        command: The command the caller would like to run (ignored)

    Returns:
        A markdown-formatted notice string
    """
    notice_lines = (
        "# Hey you are accessing a dummy terminal. ",
        "- Its very dangerous to exposing a terminal as a tool to public. ",
        "- If you want this terminal tool working in action, then checkout my youtube video: ",
    )
    return "\n".join(notice_lines)
# One gr.Interface per tool; each becomes a tab in the combined app below.

# Tab 1: url in, extracted plain text out.
text_extract_fn = gr.Interface(
    title="Text Extractor",
    description="Extract the text from any website or youtube video.",
    fn=text_extract,
    inputs=gr.Textbox(placeholder="Paste any website or youtube video url"),
    outputs=gr.Textbox(placeholder="Text extracted from website or youtube video"),
)

# Tab 2: command in, markdown notice out (flag button disabled).
terminal_fn = gr.Interface(
    title="Shell Server",
    description="Runs the shell commands on your computer.",
    fn=terminal,
    inputs=gr.Textbox(placeholder="Enter you command"),
    outputs="markdown",
    flagging_mode="never",
)

# Present both interfaces side by side as tabs.
demo = gr.TabbedInterface(
    interface_list=[text_extract_fn, terminal_fn],
    tab_names=["Text Extractor", "Command Terminal"],
)

if __name__ == "__main__":
    # Launch the web UI with the MCP server option enabled.
    demo.launch(mcp_server=True)