# Par-ity_Project / article_extractor.py
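"""Utilities for extracting, cleaning, and chunking news article text.

Pipeline: extract_article_text() downloads and parses each URL with
newspaper3k, clean_text() normalizes the raw text, and chunk_text() splits
it into overlapping word chunks; process_articles() wires the steps together.
"""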
import re
import time
from typing import List, Optional
from urllib.parse import urlparse

import pandas as pd
from newspaper import Article
def extract_article_text(urls: List[str]) -> pd.DataFrame:
    """Download and parse each URL, returning one row per successfully extracted article."""
    articles = []
    for url in urls:
        try:
            article = Article(url)
            article.download()
            article.parse()
            articles.append({
                'url': url,
                'title': article.title,
                'text': article.text,
                'authors': article.authors,
                'publish_date': article.publish_date,
                'source': urlparse(url).netloc,  # Domain, e.g. "www.example.com"
            })
            time.sleep(1)  # Be respectful to servers
        except Exception as e:
            print(f"Failed to extract {url}: {e}")
    return pd.DataFrame(articles)
def clean_text(text: str) -> str:
    """Collapse whitespace and strip characters outside a basic word/punctuation set."""
    text = re.sub(r'\s+', ' ', text)          # Collapse runs of whitespace to single spaces
    text = re.sub(r'[^\w\s.,!?-]', '', text)  # Keep only word chars, spaces, and basic punctuation
    return text.strip()
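# e.g. clean_text("Hello\n\n  world!!  @#$") -> "Hello world!!"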
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """Split text into overlapping chunks of up to `chunk_size` words."""
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    chunks = []
    # Step by (chunk_size - overlap) so each chunk repeats the tail of the previous one
    for i in range(0, len(words), chunk_size - overlap):
        chunks.append(' '.join(words[i:i + chunk_size]))
    return chunks
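# With the defaults (chunk_size=1000, overlap=200), chunks start at word offsets
# 0, 800, 1600, ..., so consecutive chunks share 200 words of context.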
def process_articles(urls: List[str], save_path: Optional[str] = None) -> pd.DataFrame:
    """Complete pipeline to extract, clean, and chunk articles."""
    print(f"Extracting text from {len(urls)} articles...")
    df = extract_article_text(urls)
    if df.empty:
        print("No articles could be extracted.")
        return df

    # Clean each article's text, then split it into overlapping chunks
    df['cleaned_text'] = df['text'].apply(clean_text)
    df['text_chunks'] = df['cleaned_text'].apply(
        lambda x: chunk_text(x) if pd.notna(x) else []
    )

    # Save if a path was provided
    if save_path:
        df.to_csv(save_path, index=False)
        print(f"Results saved to {save_path}")
    return df
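# Note: DataFrame.to_csv() serializes the `text_chunks` lists as plain strings;
# consider df.to_json() or df.to_parquet() instead if the chunk lists need to round-trip.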
if __name__ == "__main__":
    # Example usage with placeholder URLs
    sample_urls = [
        "https://example.com/article1",
        "https://example.com/article2",
    ]
    # df = process_articles(sample_urls, "extracted_articles.csv")
    # print(f"Extracted {len(df)} articles")
    print("Article extractor ready! Use process_articles() with your URLs.")