# Par-ity_Project / article_extractor.py
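"""Utilities for extracting, cleaning, and chunking news article text.

Pipeline: extract_article_text() downloads and parses each URL with
newspaper3k, clean_text() normalizes the raw text, and chunk_text() splits
it into overlapping word chunks; process_articles() wires the steps together.
"""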
import re
import time
from typing import List, Optional
from urllib.parse import urlparse

import pandas as pd
from newspaper import Article
def extract_article_text(urls: List[str]) -> pd.DataFrame:
    """Download and parse each URL, returning one row per successfully extracted article."""
    articles = []
    for url in urls:
        try:
            article = Article(url)
            article.download()
            article.parse()
            articles.append({
                'url': url,
                'title': article.title,
                'text': article.text,
                'authors': article.authors,
                'publish_date': article.publish_date,
                'source': urlparse(url).netloc,  # Domain, e.g. "www.example.com"
            })
            time.sleep(1)  # Be respectful to servers
        except Exception as e:
            print(f"Failed to extract {url}: {e}")
    return pd.DataFrame(articles)
def clean_text(text: str) -> str:
    """Collapse whitespace and strip characters outside a basic word/punctuation set."""
    text = re.sub(r'\s+', ' ', text)          # Collapse runs of whitespace to single spaces
    text = re.sub(r'[^\w\s.,!?-]', '', text)  # Keep only word chars, spaces, and basic punctuation
    return text.strip()
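# e.g. clean_text("Hello\n\n  world!!  @#$") -> "Hello world!!"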
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """Split text into overlapping chunks of up to `chunk_size` words."""
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    chunks = []
    # Step by (chunk_size - overlap) so each chunk repeats the tail of the previous one
    for i in range(0, len(words), chunk_size - overlap):
        chunks.append(' '.join(words[i:i + chunk_size]))
    return chunks
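# With the defaults (chunk_size=1000, overlap=200), chunks start at word offsets
# 0, 800, 1600, ..., so consecutive chunks share 200 words of context.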
def process_articles(urls: List[str], save_path: Optional[str] = None) -> pd.DataFrame:
    """Complete pipeline to extract, clean, and chunk articles."""
    print(f"Extracting text from {len(urls)} articles...")
    df = extract_article_text(urls)
    if df.empty:
        print("No articles could be extracted.")
        return df

    # Clean each article's text, then split it into overlapping chunks
    df['cleaned_text'] = df['text'].apply(clean_text)
    df['text_chunks'] = df['cleaned_text'].apply(
        lambda x: chunk_text(x) if pd.notna(x) else []
    )

    # Save if a path was provided
    if save_path:
        df.to_csv(save_path, index=False)
        print(f"Results saved to {save_path}")
    return df
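# Note: DataFrame.to_csv() serializes the `text_chunks` lists as plain strings;
# consider df.to_json() or df.to_parquet() instead if the chunk lists need to round-trip.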
if __name__ == "__main__":
    # Example usage with placeholder URLs
    sample_urls = [
        "https://example.com/article1",
        "https://example.com/article2",
    ]
    # df = process_articles(sample_urls, "extracted_articles.csv")
    # print(f"Extracted {len(df)} articles")
    print("Article extractor ready! Use process_articles() with your URLs.")