import re
import time
from typing import List, Optional
from urllib.parse import urlparse

import pandas as pd
from newspaper import Article

def extract_article_text(urls: List[str]) -> pd.DataFrame:
    """Extract text content from a list of article URLs."""
    articles = []
    for url in urls:
        try:
            article = Article(url)
            article.download()
            article.parse()
            articles.append({
                'url': url,
                'title': article.title,
                'text': article.text,
                'authors': article.authors,
                'publish_date': article.publish_date,
                'source': urlparse(url).netloc  # Extract domain
            })
            time.sleep(1)  # Be respectful to servers
        except Exception as e:
            print(f"Failed to extract {url}: {e}")
    return pd.DataFrame(articles)

def clean_text(text: str) -> str:
    """Clean text by removing extra whitespace and special characters."""
    if not text:
        return ''
    text = re.sub(r'\s+', ' ', text)          # Collapse runs of whitespace
    text = re.sub(r'[^\w\s.,!?-]', '', text)  # Strip special characters
    return text.strip()

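# Illustrative example (assumed input, shown for clarity only):
# clean_text("Hello,\n\n  world!  \u00a92024") returns "Hello, world! 2024" --
# the copyright sign is stripped and the whitespace runs collapse to one space.
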
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """Split text into overlapping chunks of roughly chunk_size words."""
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunks.append(' '.join(words[i:i + chunk_size]))
    return chunks

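# Quick sanity check for chunk_text (a minimal sketch with toy numbers, not
# part of the pipeline): with chunk_size=5 and overlap=2 the window advances
# 3 words per step, so 10 words yield chunks starting at words 0, 3, 6, and 9.
# demo = ' '.join(str(n) for n in range(10))
# assert chunk_text(demo, chunk_size=5, overlap=2)[1] == '3 4 5 6 7'
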
def process_articles(urls: List[str], save_path: Optional[str] = None) -> pd.DataFrame:
    """Complete pipeline to extract, clean, and chunk articles."""
    print(f"Extracting text from {len(urls)} articles...")

    # Extract articles
    df = extract_article_text(urls)
    if df.empty:
        print("No articles could be extracted.")
        return df

    # Clean text
    df['cleaned_text'] = df['text'].apply(clean_text)

    # Create chunks for each article
    df['text_chunks'] = df['cleaned_text'].apply(
        lambda x: chunk_text(x) if pd.notna(x) else []
    )

    # Save if a path is provided (list columns are serialized as strings in CSV)
    if save_path:
        df.to_csv(save_path, index=False)
        print(f"Results saved to {save_path}")

    return df

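# A minimal sketch of the reverse step (load_processed_articles is an assumed
# helper name, not part of the original pipeline): because to_csv writes the
# text_chunks lists as their string repr, they are parsed back with
# ast.literal_eval when reloading.
def load_processed_articles(csv_path: str) -> pd.DataFrame:
    """Reload a CSV written by process_articles, restoring the list column."""
    import ast
    df = pd.read_csv(csv_path)
    if 'text_chunks' in df.columns:
        df['text_chunks'] = df['text_chunks'].apply(
            lambda s: ast.literal_eval(s) if isinstance(s, str) else []
        )
    return df
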
if __name__ == "__main__":
    # Example usage
    sample_urls = [
        "https://example.com/article1",
        "https://example.com/article2"
    ]

    # Process articles
    # df = process_articles(sample_urls, "extracted_articles.csv")
    # print(f"Extracted {len(df)} articles")

    print("Article extractor ready! Use process_articles() with your URLs.")