Update api.py

api.py CHANGED

@@ -80,9 +80,8 @@ def get_content(number: str, node_type: str) -> str:
         logger.error(f"An unexpected error occurred in get_content for {number}: {e}")
         return ""

-def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
+def extract_research_paper_arxiv(rp_number: str, node_type: str = "ResearchPaper") -> dict:
     """Extracts information from an Arxiv research paper and generates a summary."""
-    raw_content = get_content(rp_number, node_type)

     rp_data = {
         "document": f"Arxiv {rp_number}", # ID for the paper
@@ -91,6 +90,8 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
         "summary": "Summary not yet generated" # Default summary
     }

+    raw_content = get_content(rp_number, node_type)
+
     if not raw_content:
         logger.warning(f"No content fetched for Arxiv ID: {rp_number}")
         return rp_data # Returns default error data
@@ -128,30 +129,114 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
         if rp_data["abstract"] == "Error fetching content or content not found" and not abstract_tag:
             rp_data["abstract"] = "Abstract not found on page"

-
-
-
-
-
-
-
-
-
-
-
-            response = model.generate_content(prompt)
+    except Exception as e:
+        logger.error(f"Failed to parse content for Arxiv ID {rp_number}: {e}")
+
+    # Generate summary with Gemini API if available and abstract exists
+    if rp_data["abstract"] and \
+       not rp_data["abstract"].startswith("Error fetching content") and \
+       not rp_data["abstract"].startswith("Abstract not found"):
+
+        prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
+Focus on challenges, gaps, or novel aspects.
+Here is the document: <document>{rp_data['abstract']}<document>"""

-
-
-
-
-
+        try:
+            model = genai.GenerativeModel("gemini-2.5-flash-preview-05-20")
+            response = model.generate_content(prompt)
+
+            rp_data["summary"] = response.text
+            logger.info(f"Summary generated for Arxiv ID: {rp_number}")
+        except Exception as e:
+            logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
+            rp_data["summary"] = "Error generating summary (API failure)"
+    else:
+        rp_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
+    return rp_data
+
+def extract_patent_data(patent_number: str, node_type: str = "Patent"):
+    """
+    Extracts information from a Google Patents page with robust error handling.
+    """
+    # Initialize a dictionary with default error messages for consistency.
+    patent_data = {
+        "document": f"Patent {patent_number}",
+        "title": "Error fetching content or content not found",
+        "description": "Error fetching content or content not found",
+        "claim": "Error fetching content or content not found",
+        "summary": "Summary not yet generated" # Default summary
+    }
+
+    # Use the generic get_content function to fetch the raw page content.
+    raw_content = get_content(patent_number, node_type)
+
+    if not raw_content:
+        logger.warning(f"No content fetched for Patent ID: {patent_number}")
+        return patent_data # Return the dictionary with default error messages.
+
+    try:
+        # Let BeautifulSoup handle the decoding from raw bytes.
+        soup = BeautifulSoup(raw_content, 'html.parser')
+
+        # --- Extract Title ---
+        title_tag = soup.find('meta', attrs={'name': 'DC.title'})
+        if title_tag and title_tag.get('content'):
+            patent_data["title"] = title_tag['content'].strip()
         else:
-
+            # Fallback to finding the title in an <h1> tag.
+            title_h1 = soup.find('h1', id='title')
+            if title_h1:
+                patent_data["title"] = title_h1.get_text(strip=True)
+
+        # --- Extract Description ---
+        description_section = soup.find('section', itemprop='description')
+        if description_section:
+            # Remove unnecessary nested spans to clean the output.
+            for src_text in description_section.find_all('span', class_='google-src-text'):
+                src_text.decompose()
+            patent_data["description"] = description_section.get_text(separator=' ', strip=True)
+
+        # --- Extract Claims ---
+        claims_section = soup.find('section', itemprop='claims')
+        if claims_section:
+            # Remove unnecessary nested spans here as well.
+            for src_text in claims_section.find_all('span', class_='google-src-text'):
+                src_text.decompose()
+            patent_data["claim"] = claims_section.get_text(separator=' ', strip=True)
+
+        # Update status message if specific sections were not found on the page.
+        if patent_data["title"] == "Error fetching content or content not found":
+            patent_data["title"] = "Title not found on page"
+        if patent_data["description"] == "Error fetching content or content not found":
+            patent_data["description"] = "Description not found on page"
+        if patent_data["claim"] == "Error fetching content or content not found":
+            patent_data["claim"] = "Claim not found on page"

     except Exception as e:
-
-
+        # Catch any unexpected errors during the parsing process.
+        logger.error(f"Failed to parse content for Patent ID {patent_number}: {e}")
+
+    # Generate summary with Gemini API if the description is available
+    if patent_data["description"] and \
+       not patent_data["description"].startswith("Error fetching content") and \
+       not patent_data["description"].startswith("Description not found"):
+
+        prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
+Focus on challenges, gaps, or novel aspects.
+Here is the document: <document>{patent_data['description']}<document>"""
+
+        try:
+            model = genai.GenerativeModel("gemini-2.5-flash-preview-05-20")
+            response = model.generate_content(prompt)
+
+            patent_data["summary"] = response.text
+            logger.info(f"Summary generated for Patent ID: {patent_number}")
+        except Exception as e:
+            logger.error(f"Error generating summary with Gemini for Patent ID {patent_number}: {e}")
+            patent_data["summary"] = "Error generating summary (API failure)"
+    else:
+        patent_data["summary"] = "Summary not generated (Description unavailable or problematic)"
+    return patent_data

 def add_nodes_to_neo4j(driver, data_list: list, node_type: str):
     """Adds a list of nodes to Neo4j in a single transaction."""
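
Below the diff, for orientation only: a minimal sketch of how the two extractors touched in this commit might be wired into the Neo4j loader. The Arxiv ID, patent number, connection URI and credentials are placeholders, the import path assumes the file is importable as api, and the Gemini API key is assumed to be configured inside the module; none of this is part of the commit.

# Hypothetical wiring sketch (not part of this commit). Assumes api.py exposes
# the functions shown in the hunks above and has already set up its logger
# and the Gemini client.
from neo4j import GraphDatabase

from api import extract_research_paper_arxiv, extract_patent_data, add_nodes_to_neo4j

# Placeholder connection details for a local Neo4j instance.
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

# Each extractor returns a dict whose "summary" field is filled by Gemini,
# or an explanatory fallback string when fetching or summarization fails.
papers = [extract_research_paper_arxiv("2401.00001")]   # node_type defaults to "ResearchPaper"
patents = [extract_patent_data("US20230012345A1")]      # node_type defaults to "Patent"

add_nodes_to_neo4j(driver, papers, node_type="ResearchPaper")
add_nodes_to_neo4j(driver, patents, node_type="Patent")

driver.close()

Because both extractors return a fully populated dictionary even when fetching, parsing or summarization fails, the loader never has to special-case missing documents.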