Update api.py

api.py CHANGED

@@ -80,9 +80,8 @@ def get_content(number: str, node_type: str) -> str:
         logger.error(f"An unexpected error occurred in get_content for {number}: {e}")
         return ""

-def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
+def extract_research_paper_arxiv(rp_number: str, node_type: str = "ResearchPaper") -> dict:
     """Extracts information from an Arxiv research paper and generates a summary."""
-    raw_content = get_content(rp_number, node_type)

     rp_data = {
         "document": f"Arxiv {rp_number}", # ID for the paper
@@ -91,6 +90,8 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
         "summary": "Summary not yet generated" # Default summary
     }

+    raw_content = get_content(rp_number, node_type)
+
     if not raw_content:
         logger.warning(f"No content fetched for Arxiv ID: {rp_number}")
         return rp_data # Returns default error data
@@ -128,30 +129,114 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
         if rp_data["abstract"] == "Error fetching content or content not found" and not abstract_tag:
             rp_data["abstract"] = "Abstract not found on page"

-
-
-
-
-
-
-
-
-
-
-
-            response = model.generate_content(prompt)
+    except Exception as e:
+        logger.error(f"Failed to parse content for Arxiv ID {rp_number}: {e}")
+
+    # Generate summary with Gemini API if available and abstract exists
+    if rp_data["abstract"] and \
+       not rp_data["abstract"].startswith("Error fetching content") and \
+       not rp_data["abstract"].startswith("Abstract not found"):
+
+        prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
+Focus on challenges, gaps, or novel aspects.
+Here is the document: <document>{rp_data['abstract']}<document>"""

-
-
-
-
-
+        try:
+            model = genai.GenerativeModel("gemini-2.5-flash-preview-05-20")
+            response = model.generate_content(prompt)
+
+            rp_data["summary"] = response.text
+            logger.info(f"Summary generated for Arxiv ID: {rp_number}")
+        except Exception as e:
+            logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
+            rp_data["summary"] = "Error generating summary (API failure)"
+    else:
+        rp_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
+    return rp_data
+
+def extract_patent_data(patent_number: str, node_type: str = "Patent"):
+    """
+    Extracts information from a Google Patents page with robust error handling.
+    """
+    # Initialize a dictionary with default error messages for consistency.
+    patent_data = {
+        "document": f"Patent {patent_number}",
+        "title": "Error fetching content or content not found",
+        "description": "Error fetching content or content not found",
+        "claim": "Error fetching content or content not found",
+        "summary": "Summary not yet generated" # Default summary
+    }
+
+    # Use the generic get_content function to fetch the raw page content.
+    raw_content = get_content(patent_number, node_type)
+
+    if not raw_content:
+        logger.warning(f"No content fetched for Patent ID: {patent_number}")
+        return patent_data # Return the dictionary with default error messages.
+
+    try:
+        # Let BeautifulSoup handle the decoding from raw bytes.
+        soup = BeautifulSoup(raw_content, 'html.parser')
+
+        # --- Extract Title ---
+        title_tag = soup.find('meta', attrs={'name': 'DC.title'})
+        if title_tag and title_tag.get('content'):
+            patent_data["title"] = title_tag['content'].strip()
         else:
-
+            # Fallback to finding the title in an <h1> tag.
+            title_h1 = soup.find('h1', id='title')
+            if title_h1:
+                patent_data["title"] = title_h1.get_text(strip=True)
+
+        # --- Extract Description ---
+        description_section = soup.find('section', itemprop='description')
+        if description_section:
+            # Remove unnecessary nested spans to clean the output.
+            for src_text in description_section.find_all('span', class_='google-src-text'):
+                src_text.decompose()
+            patent_data["description"] = description_section.get_text(separator=' ', strip=True)
+
+        # --- Extract Claims ---
+        claims_section = soup.find('section', itemprop='claims')
+        if claims_section:
+            # Remove unnecessary nested spans here as well.
+            for src_text in claims_section.find_all('span', class_='google-src-text'):
+                src_text.decompose()
+            patent_data["claim"] = claims_section.get_text(separator=' ', strip=True)
+
+        # Update status message if specific sections were not found on the page.
+        if patent_data["title"] == "Error fetching content or content not found":
+            patent_data["title"] = "Title not found on page"
+        if patent_data["description"] == "Error fetching content or content not found":
+            patent_data["description"] = "Description not found on page"
+        if patent_data["claim"] == "Error fetching content or content not found":
+            patent_data["claim"] = "Claim not found on page"

     except Exception as e:
-
-
+        # Catch any unexpected errors during the parsing process.
+        logger.error(f"Failed to parse content for Patent ID {patent_number}: {e}")
+
+    # Generate summary with Gemini API if the description is available
+    if patent_data["description"] and \
+       not patent_data["description"].startswith("Error fetching content") and \
+       not patent_data["description"].startswith("Description not found"):
+
+        prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
+Focus on challenges, gaps, or novel aspects.
+Here is the document: <document>{patent_data['description']}<document>"""
+
+        try:
+            model = genai.GenerativeModel("gemini-2.5-flash-preview-05-20")
+            response = model.generate_content(prompt)
+
+            patent_data["summary"] = response.text
+            logger.info(f"Summary generated for Patent ID: {patent_number}")
+        except Exception as e:
+            logger.error(f"Error generating summary with Gemini for Patent ID {patent_number}: {e}")
+            patent_data["summary"] = "Error generating summary (API failure)"
+    else:
+        patent_data["summary"] = "Summary not generated (Description unavailable or problematic)"
+    return patent_data

 def add_nodes_to_neo4j(driver, data_list: list, node_type: str):
     """Adds a list of nodes to Neo4j in a single transaction."""
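
Below the diff, for orientation only: a minimal sketch of how the two extractors touched in this commit might be wired into the Neo4j loader. The Arxiv ID, patent number, connection URI and credentials are placeholders, the import path assumes the file is importable as api, and the Gemini API key is assumed to be configured inside the module; none of this is part of the commit.

# Hypothetical wiring sketch (not part of this commit). Assumes api.py exposes
# the functions shown in the hunks above and has already set up its logger
# and the Gemini client.
from neo4j import GraphDatabase

from api import extract_research_paper_arxiv, extract_patent_data, add_nodes_to_neo4j

# Placeholder connection details for a local Neo4j instance.
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

# Each extractor returns a dict whose "summary" field is filled by Gemini,
# or an explanatory fallback string when fetching or summarization fails.
papers = [extract_research_paper_arxiv("2401.00001")]   # node_type defaults to "ResearchPaper"
patents = [extract_patent_data("US20230012345A1")]      # node_type defaults to "Patent"

add_nodes_to_neo4j(driver, papers, node_type="ResearchPaper")
add_nodes_to_neo4j(driver, patents, node_type="Patent")

driver.close()

Because both extractors return a fully populated dictionary even when fetching, parsing or summarization fails, the loader never has to special-case missing documents.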