Spaces:

OrganizedProgrammers
/

FastAPI_Neo4j

Sleeping

App Files Files Community

adrienbrdne committed on Jun 5

Commit

54f5fbc

verified ·

1 Parent(s): fbf2452

Update api.py

Browse files

Files changed (1) hide show

api.py +62 -55

api.py CHANGED Viewed

@@ -8,48 +8,53 @@ import logging # Import logging module
 # --- Logging Configuration ---
 # Basic logger configuration to display INFO messages and above.
-# The format includes timestamp, log level, and message.
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.StreamHandler() # Display logs in the console (stderr by default)
-        # You could add a logging.FileHandler("app.log") here to write to a file
-    ]
-)
 logger = logging.getLogger(__name__) # Create a logger instance for this module
 # --- Environment Variable Configuration ---
 NEO4J_URI = os.getenv("NEO4J_URI")
 NEO4J_USER = os.getenv("NEO4J_USER")
 NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
-GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 # Validation of essential configurations
 if not NEO4J_URI or not NEO4J_USER or not NEO4J_PASSWORD:
     logger.critical("CRITICAL ERROR: NEO4J_URI, NEO4J_USER, and NEO4J_PASSWORD environment variables must be set.")
-    # In a real application, you might want to exit or prevent FastAPI from starting.
-    # For now, we let the application try and fail at runtime if they are missing.
 # Initialize FastAPI application
 app = FastAPI(
-    title="Arxiv to Neo4j Importer",
-    description="API to fetch research paper data from Arxiv, summarize it with Gemini, and add it to Neo4j.",
     version="1.0.0"
 )
-# --- Gemini API Client Initialization ---
-gemini_model = None
-if GEMINI_API_KEY:
-    try:
-        genai.configure(api_key=GEMINI_API_KEY)
-        gemini_model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-05-20") # Specified model
-        logger.info("Gemini API client initialized successfully.")
-    except Exception as e:
-        logger.warning(f"WARNING: Failed to initialize Gemini API client: {e}. Summary generation will be affected.")
-else:
-    logger.warning("WARNING: GEMINI_API_KEY environment variable not set. Summary generation will be disabled.")
 # --- Utility Functions (Adapted from your script) ---
 def get_content(number: str, node_type: str) -> str:
@@ -63,9 +68,9 @@ def get_content(number: str, node_type: str) -> str:
     if not url:
         logger.warning(f"Unknown node type: {node_type} for number {number}")
         return ""
     try:
-        response = requests.get(url, timeout=10) # Added a timeout
         response.raise_for_status() # Raises HTTPError for bad responses (4XX or 5XX)
         return response.content.decode('utf-8', errors='replace').replace("\n", "")
     except requests.exceptions.RequestException as e:
@@ -81,10 +86,9 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
     rp_data = {
         "document": f"Arxiv {rp_number}", # ID for the paper
-        "arxiv_id": rp_number,
         "title": "Error fetching content or content not found",
         "abstract": "Error fetching content or content not found",
-        "summary": "Summary not generated" # Default summary
     }
     if not raw_content:
@@ -97,9 +101,9 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
         # Extract Title
         title_tag = soup.find('h1', class_='title')
         if title_tag and title_tag.find('span', class_='descriptor'):
-            title_text_candidate = title_tag.find('span', class_='descriptor').next_sibling
-            if title_text_candidate and isinstance(title_text_candidate, str):
-                 rp_data["title"] = title_text_candidate.strip()
             else:
                 rp_data["title"] = title_tag.get_text(separator=" ", strip=True).replace("Title:", "").strip()
         elif title_tag : # Fallback if the span descriptor is not there but h1.title exists
@@ -116,7 +120,6 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
                 if prefix_end < len(abstract_text) and abstract_text[prefix_end] == ':':
                     prefix_end += 1 # Include the colon in removal
                 abstract_text = abstract_text[prefix_end:].strip()
             rp_data["abstract"] = abstract_text
         # Mark if title or abstract are still not found
@@ -126,62 +129,71 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
             rp_data["abstract"] = "Abstract not found on page"
         # Generate summary with Gemini API if available and abstract exists
-        if gemini_model and rp_data["abstract"] and \
            not rp_data["abstract"].startswith("Error fetching content") and \
            not rp_data["abstract"].startswith("Abstract not found"):
-            # English prompt for Gemini
             prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
             Focus on challenges, gaps, or novel aspects.
             Here is the document: <document>{rp_data['abstract']}<document>"""
             try:
-                response = gemini_model.generate_content(prompt)
                 rp_data["summary"] = response.text
                 logger.info(f"Summary generated for Arxiv ID: {rp_number}")
             except Exception as e:
                 logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
                 rp_data["summary"] = "Error generating summary (API failure)"
-        elif not gemini_model:
-            rp_data["summary"] = "Summary not generated (Gemini API client not available)"
         else:
             rp_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
     except Exception as e:
         logger.error(f"Error parsing content for Arxiv ID {rp_number}: {e}")
     return rp_data
-def add_nodes_to_neo4j(driver, data_list: list, node_label: str):
     """Adds a list of nodes to Neo4j in a single transaction."""
     if not data_list:
         logger.warning("No data provided to add_nodes_to_neo4j.")
         return 0
     query = (
-        f"UNWIND $data as properties "
-        f"MERGE (n:{node_label} {{arxiv_id: properties.arxiv_id}}) " # Use MERGE for idempotency
-        f"ON CREATE SET n = properties "
-        f"ON MATCH SET n += properties" # Update properties if the node already exists
     )
     try:
         with driver.session(database="neo4j") as session: # Specify database if not default
             result = session.execute_write(lambda tx: tx.run(query, data=data_list).consume())
             nodes_created = result.counters.nodes_created
             if nodes_created > 0:
-                logger.info(f"{nodes_created} new {node_label} node(s) added successfully.")
             summary = result.summary
-            logger.info(f"MERGE operation for {node_label}: {summary.counters.nodes_created} created, {summary.counters.properties_set} properties affected.")
             return nodes_created # Return the number of nodes actually created
     except Exception as e:
-        logger.error(f"Neo4j Error - Failed to add/update {node_label} nodes: {e}")
         raise HTTPException(status_code=500, detail=f"Neo4j database error: {e}")
 # --- FastAPI Endpoint ---
 @app.post("/add_research_paper/{arxiv_id}", status_code=201) # 201 Created for successful creation
 async def add_single_research_paper(arxiv_id: str):
@@ -214,13 +226,8 @@ async def add_single_research_paper(arxiv_id: str):
         nodes_created_count = add_nodes_to_neo4j(driver_instance, [paper_data], node_type)
         if nodes_created_count > 0 :
-            message = f"Research paper {arxiv_id} was successfully added to Neo4j."
             status_code_response = 201 # Created
-        else:
-            # If MERGE found an existing node and updated it, nodes_created_count will be 0.
-            # This is considered a success (idempotency).
-            message = f"Research paper {arxiv_id} was processed (potentially updated if it already existed)."
-            status_code_response = 200 # OK (because no new creation, but operation successful)
         logger.info(message)
         # Note: FastAPI uses the status_code from the decorator or HTTPException.

 # --- Logging Configuration ---
 # Basic logger configuration to display INFO messages and above.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__) # Create a logger instance for this module
 # --- Environment Variable Configuration ---
 NEO4J_URI = os.getenv("NEO4J_URI")
 NEO4J_USER = os.getenv("NEO4J_USER")
 NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
 # Validation of essential configurations
 if not NEO4J_URI or not NEO4J_USER or not NEO4J_PASSWORD:
     logger.critical("CRITICAL ERROR: NEO4J_URI, NEO4J_USER, and NEO4J_PASSWORD environment variables must be set.")
# --- Application Lifecycle (Startup/Shutdown) ---
# Imported next to its only use so this block does not depend on the module
# header; a repeated import is harmless if the name is already available.
from contextlib import asynccontextmanager


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Configure the Gemini client on startup and log on shutdown.

    Written as a FastAPI lifespan handler: the code before ``yield`` runs
    once before the application starts serving, the code after it runs once
    when the application stops.

    A failure to configure Gemini (including a missing API key) is logged
    and swallowed, so the application still starts.

    Args:
        app: The FastAPI application being started (not used in the body).
    """
    # Initialize Gemini Client
    logger.info("Initializing Gemini client...")
    if genai:
        try:
            # Prefer the environment variable; fall back to an optional
            # module-level ``settings`` object. The lookup goes through
            # globals() so an absent ``settings`` gives None rather than a
            # NameError that would be reported as a misleading failure.
            api_key = os.getenv("GEMINI_API_KEY") or getattr(
                globals().get("settings"), "GEMINI_API_KEY", None
            )
            if not api_key:
                raise ValueError("GEMINI_API_KEY not found in environment or settings.")
            genai.configure(api_key=api_key)
            logger.info("Gemini client configured successfully.")
        except Exception as e:
            # Deliberately best-effort: log with traceback and keep starting.
            logger.error(f"Failed to configure Gemini client: {e}", exc_info=True)
    else:
        logger.warning("Gemini library not imported. Endpoints requiring Gemini will not work.")

    try:
        yield  # API runs here
    finally:
        # --- Shutdown ---
        # Kept in ``finally`` so the shutdown messages are emitted even when
        # an exception is thrown into the generator at the ``yield``.
        logger.info("API shutting down...")
        # NOTE(review): nothing in this function opens or closes a Neo4j
        # connection; the message below assumes an atexit hook registered
        # elsewhere (graph_client.py is not visible here) - confirm.
        logger.info("Neo4j client closed (likely via atexit).")
        logger.info("API shutdown complete.")
 # Initialize FastAPI application
 app = FastAPI(
+    title="Neo4j Importer",
+    description="API to fetch documents, summarize it with Gemini, and add it to Neo4j.",
     version="1.0.0"
 )
 # --- Utility Functions (Adapted from your script) ---
 def get_content(number: str, node_type: str) -> str:
     if not url:
         logger.warning(f"Unknown node type: {node_type} for number {number}")
         return ""
     try:
+        response = requests.get(url)
         response.raise_for_status() # Raises HTTPError for bad responses (4XX or 5XX)
         return response.content.decode('utf-8', errors='replace').replace("\n", "")
     except requests.exceptions.RequestException as e:
     rp_data = {
         "document": f"Arxiv {rp_number}", # ID for the paper
         "title": "Error fetching content or content not found",
         "abstract": "Error fetching content or content not found",
+        "summary": "Summary not yet generated" # Default summary
     }
     if not raw_content:
         # Extract Title
         title_tag = soup.find('h1', class_='title')
         if title_tag and title_tag.find('span', class_='descriptor'):
+            title_text = title_tag.find('span', class_='descriptor').next_sibling
+            if title_text and isinstance(title_text, str):
+                 rp_data["title"] = title_text.strip()
             else:
                 rp_data["title"] = title_tag.get_text(separator=" ", strip=True).replace("Title:", "").strip()
         elif title_tag : # Fallback if the span descriptor is not there but h1.title exists
                 if prefix_end < len(abstract_text) and abstract_text[prefix_end] == ':':
                     prefix_end += 1 # Include the colon in removal
                 abstract_text = abstract_text[prefix_end:].strip()
             rp_data["abstract"] = abstract_text
         # Mark if title or abstract are still not found
             rp_data["abstract"] = "Abstract not found on page"
         # Generate a summary with the Gemini API when a usable abstract was extracted
+        if rp_data["abstract"] and \
            not rp_data["abstract"].startswith("Error fetching content") and \
            not rp_data["abstract"].startswith("Abstract not found"):
             prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
             Focus on challenges, gaps, or novel aspects.
             Here is the document: <document>{rp_data['abstract']}<document>"""
             try:
+                model_name = "gemini-2.5-flash-preview-05-20"
+                model = genai.GenerativeModel(model_name)
+                response = model.generate_content(prompt)
                 rp_data["summary"] = response.text
                 logger.info(f"Summary generated for Arxiv ID: {rp_number}")
             except Exception as e:
                 logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
                 rp_data["summary"] = "Error generating summary (API failure)"
         else:
             rp_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
     except Exception as e:
         logger.error(f"Error parsing content for Arxiv ID {rp_number}: {e}")
     return rp_data
def add_nodes_to_neo4j(driver, data_list: list, node_type: str):
    """Create one Neo4j node per entry of ``data_list`` in a single write transaction.

    Args:
        driver: Neo4j driver; only ``driver.session(database="neo4j")`` is
            used here.
        data_list: Property maps, one per node to create. Sent to the query
            as the ``$data`` parameter.
        node_type: Label given to every created node.

    Returns:
        The number of nodes created, as reported by the query counters, or 0
        when ``data_list`` is empty (no query is run in that case).

    Raises:
        HTTPException: Status 500 when anything fails while running the query.
    """
    if not data_list:
        logger.warning("No data provided to add_nodes_to_neo4j.")
        return 0

    # A label cannot be supplied as a query parameter, so it has to be placed
    # in the query text. Backtick-quoting it (and doubling any embedded
    # backtick) keeps it a single identifier whatever characters it contains.
    quoted_label = "`" + node_type.replace("`", "``") + "`"

    # The clauses are joined with explicit spaces; plain adjacent string
    # literals would fuse into "...propertiesCREATE (n:X)SET ...".
    #
    # NOTE: CREATE is not idempotent - calling this twice with the same data
    # produces duplicate nodes. An earlier MERGE-on-arxiv_id variant was
    # disabled; NOTE(review): it needs a unique key in every property map.
    query = " ".join((
        "UNWIND $data as properties",
        f"CREATE (n:{quoted_label})",
        "SET n = properties",
    ))

    try:
        with driver.session(database="neo4j") as session:  # Specify database if not default
            # consume() already returns the result summary, so the counters
            # are read from it directly.
            summary = session.execute_write(
                lambda tx: tx.run(query, data=data_list).consume()
            )
        counters = summary.counters
        nodes_created = counters.nodes_created
        if nodes_created > 0:
            logger.info(f"{nodes_created} new {node_type} node(s) added successfully.")
        logger.info(f"CREATE operation for {node_type}: {counters.nodes_created} created, {counters.properties_set} properties affected.")
        return nodes_created  # Number of nodes actually created
    except Exception as e:
        logger.error(f"Neo4j Error - Failed to add/update {node_type} nodes: {e}")
        raise HTTPException(status_code=500, detail=f"Neo4j database error: {e}") from e
 # --- FastAPI Endpoint ---
# API state check route
@app.get("/")
def read_root():
    """Return a fixed payload signalling that the API is reachable."""
    health_payload = {"status": "ok"}
    return health_payload
 @app.post("/add_research_paper/{arxiv_id}", status_code=201) # 201 Created for successful creation
 async def add_single_research_paper(arxiv_id: str):
         nodes_created_count = add_nodes_to_neo4j(driver_instance, [paper_data], node_type)
         if nodes_created_count > 0 :
+            logger.info(message = f"Research paper {arxiv_id} was successfully added to Neo4j.")
             status_code_response = 201 # Created
         logger.info(message)
         # Note: FastAPI uses the status_code from the decorator or HTTPException.