Spaces:
Sleeping
Sleeping
Update api.py
Browse files
api.py
CHANGED
|
@@ -8,48 +8,53 @@ import logging # Import logging module
|
|
| 8 |
|
| 9 |
# --- Logging Configuration ---
|
| 10 |
# Basic logger configuration to display INFO messages and above.
|
| 11 |
-
|
| 12 |
-
logging.basicConfig(
|
| 13 |
-
level=logging.INFO,
|
| 14 |
-
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 15 |
-
handlers=[
|
| 16 |
-
logging.StreamHandler() # Display logs in the console (stderr by default)
|
| 17 |
-
# You could add a logging.FileHandler("app.log") here to write to a file
|
| 18 |
-
]
|
| 19 |
-
)
|
| 20 |
logger = logging.getLogger(__name__) # Create a logger instance for this module
|
| 21 |
|
| 22 |
# --- Environment Variable Configuration ---
|
| 23 |
NEO4J_URI = os.getenv("NEO4J_URI")
|
| 24 |
NEO4J_USER = os.getenv("NEO4J_USER")
|
| 25 |
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
|
| 26 |
-
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
| 27 |
|
| 28 |
# Validation of essential configurations
|
| 29 |
if not NEO4J_URI or not NEO4J_USER or not NEO4J_PASSWORD:
|
| 30 |
logger.critical("CRITICAL ERROR: NEO4J_URI, NEO4J_USER, and NEO4J_PASSWORD environment variables must be set.")
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
# Initialize FastAPI application
|
| 35 |
app = FastAPI(
|
| 36 |
-
title="
|
| 37 |
-
description="API to fetch
|
| 38 |
version="1.0.0"
|
| 39 |
)
|
| 40 |
|
| 41 |
-
# --- Gemini API Client Initialization ---
|
| 42 |
-
gemini_model = None
|
| 43 |
-
if GEMINI_API_KEY:
|
| 44 |
-
try:
|
| 45 |
-
genai.configure(api_key=GEMINI_API_KEY)
|
| 46 |
-
gemini_model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-05-20") # Specified model
|
| 47 |
-
logger.info("Gemini API client initialized successfully.")
|
| 48 |
-
except Exception as e:
|
| 49 |
-
logger.warning(f"WARNING: Failed to initialize Gemini API client: {e}. Summary generation will be affected.")
|
| 50 |
-
else:
|
| 51 |
-
logger.warning("WARNING: GEMINI_API_KEY environment variable not set. Summary generation will be disabled.")
|
| 52 |
-
|
| 53 |
# --- Utility Functions (Adapted from your script) ---
|
| 54 |
|
| 55 |
def get_content(number: str, node_type: str) -> str:
|
|
@@ -63,9 +68,9 @@ def get_content(number: str, node_type: str) -> str:
|
|
| 63 |
if not url:
|
| 64 |
logger.warning(f"Unknown node type: {node_type} for number {number}")
|
| 65 |
return ""
|
| 66 |
-
|
| 67 |
try:
|
| 68 |
-
response = requests.get(url
|
| 69 |
response.raise_for_status() # Raises HTTPError for bad responses (4XX or 5XX)
|
| 70 |
return response.content.decode('utf-8', errors='replace').replace("\n", "")
|
| 71 |
except requests.exceptions.RequestException as e:
|
|
@@ -81,10 +86,9 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
|
|
| 81 |
|
| 82 |
rp_data = {
|
| 83 |
"document": f"Arxiv {rp_number}", # ID for the paper
|
| 84 |
-
"arxiv_id": rp_number,
|
| 85 |
"title": "Error fetching content or content not found",
|
| 86 |
"abstract": "Error fetching content or content not found",
|
| 87 |
-
"summary": "Summary not generated" # Default summary
|
| 88 |
}
|
| 89 |
|
| 90 |
if not raw_content:
|
|
@@ -97,9 +101,9 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
|
|
| 97 |
# Extract Title
|
| 98 |
title_tag = soup.find('h1', class_='title')
|
| 99 |
if title_tag and title_tag.find('span', class_='descriptor'):
|
| 100 |
-
|
| 101 |
-
if
|
| 102 |
-
rp_data["title"] =
|
| 103 |
else:
|
| 104 |
rp_data["title"] = title_tag.get_text(separator=" ", strip=True).replace("Title:", "").strip()
|
| 105 |
elif title_tag : # Fallback if the span descriptor is not there but h1.title exists
|
|
@@ -116,7 +120,6 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
|
|
| 116 |
if prefix_end < len(abstract_text) and abstract_text[prefix_end] == ':':
|
| 117 |
prefix_end += 1 # Include the colon in removal
|
| 118 |
abstract_text = abstract_text[prefix_end:].strip()
|
| 119 |
-
|
| 120 |
rp_data["abstract"] = abstract_text
|
| 121 |
|
| 122 |
# Mark if title or abstract are still not found
|
|
@@ -126,62 +129,71 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
|
|
| 126 |
rp_data["abstract"] = "Abstract not found on page"
|
| 127 |
|
| 128 |
# Generate summary with Gemini API if available and abstract exists
|
| 129 |
-
if
|
| 130 |
not rp_data["abstract"].startswith("Error fetching content") and \
|
| 131 |
not rp_data["abstract"].startswith("Abstract not found"):
|
| 132 |
-
|
| 133 |
prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
|
| 134 |
Focus on challenges, gaps, or novel aspects.
|
| 135 |
Here is the document: <document>{rp_data['abstract']}<document>"""
|
| 136 |
|
| 137 |
try:
|
| 138 |
-
|
|
|
|
|
|
|
|
|
|
| 139 |
rp_data["summary"] = response.text
|
| 140 |
logger.info(f"Summary generated for Arxiv ID: {rp_number}")
|
| 141 |
except Exception as e:
|
| 142 |
logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
|
| 143 |
rp_data["summary"] = "Error generating summary (API failure)"
|
| 144 |
-
elif not gemini_model:
|
| 145 |
-
rp_data["summary"] = "Summary not generated (Gemini API client not available)"
|
| 146 |
else:
|
| 147 |
rp_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
|
| 148 |
|
| 149 |
except Exception as e:
|
| 150 |
logger.error(f"Error parsing content for Arxiv ID {rp_number}: {e}")
|
| 151 |
-
|
| 152 |
return rp_data
|
| 153 |
|
| 154 |
-
def add_nodes_to_neo4j(driver, data_list: list,
|
| 155 |
"""Adds a list of nodes to Neo4j in a single transaction."""
|
| 156 |
if not data_list:
|
| 157 |
logger.warning("No data provided to add_nodes_to_neo4j.")
|
| 158 |
return 0
|
| 159 |
|
| 160 |
query = (
|
| 161 |
-
|
| 162 |
-
f"
|
| 163 |
-
|
| 164 |
-
f"ON MATCH SET n += properties" # Update properties if the node already exists
|
| 165 |
)
|
| 166 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
try:
|
| 168 |
with driver.session(database="neo4j") as session: # Specify database if not default
|
| 169 |
result = session.execute_write(lambda tx: tx.run(query, data=data_list).consume())
|
| 170 |
nodes_created = result.counters.nodes_created
|
| 171 |
|
| 172 |
if nodes_created > 0:
|
| 173 |
-
logger.info(f"{nodes_created} new {
|
| 174 |
|
| 175 |
summary = result.summary
|
| 176 |
-
logger.info(f"
|
| 177 |
-
|
| 178 |
return nodes_created # Return the number of nodes actually created
|
| 179 |
except Exception as e:
|
| 180 |
-
logger.error(f"Neo4j Error - Failed to add/update {
|
| 181 |
raise HTTPException(status_code=500, detail=f"Neo4j database error: {e}")
|
| 182 |
|
| 183 |
|
| 184 |
# --- FastAPI Endpoint ---
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
@app.post("/add_research_paper/{arxiv_id}", status_code=201) # 201 Created for successful creation
|
| 187 |
async def add_single_research_paper(arxiv_id: str):
|
|
@@ -214,13 +226,8 @@ async def add_single_research_paper(arxiv_id: str):
|
|
| 214 |
nodes_created_count = add_nodes_to_neo4j(driver_instance, [paper_data], node_type)
|
| 215 |
|
| 216 |
if nodes_created_count > 0 :
|
| 217 |
-
message = f"Research paper {arxiv_id} was successfully added to Neo4j."
|
| 218 |
status_code_response = 201 # Created
|
| 219 |
-
else:
|
| 220 |
-
# If MERGE found an existing node and updated it, nodes_created_count will be 0.
|
| 221 |
-
# This is considered a success (idempotency).
|
| 222 |
-
message = f"Research paper {arxiv_id} was processed (potentially updated if it already existed)."
|
| 223 |
-
status_code_response = 200 # OK (because no new creation, but operation successful)
|
| 224 |
|
| 225 |
logger.info(message)
|
| 226 |
# Note: FastAPI uses the status_code from the decorator or HTTPException.
|
|
|
|
| 8 |
|
| 9 |
# --- Logging Configuration ---
|
| 10 |
# Basic logger configuration to display INFO messages and above.
|
| 11 |
+
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-scoped logger; every message in this file goes through it.
logger = logging.getLogger(__name__)

# --- Environment Variable Configuration ---
# Neo4j connection settings are read once, at import time.
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Validation of essential configurations
# A missing or empty value is only logged here; import continues regardless.
if not (NEO4J_URI and NEO4J_USER and NEO4J_PASSWORD):
    logger.critical("CRITICAL ERROR: NEO4J_URI, NEO4J_USER, and NEO4J_PASSWORD environment variables must be set.")
|
| 22 |
+
|
| 23 |
+
# --- Application Lifecycle (Startup/Shutdown) ---
|
| 24 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup work before the API serves requests and log on shutdown.

    Startup configures the Gemini client when the ``genai`` module is
    available. A missing key or a configuration failure is logged (with
    traceback) and does not stop the application from starting.
    """
    logger.info("Initializing Gemini client...")
    if not genai:
        logger.warning("Gemini library not imported. Endpoints requiring Gemini will not work.")
    else:
        try:
            # The environment variable wins; `settings` is only a fallback.
            gemini_key = os.getenv("GEMINI_API_KEY") or getattr(settings, "GEMINI_API_KEY", None)
            if not gemini_key:
                raise ValueError("GEMINI_API_KEY not found in environment or settings.")
            genai.configure(api_key=gemini_key)
            logger.info("Gemini client configured successfully.")
        except Exception as exc:
            logger.error(f"Failed to configure Gemini client: {exc}", exc_info=True)

    yield  # API runs here

    # --- Shutdown ---
    logger.info("API shutting down...")
    # Nothing is closed explicitly here. Per the original note, the Neo4j
    # connection is expected to be closed by an atexit hook registered in
    # graph_client.py -- NOTE(review): not verifiable from this file.
    logger.info("Neo4j client closed (likely via atexit).")
    logger.info("API shutdown complete.")
|
| 50 |
|
| 51 |
# Initialize FastAPI application
|
| 52 |
app = FastAPI(
    title="Neo4j Importer",
    description="API to fetch documents, summarize it with Gemini, and add it to Neo4j.",
    version="1.0.0",
    # Register the lifespan handler defined above. Without this argument the
    # startup code that configures the Gemini client is never executed, and
    # the shutdown logging never runs either.
    lifespan=lifespan,
)
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
# --- Utility Functions (Adapted from your script) ---
|
| 59 |
|
| 60 |
def get_content(number: str, node_type: str) -> str:
|
|
|
|
| 68 |
if not url:
|
| 69 |
logger.warning(f"Unknown node type: {node_type} for number {number}")
|
| 70 |
return ""
|
| 71 |
+
|
| 72 |
try:
|
| 73 |
+
response = requests.get(url)
|
| 74 |
response.raise_for_status() # Raises HTTPError for bad responses (4XX or 5XX)
|
| 75 |
return response.content.decode('utf-8', errors='replace').replace("\n", "")
|
| 76 |
except requests.exceptions.RequestException as e:
|
|
|
|
| 86 |
|
| 87 |
rp_data = {
|
| 88 |
"document": f"Arxiv {rp_number}", # ID for the paper
|
|
|
|
| 89 |
"title": "Error fetching content or content not found",
|
| 90 |
"abstract": "Error fetching content or content not found",
|
| 91 |
+
"summary": "Summary not yet generated" # Default summary
|
| 92 |
}
|
| 93 |
|
| 94 |
if not raw_content:
|
|
|
|
| 101 |
# Extract Title
|
| 102 |
title_tag = soup.find('h1', class_='title')
|
| 103 |
if title_tag and title_tag.find('span', class_='descriptor'):
|
| 104 |
+
title_text = title_tag.find('span', class_='descriptor').next_sibling
|
| 105 |
+
if title_text and isinstance(title_text, str):
|
| 106 |
+
rp_data["title"] = title_text.strip()
|
| 107 |
else:
|
| 108 |
rp_data["title"] = title_tag.get_text(separator=" ", strip=True).replace("Title:", "").strip()
|
| 109 |
elif title_tag : # Fallback if the span descriptor is not there but h1.title exists
|
|
|
|
| 120 |
if prefix_end < len(abstract_text) and abstract_text[prefix_end] == ':':
|
| 121 |
prefix_end += 1 # Include the colon in removal
|
| 122 |
abstract_text = abstract_text[prefix_end:].strip()
|
|
|
|
| 123 |
rp_data["abstract"] = abstract_text
|
| 124 |
|
| 125 |
# Mark if title or abstract are still not found
|
|
|
|
| 129 |
rp_data["abstract"] = "Abstract not found on page"
|
| 130 |
|
| 131 |
# Generate summary with Gemini API if available and abstract exists
|
| 132 |
+
if rp_data["abstract"] and \
|
| 133 |
not rp_data["abstract"].startswith("Error fetching content") and \
|
| 134 |
not rp_data["abstract"].startswith("Abstract not found"):
|
| 135 |
+
|
| 136 |
prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
|
| 137 |
Focus on challenges, gaps, or novel aspects.
|
| 138 |
Here is the document: <document>{rp_data['abstract']}<document>"""
|
| 139 |
|
| 140 |
try:
|
| 141 |
+
model_name = "gemini-2.5-flash-preview-05-20"
|
| 142 |
+
model = genai.GenerativeModel(model_name)
|
| 143 |
+
|
| 144 |
+
response = model.generate_content(prompt)
|
| 145 |
rp_data["summary"] = response.text
|
| 146 |
logger.info(f"Summary generated for Arxiv ID: {rp_number}")
|
| 147 |
except Exception as e:
|
| 148 |
logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
|
| 149 |
rp_data["summary"] = "Error generating summary (API failure)"
|
|
|
|
|
|
|
| 150 |
else:
|
| 151 |
rp_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
|
| 152 |
|
| 153 |
except Exception as e:
|
| 154 |
logger.error(f"Error parsing content for Arxiv ID {rp_number}: {e}")
|
|
|
|
| 155 |
return rp_data
|
| 156 |
|
| 157 |
+
def add_nodes_to_neo4j(driver, data_list: list, node_type: str):
    """Create one node per entry of ``data_list`` in a single write transaction.

    Args:
        driver: Neo4j driver object; only ``driver.session(database=...)`` is
            used here.
        data_list: Property maps, one per node to create.
        node_type: Label given to every created node. Labels cannot be passed
            as query parameters, so the value is interpolated into the query
            text and must therefore be a plain identifier.

    Returns:
        The number of nodes created (0 when ``data_list`` is empty).

    Raises:
        HTTPException: Status 500 when the label is invalid or the database
            write fails.
    """
    if not data_list:
        logger.warning("No data provided to add_nodes_to_neo4j.")
        return 0

    try:
        # Reject anything that is not a bare identifier before it reaches the
        # query text; this keeps arbitrary Cypher out of the label position.
        if not node_type.isidentifier():
            raise ValueError(f"Invalid node label: {node_type!r}")

        # Adjacent literals are concatenated verbatim, so each clause needs
        # its own trailing space to stay a separate Cypher clause.
        # NOTE: CREATE always inserts a new node, so submitting the same data
        # twice produces duplicates; a MERGE on a unique property would be
        # needed to make this idempotent.
        query = (
            "UNWIND $data AS properties "
            f"CREATE (n:{node_type}) "
            "SET n = properties"
        )

        with driver.session(database="neo4j") as session:  # Specify database if not default
            # .consume() already returns the result summary for the query.
            summary = session.execute_write(lambda tx: tx.run(query, data=data_list).consume())

        counters = summary.counters
        nodes_created = counters.nodes_created

        if nodes_created > 0:
            logger.info(f"{nodes_created} new {node_type} node(s) added successfully.")

        logger.info(f"CREATE operation for {node_type}: {counters.nodes_created} created, {counters.properties_set} properties affected.")
        return nodes_created  # Return the number of nodes actually created
    except Exception as e:
        logger.error(f"Neo4j Error - Failed to add/update {node_type} nodes: {e}")
        raise HTTPException(status_code=500, detail=f"Neo4j database error: {e}") from e
|
| 190 |
|
| 191 |
|
| 192 |
# --- FastAPI Endpoint ---
|
| 193 |
+
# API state check route
|
| 194 |
+
@app.get("/")
def read_root():
    """Health-check route: report that the API process is responding."""
    health = {"status": "ok"}
    return health
|
| 197 |
|
| 198 |
@app.post("/add_research_paper/{arxiv_id}", status_code=201) # 201 Created for successful creation
|
| 199 |
async def add_single_research_paper(arxiv_id: str):
|
|
|
|
| 226 |
nodes_created_count = add_nodes_to_neo4j(driver_instance, [paper_data], node_type)
|
| 227 |
|
| 228 |
if nodes_created_count > 0 :
|
| 229 |
+
logger.info(message = f"Research paper {arxiv_id} was successfully added to Neo4j.")
|
| 230 |
status_code_response = 201 # Created
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
logger.info(message)
|
| 233 |
# Note: FastAPI uses the status_code from the decorator or HTTPException.
|