import os
import json
import pickle

import numpy as np
import pandas as pd
import nltk
from nltk.stem import SnowballStemmer
from pathlib import Path
from dotenv import load_dotenv
from datasets import load_dataset
from google.genai import Client, types

# Tokenizer data used by nltk (needed before any tokenization helpers run)
nltk.download("punkt_tab")

# Load environment variables (e.g. GEMINI_API) from a local .env file
load_dotenv()

def set_prompt(problem):
    """Builds the constraint-extraction prompt for the given technical problem description."""
    prompt = """
# ROLE
You are a meticulous senior technical analyst and constraints scout. Your task is to read a short description of a technical problem and identify the distinct constraints it implies, making sure that together they cover the whole problem, and to return them as a JSON object.

# OBJECTIVE
Find all the constraints in this technical problem, making sure each one is premised on the problem only.
Take different technical domains into account so that the whole problem is covered.
Output the constraints as a JSON object such as: {"title of constraint 1": "description 1", "title of constraint N": "description N"}

# INSTRUCTIONS & RULES
1. **JSON Output**: Your entire response MUST be a single JSON object. Do not include any explanatory text before or after the JSON.
2. **Discover and Iterate**: Your primary task is to scan the technical problem, find each constraint and create a separate entry for it in the output JSON.
3. **Descriptive Sentences**: You MUST write clear, full sentences that describe each constraint's issues. Do not use single keywords. These descriptions should be based on the information in the technical problem.
4. **Infer Where Necessary**: The technical problem may not contain all details. Infer plausible information based on the context.

# JSON SCHEMA & EXAMPLE
{
  "Exposing Compute Resources": "The 6G network shall provide suitable APIs to allow authorized third parties and/or UEs to retrieve availability information about computational resources inside the Service Hosting Environment (SHE) and to utilize these computational resources for running workloads on demand.",
  "Providing AI Compute": "The 6G network shall be able to provide computing resources in the Service Hosting Environment for AI services and provide AI services to UEs.",
  ...
}

---
***NOW, BEGIN THE TASK.***

# TECHNICAL PROBLEM
""" + problem
    return prompt
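
# Example usage (the problem text below is a hypothetical placeholder; real problems come from the caller):
# prompt = set_prompt("A fleet of IoT sensors must stream telemetry to the cloud over an unreliable 4G uplink.")
# The returned string is the full instruction block above with the problem appended at the end,
# and is passed as-is to the Gemini client configured in set_gemini().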

def load_data():
    """Loads the technologies dataset from the Hugging Face Hub."""
    return load_dataset("heymenn/Technologies", split="train")

def stem(data, data_type):
    """Stems either the technology dataset rows or a {title: description} constraints dict."""
    stemmer = SnowballStemmer("english")
    processed_data = []
    if data_type == "technologies":
        # data is an iterable of technology records (dicts with textual fields)
        for index, t_item in enumerate(data):
            processed_data.append({
                "name": stemmer.stem(t_item["name"]),
                "purpose": stemmer.stem(t_item["purpose"]),
                "problem_types_solved": stemmer.stem(t_item["problem_types_solved"]),
                "advantages": stemmer.stem(t_item["advantages"]),
                "limitations": stemmer.stem(t_item["limitations"]),
                "domain_tags": stemmer.stem(t_item["domain_tags"]),
                "id": index
            })
    else:
        # data is a dict mapping constraint titles to descriptions
        for title in data:
            processed_data.append({
                "title": stemmer.stem(title),
                "description": stemmer.stem(data[title])
            })
    return processed_data
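
# The punkt_tab download above suggests token-level stemming was intended, whereas
# stemmer.stem() applied to a whole sentence treats it as a single token. A minimal
# per-token variant is sketched below (assumption: nltk.word_tokenize is an acceptable
# tokenizer here); stem() above is left unchanged.
def stem_text(text):
    stemmer = SnowballStemmer("english")
    tokens = nltk.word_tokenize(text)
    return " ".join(stemmer.stem(token) for token in tokens)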

def get_technologies_by_id(technologies, dataset):
    """Fetches the dataset rows for the given technology ids, dropping their embeddings."""
    result = []
    for tech_id in technologies:
        data = dataset[tech_id]
        # The stored embeddings are not needed downstream, only the textual fields
        del data["embeddings"]
        result.append(data)
    return result
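
# Example (hypothetical ids): get_technologies_by_id([0, 3, 7], load_data()) returns the
# three matching technology records with their "embeddings" field removed.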

def save_to_pickle(result_similarities):
    """Builds a constraints x technologies similarity matrix and pickles it with its labels."""
    # One row per unique constraint title, one column per technology id (id2 is 1-based)
    constraint_titles = sorted({item['constraint']['title'] for item in result_similarities})
    max_id2 = max(item['id2'] for item in result_similarities)
    row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
    col_labels = list(range(1, max_id2 + 1))

    num_rows = len(constraint_titles)
    num_cols = max_id2
    # Pairs with no recorded similarity stay NaN
    matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)

    for item in result_similarities:
        row_idx = row_label_to_index[item['constraint']['title']]
        col_idx = item['id2'] - 1
        similarity_value = item['similarity'].item()
        matrix[row_idx, col_idx] = similarity_value

    print(f"Successfully created matrix with shape: {matrix.shape}")
    print(f"Number of rows (unique constraints): {num_rows}")
    print(f"Number of columns (max id2): {num_cols}")
    print("\nExample 5x5 block of the created matrix (NaN for missing values):")
    print(matrix[:5, :5])

    output_filename = "cosine_similarity_matrix_with_labels.pkl"
    data_to_save = {
        'matrix': matrix,
        'row_labels': constraint_titles,
        'col_labels': col_labels
    }
    with open(output_filename, 'wb') as f:
        pickle.dump(data_to_save, f)
    print(f"\nMatrix and labels saved to {output_filename}")

    return output_filename
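
# A small round-trip helper, assuming the pickle layout produced by save_to_pickle()
# above ('matrix', 'row_labels', 'col_labels'); the helper name is illustrative.
def load_similarity_matrix(path="cosine_similarity_matrix_with_labels.pkl"):
    with open(path, "rb") as f:
        data = pickle.load(f)
    return data["matrix"], data["row_labels"], data["col_labels"]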

def set_gemini():
    """Creates a Gemini client (API key read from the GEMINI_API env variable) with Google Search grounding enabled."""
    gemini_api = os.getenv("GEMINI_API")
    client = Client(api_key=gemini_api)

    # Define the grounding tool
    grounding_tool = types.Tool(
        google_search=types.GoogleSearch()
    )

    # Configure generation settings
    config = types.GenerateContentConfig(
        tools=[grounding_tool]
    )

    return client, config
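
# Minimal end-to-end sketch tying the helpers together. Assumptions: the model name
# "gemini-2.0-flash" and the sample problem text are illustrative placeholders, GEMINI_API
# is set in the environment, and the model returns a bare JSON object as the prompt requests.
if __name__ == "__main__":
    client, config = set_gemini()
    prompt = set_prompt("A 6G operator wants to expose edge compute resources to authorized third parties.")
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt,
        config=config,
    )
    constraints = json.loads(response.text)
    print(json.dumps(constraints, indent=2))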