| import pandas as pd |
| import plotly.express as px |
| import datasets |
|
|
|
|
class TaskVisualizations:
    """Sunburst charts of task counts grouped by area.

    Two tables are built on construction, both by joining a task/area table
    with a task-count table on the ``task`` column:

    * ``tasks_with_areas_df`` uses ``task_counts_path``.
    * ``selected_tasks_with_areas_df`` uses ``selected_task_counts_path``.

    The code relies on the joined tables exposing ``area``, ``task`` and
    ``count`` columns.
    """

    # Dataset repository handed to ``datasets.load_dataset``.
    DATASET_REPO = "lambdaofgod/pwc_github_search"
    # Label under which tasks that are not shown individually are summed.
    OTHER_LABEL = "other"

    def __init__(
        self, task_counts_path, selected_task_counts_path, tasks_with_areas_path
    ):
        """Load the full and the selected task tables.

        Args:
            task_counts_path: ``data_files`` value for the full task counts.
            selected_task_counts_path: ``data_files`` value for the selected
                task counts.
            tasks_with_areas_path: ``data_files`` value for the task/area
                table; it is joined with each of the two count tables.
        """
        self.tasks_with_areas_df = self.load_tasks_with_areas_df(
            task_counts_path, tasks_with_areas_path
        )
        self.selected_tasks_with_areas_df = self.load_tasks_with_areas_df(
            selected_task_counts_path, tasks_with_areas_path
        )

    @classmethod
    def _load_train_frame(cls, data_file):
        """Load ``data_file`` from ``DATASET_REPO`` and return its "train" split as a DataFrame."""
        return datasets.load_dataset(cls.DATASET_REPO, data_files=data_file)[
            "train"
        ].to_pandas()

    @classmethod
    def load_tasks_with_areas_df(
        cls, task_counts_path, tasks_with_areas_path="paperswithcode_tasks.csv"
    ):
        """Load the task/area table joined with a task-count table.

        Args:
            task_counts_path: ``data_files`` value for the task counts.
            tasks_with_areas_path: ``data_files`` value for the task/area
                table.

        Returns:
            The task/area table merged with the counts on ``task``. The merge
            uses pandas' default (inner) join, so tasks missing from either
            table are dropped.
        """
        task_counts_df = cls._load_train_frame(task_counts_path)
        raw_tasks_with_areas_df = cls._load_train_frame(tasks_with_areas_path)
        return raw_tasks_with_areas_df.merge(task_counts_df, on="task")

    @classmethod
    def get_topk_merge_others(cls, df, by_col, val_col, k=10, val_threshold=1000):
        """Keep the largest labels and sum everything else under "other".

        A label in ``by_col`` is kept only if it is among the ``k`` rows with
        the largest ``val_col`` and its value is at least ``val_threshold``.
        Every other row is relabelled "other". Values are then summed per
        label.

        Args:
            df: Input table; it is not modified.
            by_col: Name of the label column.
            val_col: Name of the numeric column to rank by and to sum.
            k: Number of top rows that are eligible to be kept.
            val_threshold: Minimum value a top row needs to be kept.

        Returns:
            A DataFrame indexed by ``by_col`` with a single ``val_col`` column
            holding the per-label sums.
        """
        # sort_values already returns a new frame, so the input stays untouched.
        ranked = df.sort_values(val_col, ascending=False)
        top_rows = ranked.iloc[:k]
        # Building a dict keeps the last (smallest) value for a repeated label.
        top_values = dict(zip(top_rows[by_col], top_rows[val_col]))
        kept_labels = {
            label for label, value in top_values.items() if value >= val_threshold
        }
        ranked[by_col] = ranked[by_col].apply(
            lambda label: label if label in kept_labels else cls.OTHER_LABEL
        )
        # The string "sum" selects pandas' own reduction; passing the builtin
        # ``sum`` is deprecated in recent pandas releases.
        return ranked.groupby(by_col).agg({val_col: "sum"})

    @classmethod
    def get_displayed_tasks_with_areas_df(cls, tasks_with_areas_df, min_task_count):
        """Build the per-area table that the sunburst chart displays.

        Rows containing missing values are dropped. Tasks whose ``count`` is
        below ``min_task_count`` are relabelled "other". Within each area the
        tasks are then reduced with ``get_topk_merge_others`` (default ``k``),
        and each task label gets its summed count appended, e.g. ``"qa 7"``.

        Args:
            tasks_with_areas_df: Table with ``area``, ``task`` and ``count``
                columns; it is not modified.
            min_task_count: Minimum count for a task to be shown on its own.

        Returns:
            A DataFrame with ``area``, ``task`` (label plus count) and
            ``count`` columns. It is empty when no complete rows remain.
        """
        complete_rows = tasks_with_areas_df.dropna().copy()
        if complete_rows.empty:
            # Nothing to group; return an empty table with the usual columns.
            return pd.DataFrame(columns=["area", "task", "count"])

        complete_rows["task"] = complete_rows["task"].where(
            complete_rows["count"] >= min_task_count, cls.OTHER_LABEL
        )
        # Only the label and value columns are passed to the per-area helper,
        # so the apply does not operate on the grouping column itself.
        per_area = (
            complete_rows.groupby("area")[["task", "count"]]
            .apply(
                lambda group: cls.get_topk_merge_others(
                    group, "task", "count", val_threshold=min_task_count
                )
            )
            .reset_index()
        )
        per_area["task"] = per_area["task"] + " " + per_area["count"].map(str)
        return per_area

    def get_tasks_sunburst(self, min_task_count, which_df="selected"):
        """Return a plotly sunburst figure with areas as the inner ring and tasks as the outer.

        Args:
            min_task_count: Minimum count for a task to be shown on its own.
            which_df: ``"selected"`` uses the selected-task table; any other
                value uses the full task table.

        Returns:
            The object returned by ``plotly.express.sunburst``.
        """
        if which_df == "selected":
            source_df = self.selected_tasks_with_areas_df
        else:
            source_df = self.tasks_with_areas_df

        displayed_df = self.get_displayed_tasks_with_areas_df(
            source_df, min_task_count
        )
        return px.sunburst(displayed_df, path=["area", "task"], values="count")
|
|