Metin committed on
Commit
fdfe8da
·
1 Parent(s): 20b7f29
Files changed (7)
  1. src/config.py +0 -6
  2. src/demo.py +0 -11
  3. src/embedding.py +0 -1
  4. src/gnn.py +5 -22
  5. src/heuristic.py +1 -25
  6. src/utils.py +2 -30
  7. src/visualization.py +1 -1
src/config.py CHANGED
@@ -56,7 +56,6 @@ class Config(BaseSettings):
     }
 
     COLOR_MAPPING: dict[str, str] = {
-        # STEM & Natural Sciences -> Emerald (#06d6a0)
         'Biology': '#06d6a0',
         'Chemistry': '#06d6a0',
         'Earth_and_environment': '#06d6a0',
@@ -65,7 +64,6 @@ class Config(BaseSettings):
         'STEM': '#06d6a0',
         'Space': '#06d6a0',
 
-        # Geography & Places -> Ocean Blue (#118ab2)
         'Africa': '#118ab2',
         'Americas': '#118ab2',
         'Asia': '#118ab2',
@@ -73,7 +71,6 @@ class Config(BaseSettings):
         'Oceania': '#118ab2',
         'Geographical': '#118ab2',
 
-        # Arts, Entertainment & Culture -> Bubblegum Pink (#ef476f)
         'Entertainment': '#ef476f',
         'Fashion': '#ef476f',
         'Films': '#ef476f',
@@ -83,7 +80,6 @@ class Config(BaseSettings):
         'Visual_arts': '#ef476f',
         'Literature': '#ef476f',
 
-        # Tech, Engineering & Infrastructure -> Dark Teal (#073b4c)
         'Architecture': '#073b4c',
         'Computing': '#073b4c',
         'Engineering': '#073b4c',
@@ -92,7 +88,6 @@ class Config(BaseSettings):
         'Transportation': '#073b4c',
         'Video_games': '#073b4c',
 
-        # Society, Humanities & Lifestyle -> Coral Glow (#f78c6b)
         'Biography': '#f78c6b',
         'Food_and_drink': '#f78c6b',
         'Linguistics': '#f78c6b',
@@ -101,7 +96,6 @@ class Config(BaseSettings):
         'Society': '#f78c6b',
         'Sports': '#f78c6b',
 
-        # Institutions, History & Governance -> Royal Gold (#ffd166)
         'Business_and_economics': '#ffd166',
         'Education': '#ffd166',
         'History': '#ffd166',
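
Note that the deleted comments were the only record of the palette grouping (STEM -> emerald #06d6a0, geography -> ocean blue #118ab2, arts -> pink #ef476f, tech -> dark teal #073b4c, society -> coral #f78c6b, institutions -> gold #ffd166). A minimal sketch of how the same grouping could be kept in code rather than comments; the _PALETTE helper and its group names are hypothetical, not part of this repository:

# Hypothetical alternative: derive COLOR_MAPPING from explicit palette groups,
# so the thematic grouping survives without inline comments.
_PALETTE: dict[str, tuple[str, list[str]]] = {
    "stem": ("#06d6a0", ["Biology", "Chemistry", "Earth_and_environment", "STEM", "Space"]),
    "geography": ("#118ab2", ["Africa", "Americas", "Asia", "Oceania", "Geographical"]),
    "arts": ("#ef476f", ["Entertainment", "Fashion", "Films", "Visual_arts", "Literature"]),
    "tech": ("#073b4c", ["Architecture", "Computing", "Engineering", "Transportation", "Video_games"]),
    "society": ("#f78c6b", ["Biography", "Food_and_drink", "Linguistics", "Society", "Sports"]),
    "institutions": ("#ffd166", ["Business_and_economics", "Education", "History"]),
}

COLOR_MAPPING: dict[str, str] = {
    label: color for color, labels in _PALETTE.values() for label in labels
}

Either form yields the same flat COLOR_MAPPING; the comprehension just keeps the category-to-colour relationship queryable in code.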
src/demo.py CHANGED
@@ -85,17 +85,6 @@ if "setup_complete" not in st.session_state:
     st.session_state.setup_complete = True
 
 
-    # node_styles = [
-    #     NodeStyle("PERSON", "#FF7F3E", "name", "person"),
-    #     NodeStyle("POST", "#2A629A", "content", "description"),
-    # ]
-
-    # edge_styles = [
-    #     EdgeStyle("FOLLOWS", caption="label", directed=True),
-    #     EdgeStyle("POSTED", caption="label", directed=True),
-    #     EdgeStyle("QUOTES", caption="label", directed=True),
-    # ]
-
     node_styles = get_node_styles()
     edge_styles = get_edge_styles()
 
src/embedding.py CHANGED
@@ -9,7 +9,6 @@ from transformers import AutoModel, AutoTokenizer
 
 class Embedder:
     def __init__(self, path):
-        # time.sleep(1)
         self.model_name_or_path = path
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
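
Only a leftover time.sleep(1) is dropped here. For orientation, a minimal sketch of how an embedder built on AutoModel/AutoTokenizer typically turns an article's text into the fixed-size vector consumed downstream (the new_embedding in gnn.py); the SketchEmbedder class, its embed() method, and the mean-pooling step are assumptions, not code from this file:

import torch
from transformers import AutoModel, AutoTokenizer

class SketchEmbedder:
    # Hypothetical sibling of src/embedding.py's Embedder: same constructor shape,
    # plus an assumed embed() that mean-pools the last hidden state.
    def __init__(self, path):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModel.from_pretrained(path).to(self.device).eval()

    @torch.no_grad()
    def embed(self, text: str) -> torch.Tensor:
        batch = self.tokenizer(text, return_tensors="pt", truncation=True).to(self.device)
        hidden = self.model(**batch).last_hidden_state           # (1, seq_len, hidden_dim)
        mask = batch["attention_mask"].unsqueeze(-1)              # (1, seq_len, 1)
        return ((hidden * mask).sum(1) / mask.sum(1)).squeeze(0)  # (hidden_dim,), e.g. 768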
src/gnn.py CHANGED
@@ -14,8 +14,6 @@ class GNNClassifier(torch.nn.Module):
         self.layers = layers
         self.output_dim = output_dim
 
-        # IMPROVEMENT 1: Reduce to 2 layers to prevent over-smoothing
-        # If you really need 3 layers, you must add Residual Connections (x = x + conv(x))
         if layers == 2:
             self.conv1 = GCNConv(input_dim, hidden_dim)
             self.conv2 = GCNConv(hidden_dim, output_dim)
@@ -27,15 +25,10 @@ class GNNClassifier(torch.nn.Module):
     def forward(self, data):
         x, edge_index = data.x, data.edge_index
 
-        # Layer 1
         x = self.conv1(x, edge_index)
         x = F.relu(x)
-
-        # IMPROVEMENT 2: Higher Dropout (0.5 is standard for citation networks)
-        # This prevents the model from relying too much on specific neighbor connections
         x = F.dropout(x, p=self.dropout_rate, training=self.training)
 
-        # Layer 2
         x = self.conv2(x, edge_index)
 
         if self.layers == 3:
@@ -63,8 +56,8 @@ def load_data(version: str = "undirected"):
 def infer_new_node(
     base_data: Data,
     model: torch.nn.Module,
-    new_embedding,  # shape (768,) list/np array/torch
-    referenced_titles: list[str],  # titles the user selected
+    new_embedding,
+    referenced_titles: list[str],
     title_to_id: dict[str, int],
     label_mapping: dict[str, int],
     device: torch.device,
@@ -73,11 +66,9 @@ def infer_new_node(
 ):
     model.eval()
 
-    # Move model to device
     model = model.to(device)
     base_data = base_data.to(device)
 
-    # --- 1) Prepare new node feature ---
     x_old = base_data.x
     new_x = torch.tensor(new_embedding, dtype=x_old.dtype).view(1, -1)
     new_x = new_x.to(device)
@@ -85,7 +76,6 @@ def infer_new_node(
 
     new_id = x.size(0) - 1
 
-    # --- 2) Build new edges that attach the node ---
     src_list = []
     tgt_list = []
 
@@ -94,19 +84,13 @@ def infer_new_node(
             continue
         old_id = title_to_id[t]
 
-        # If you want new node to be influenced by referenced nodes in 1 hop,
-        # you need edges old -> new (incoming to new).
         src_list.append(old_id)
         tgt_list.append(new_id)
 
-        # Optional: also add new -> old to make it undirected / symmetric
         if make_undirected_for_new_node:
             src_list.append(new_id)
             tgt_list.append(old_id)
 
-    # If the user picked nothing, the node is isolated; GCNConv can still work
-    # because it adds self-loops by default, but performance may be weak.
-
     if len(src_list) > 0 and use_edges:
         new_edges = torch.tensor([src_list, tgt_list], dtype=torch.long)
         new_edges = new_edges.to(device)
@@ -114,19 +98,18 @@ def infer_new_node(
     else:
         edge_index = base_data.edge_index
 
-    # --- 3) Run inference on the augmented graph ---
     data_aug = Data(x=x, edge_index=edge_index).to(device)
 
     with torch.no_grad():
-        out = model(data_aug)  # your model returns raw logits
+        out = model(data_aug)
         log_probs = F.log_softmax(out, dim=1)
-        log_probs = log_probs[new_id]  # get log-probs for the new node only
+        log_probs = log_probs[new_id]
         pred_id = int(torch.argmax(log_probs).item())
 
     inv_label_mapping = {v: k for k, v in label_mapping.items()}
     pred_label = inv_label_mapping[pred_id]
 
-    probs = log_probs.exp().detach().cpu()  # convert log-probs -> probs
+    probs = log_probs.exp().detach().cpu()
 
     columns = ["Class", "Score"]
     result_df = pd.DataFrame(
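
With the comments gone, the inference recipe in infer_new_node is easiest to see end to end: append the new article's embedding as one extra row of x, wire that row to the nodes it references (optionally in both directions), run the GCN on the augmented graph, and read the softmax of the new row only. A self-contained toy sketch of that mechanic; TinyGCN, the random features, and the reference ids are illustrative stand-ins, not the repository's model or data:

import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

# Toy stand-in for GNNClassifier: a 2-layer GCN returning raw logits.
class TinyGCN(torch.nn.Module):
    def __init__(self, in_dim: int, hid_dim: int, out_dim: int):
        super().__init__()
        self.conv1 = GCNConv(in_dim, hid_dim)
        self.conv2 = GCNConv(hid_dim, out_dim)

    def forward(self, data: Data) -> torch.Tensor:
        h = F.relu(self.conv1(data.x, data.edge_index))
        return self.conv2(h, data.edge_index)

torch.manual_seed(0)
x = torch.randn(5, 8)                                     # 5 existing nodes, 8-dim features
edge_index = torch.tensor([[0, 1, 2, 3], [1, 2, 3, 4]])   # existing edges

# "New article": one extra feature row, attached symmetrically to the nodes it references.
new_x = torch.randn(1, 8)
x_aug = torch.cat([x, new_x], dim=0)
new_id = x_aug.size(0) - 1
refs = [0, 3]                                             # stand-ins for referenced node ids
extra = torch.tensor([refs + [new_id] * len(refs),
                      [new_id] * len(refs) + refs])
edge_aug = torch.cat([edge_index, extra], dim=1)

model = TinyGCN(in_dim=8, hid_dim=16, out_dim=3).eval()
with torch.no_grad():
    logits = model(Data(x=x_aug, edge_index=edge_aug))
    probs = F.softmax(logits[new_id], dim=0)              # class distribution for the new row only
print(probs)

The deleted comment also explained why an article with no selected references still gets a prediction: GCNConv adds self-loops by default, so the isolated row is classified from its own embedding alone, just with weaker context.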
src/heuristic.py CHANGED
@@ -13,70 +13,46 @@ def predict_topic_nth_degree(
     is_weighted: bool = False,
     decay_factor: float = 1.0,
 ) -> Optional[str]:
-    """
-    Predicts topic based on neighbors up to n-degrees away.
-
-    Args:
-        max_depth: How many hops to traverse (1 = direct neighbors, 2 = neighbors of neighbors).
-        decay_factor: Multiplier for distance. 1.0 = no decay.
-                      0.5 means a neighbor at depth 2 has half the voting power of depth 1.
-    """
-
-    # 1. Setup BFS
-    # Queue stores: (current_node_name, current_depth)
     queue = deque()
 
-    # We maintain a visited set to avoid cycles and processing the same node twice
     visited = set()
     visited.add(new_article_title)
 
-    # 2. Initialize BFS with the "Virtual" First Hop
-    # We iterate the input list 'edges' manually because the new article isn't in G.
     for ref in edges:
         if ref in G and ref not in visited:
             visited.add(ref)
-            queue.append((ref, 1))  # Depth 1
+            queue.append((ref, 1))
 
     if not queue:
         return None
 
     topic_scores = defaultdict(float)
 
-    # 3. Process BFS
     while queue:
         current_node, current_depth = queue.popleft()
 
-        # --- Score Calculation ---
         node_data = G.nodes[current_node]
         topic = node_data.get("label")
 
         if topic:
-            # Determine base weight
            if is_weighted:
                 neighbor_embedding = node_data["embedding"]
-                # Calculate similarity
                 base_score = cosine_similarity(
                     [new_article_embedding], [neighbor_embedding]
                 )[0][0]
             else:
                 base_score = 1.0
 
-            # Apply Distance Decay
-            # Formula: Score * (decay ^ (depth - 1))
-            # Depth 1: Score * 1
-            # Depth 2: Score * decay
             weighted_score = base_score * (decay_factor ** (current_depth - 1))
 
             topic_scores[topic] += weighted_score
 
-        # --- Expand to next level if within limit ---
         if current_depth < max_depth:
             for neighbor in G.neighbors(current_node):
                 if neighbor not in visited:
                     visited.add(neighbor)
                     queue.append((neighbor, current_depth + 1))
 
-    # 4. Determine Winner
     if not topic_scores:
         return None
 
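
The deleted docstring carried the scoring rule: each reachable neighbor votes for its label with weight base_score * decay_factor ** (depth - 1), where base_score is either 1.0 or the cosine similarity between the new article and that neighbor. A tiny worked sketch of the vote with made-up neighbors; the topics, scores, and depths below are illustrative, not data from the graph:

from collections import defaultdict

# Distance-decayed voting, as in predict_topic_nth_degree:
# score = base_score * decay_factor ** (depth - 1), summed per topic.
votes = [                     # (topic, base_score, depth) for hypothetical neighbors
    ("Biology", 1.0, 1),
    ("Biology", 1.0, 2),
    ("History", 1.0, 1),
]
decay_factor = 0.5

topic_scores = defaultdict(float)
for topic, base_score, depth in votes:
    topic_scores[topic] += base_score * (decay_factor ** (depth - 1))

print(dict(topic_scores))                        # {'Biology': 1.5, 'History': 1.0}
print(max(topic_scores, key=topic_scores.get))   # 'Biology'

With decay_factor=1.0 every hop counts equally; with 0.5 a depth-2 neighbor has half the voting power of a direct neighbor, exactly as the removed docstring described.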
src/utils.py CHANGED
@@ -48,42 +48,20 @@ def gather_neighbors(
 
 
 def get_neighbors_for_visualizer(graph: nx.Graph, start_node, depth=1):
-    """
-    Returns the neighbors of a node within a given depth in a format
-    compatible with Cytoscape-style visualizers.
-
-    Args:
-        graph (nx.Graph): The source NetworkX graph.
-        start_node: The title/ID of the node to start from.
-        depth (int): How many hops (degrees of separation) to traverse.
-
-    Returns:
-        dict: A dictionary containing 'nodes' and 'edges' formatted for the visualizer.
-    """
-
-    # 1. Create a subgraph of neighbors within the specified depth
-    # If the node doesn't exist, return empty structure or raise error
     if start_node not in graph:
         return {"nodes": [], "edges": []}
 
     subgraph = nx.ego_graph(graph, start_node, radius=depth)
 
-    # 2. Prepare data structures
     nodes_data = []
     edges_data = []
 
-    # Helper to map actual node names (titles) to integer IDs required by the format
-    # The example uses 1-based integers for IDs.
     node_to_id_map = {}
     current_id = 1
 
-    # 3. Process Nodes
     for node in subgraph.nodes():
-        # Assign an integer ID
         node_to_id_map[node] = current_id
 
-        # Get attributes (safely default if label is missing)
-        # We ignore 'embedding' as requested
         node_attrs = subgraph.nodes[node]
         label = node_attrs.get("label", "Unknown")
 
@@ -91,24 +69,20 @@ def get_neighbors_for_visualizer(graph: nx.Graph, start_node, depth=1):
             "data": {
                 "id": current_id,
                 "label": label,
-                "name": str(node),  # Using the node title/ID as 'name'
+                "name": str(node),
             }
         }
         nodes_data.append(node_obj)
         current_id += 1
 
-    # 4. Process Edges
-    # Edge IDs usually need to be unique strings or integers.
-    # We continue the counter from where nodes left off to ensure uniqueness.
     edge_id_counter = current_id
 
     for u, v in subgraph.edges():
         source_id = node_to_id_map[u]
         target_id = node_to_id_map[v]
 
-        # Get edge attributes if they exist (e.g., relationship type)
         edge_attrs = subgraph.edges[u, v]
-        edge_label = edge_attrs.get("label", "CITES")  # Default label if none exists
+        edge_label = edge_attrs.get("label", "CITES")
 
         edge_obj = {
             "data": {
@@ -121,7 +95,6 @@ def get_neighbors_for_visualizer(graph: nx.Graph, start_node, depth=1):
         edges_data.append(edge_obj)
         edge_id_counter += 1
 
-    # 5. Return the final structure
     return {"nodes": nodes_data, "edges": edges_data}
 
 
@@ -136,5 +109,4 @@ if __name__ == "__main__":
 
     neighbors = gather_neighbors(graph, test_title, test_references, depth=2)
 
-    # print(f"References for '{test_title}': {test_references}")
     print(f"Neighbors of '{test_title}': {neighbors}")
src/visualization.py CHANGED
@@ -4,7 +4,7 @@ from src.config import config
 def get_node_styles() -> list[NodeStyle]:
     node_styles = []
     for class_name in config.ICON_MAPPING.keys():
-        color = config.COLOR_MAPPING.get(class_name, "#888888")  # Default gray if not found
+        color = config.COLOR_MAPPING.get(class_name, "#888888")
         icon = config.ICON_MAPPING.get(class_name, None)
         node_styles.append(NodeStyle(
             label=class_name,