vietexob committed on
Commit a110d08 · 1 Parent(s): 4931ab5

Major bug fixed
Files changed (3):
  1. CLAUDE.md +15 -2
  2. app.py +82 -34
  3. llm_graph.py +9 -10
CLAUDE.md CHANGED
@@ -10,7 +10,10 @@ This is a Text2Graph application that extracts knowledge graphs from natural lan
 
 - **app.py**: Main Gradio application with UI components, visualization logic, and caching
 - **llm_graph.py**: Core LLMGraph class that handles model selection and knowledge graph extraction
+- **visualize.py**: Standalone script for visualizing GraphML files from LightRAG output
+- **data/**: Contains sample texts in multiple languages and system prompt templates
 - **cache/**: Directory for caching visualization data (first example is pre-cached for performance)
+- **sample/**: Working directory for LightRAG processing, cleared and recreated on each run
 
 ## Key Components
 
@@ -46,11 +49,14 @@ AZURE_EMBEDDING_API_VERSION=<embedding_api_version>
 # Install dependencies
 pip install -r requirements.txt
 
-# Run the Gradio app
+# Run the Gradio app locally (default port 7860)
 python app.py
 
 # Test model extraction directly
 python llm_graph.py
+
+# Visualize existing GraphML files (requires sample/ directory with GraphML file)
+python visualize.py
 ```
 
 ## Key Dependencies
@@ -87,4 +93,11 @@ The application expects JSON output with this schema:
 - First example is automatically cached for performance on startup
 - Cache files stored in `cache/` directory as pickle files
 - Working directory `sample/` is cleared and recreated on each run
-- GraphML files generated by LightRAG for Azure OpenAI model backend
+- GraphML files generated by LightRAG for Azure OpenAI model backend
+
+## Environment Configuration
+
+- Uses `.env` file for API keys and endpoints (see Environment Setup section)
+- Designed for Hugging Face Spaces deployment (see README.md frontmatter)
+- SpaCy model loading is handled automatically by the application
+- No additional configuration files (package.json, pyproject.toml, etc.) required
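Side note on the caching behavior documented above: the first-example cache is a plain pickle round-trip. A minimal sketch of that pattern, with illustrative helper names (`save_example_cache` / `load_example_cache` are not functions in app.py, and the payload layout is not the app's exact cache schema):

```python
import os
import pickle

CACHE_DIR = "./cache"
EXAMPLE_CACHE_FILE = os.path.join(CACHE_DIR, "first_example_cache.pkl")

def save_example_cache(payload):
    # Persist any picklable object (e.g. pre-rendered visualization data)
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(EXAMPLE_CACHE_FILE, "wb") as f:
        pickle.dump(payload, f)

def load_example_cache():
    # Return the cached payload, or None if the first example was never cached
    if not os.path.exists(EXAMPLE_CACHE_FILE):
        return None
    with open(EXAMPLE_CACHE_FILE, "rb") as f:
        return pickle.load(f)
```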
app.py CHANGED
@@ -1,15 +1,17 @@
 import os
+import re
 import time
 import spacy
 import shutil
 import pickle
 import random
-import hashlib
 
+import hashlib
 import logging
 import asyncio
 import warnings
 import rapidjson
+import unicodedata
 
 import gradio as gr
 import networkx as nx
@@ -29,9 +31,6 @@ SUBTITLE = "✨ Extract and visualize knowledge graphs from texts in any languag
 MIN_CHARS = 20
 MAX_CHARS = 3500
 
-# Keep track of all processed texts
-doc_ids = []
-
 # Basic CSS for styling
 CUSTOM_CSS = """
 .gradio-container {
@@ -43,7 +42,6 @@ CUSTOM_CSS = """
 CACHE_DIR = "./cache"
 WORKING_DIR = "./sample"
 EXAMPLE_CACHE_FILE = os.path.join(CACHE_DIR, "first_example_cache.pkl")
-GRAPHML_FILE = WORKING_DIR + "/graph_chunk_entity_relation.graphml"
 
 # Load the sample texts
 text_en_file1 = "./data/sample1_en.txt"
@@ -76,7 +74,7 @@ os.makedirs(WORKING_DIR, exist_ok=True)
 
 def get_random_light_color():
     """
-    Color utilities
+    Color utilities.
     """
 
     r = random.randint(140, 255)
@@ -87,7 +85,7 @@ def get_random_light_color():
 
 def handle_text(text=""):
     """
-    Text preprocessing
+    Text preprocessing.
     """
 
     # Catch empty text
@@ -96,9 +94,9 @@ def handle_text(text=""):
 
     return " ".join(text.split())
 
-def extract_kg(text="", model_name=MODEL_LIST[0], model=None):
+def extract_kg(text="", model_name=MODEL_LIST[0], model=None, graph_file=""):
     """
-    Extract knowledge graph from text
+    Extract knowledge graph from text.
     """
 
     # Catch empty text
@@ -109,7 +107,15 @@ def extract_kg(text="", model_name=MODEL_LIST[0], model=None):
 
     try:
         start_time = time.time()
-        result = model.extract(text, model_name)
+        if model_name == MODEL_LIST[1] and os.path.exists(graph_file):
+            # Load the graph directly from cache
+            logging.info(f"Loading graph from cache: {graph_file}")
+            G = nx.read_graphml(graph_file)
+
+            # Convert the graph to node-link data format
+            result = nx.node_link_data(G, edges="edges")
+        else:
+            result = model.extract(text, model_name, graph_file)
 
         end_time = time.time()
         duration = end_time - start_time
@@ -155,7 +161,7 @@ def find_token_indices(doc, substring, text):
 
 def create_custom_entity_viz(data, full_text, type_col="type"):
     """
-    Create custom entity visualization using spaCy's displacy
+    Create custom entity visualization using spaCy's displaCy.
     """
 
     nlp = spacy.blank("xx")
@@ -201,9 +207,9 @@ def create_custom_entity_viz(data, full_text, type_col="type"):
 
     return styled_html
 
-def create_graph(json_data, model_name=MODEL_LIST[0]):
+def create_graph(json_data, model_name=MODEL_LIST[0], graph_file=""):
     """
-    Create interactive knowledge graph using pyvis
+    Create interactive knowledge graph using Pyvis.
     """
 
     if model_name == MODEL_LIST[0]:
@@ -227,12 +233,12 @@ def create_graph(json_data, model_name=MODEL_LIST[0]):
         label = edge.get('label', 'related')
         G.add_edge(edge['from'], edge['to'], title=label, label=label)
     else:
-        G = nx.read_graphml(GRAPHML_FILE)
+        assert graph_file, "Graph file path cannot be empty or None."
+        G = nx.read_graphml(graph_file)
 
     # Create network visualization
     network = Network(
         width="100%",
-        # height="700px",
         height="100vh",
         notebook=False,
         bgcolor="#f8fafc",
@@ -281,9 +287,49 @@ def create_graph(json_data, model_name=MODEL_LIST[0]):
     allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
     allowpaymentrequest="" frameborder="0" srcdoc='{html}'></iframe>"""
 
+def fuzzy_text_hash(text, algorithm='md5'):
+    """
+    Generate a hash that treats nearly identical texts as the same.
+
+    This function normalizes text by:
+    - Converting to lowercase
+    - Removing punctuation and special characters
+    - Normalizing whitespace (multiple spaces become single space)
+    - Removing leading/trailing whitespace
+    - Normalizing Unicode characters
+
+    Args:
+        text (str): The input text to hash
+        algorithm (str): Hash algorithm to use ('md5', 'sha1', 'sha256', 'sha512')
+
+    Returns:
+        str: Hexadecimal hash string
+    """
+
+    # Normalize Unicode characters (decompose accented characters, etc.)
+    normalized = unicodedata.normalize('NFKD', text)
+
+    # Convert to lowercase
+    normalized = normalized.lower()
+
+    # Remove all punctuation and special characters, keep only alphanumeric and spaces
+    normalized = re.sub(r'[^\w\s]', '', normalized)
+
+    # Normalize whitespace: replace multiple whitespace chars with single space
+    normalized = re.sub(r'\s+', ' ', normalized)
+
+    # Strip leading and trailing whitespace
+    normalized = normalized.strip()
+
+    # Create hash
+    hash_obj = hashlib.new(algorithm)
+    hash_obj.update(normalized.encode('utf-8'))
+
+    return hash_obj.hexdigest()
+
 def process_and_visualize(text, model_name, progress=gr.Progress()):
     """
-    Process text and visualize knowledge graph and entities
+    Process text and visualize knowledge graph and entities.
     """
 
     if not text or not model_name:
@@ -310,25 +356,27 @@ def process_and_visualize(text, model_name, progress=gr.Progress()):
     if len(text) > MAX_CHARS:
         raise gr.Error(f"⚠️ Text is too long! Please provide no more than {MAX_CHARS} characters.")
 
-    if model_name == MODEL_LIST[1]:
-        # Compute the unique hash for the document
-        doc_id = hashlib.md5(text.strip().encode()).hexdigest()
-
-        if doc_id not in doc_ids:
-            doc_ids.append(doc_id)
-
-            # Clear the working directory if it exists
-            if os.path.exists(WORKING_DIR):
-                shutil.rmtree(WORKING_DIR)
-            os.makedirs(WORKING_DIR, exist_ok=True)
-
-    # Initialize the LLMGraph model
-    model = LLMGraph()
-    asyncio.run(model.initialize_rag())
+    # Compute the unique hash for the document
+    # doc_id = hashlib.md5(text.strip().encode()).hexdigest()
+    doc_id = fuzzy_text_hash(text.strip())
+    logging.info(f"Document ID: {doc_id}")
+
+    # Create a working directory based on the hash
+    my_working_dir = os.path.join(WORKING_DIR, doc_id)
+    graph_file = os.path.join(my_working_dir, "graph_chunk_entity_relation.graphml")
+
+    # Check if the working directory exists (the doc has been processed before)
+    if not os.path.exists(my_working_dir):
+        # Create the working directory
+        os.makedirs(my_working_dir, exist_ok=True)
+
+    # Initialize the LLMGraph model
+    model = LLMGraph(working_dir=my_working_dir)
+    asyncio.run(model.initialize_rag())
 
     # Continue with normal processing if cache fails
     progress(0, desc="Starting extraction...")
-    json_data = extract_kg(text, model_name, model)
+    json_data = extract_kg(text, model_name, model, graph_file)
 
     progress(0.5, desc="Creating entity visualization...")
     if model_name == MODEL_LIST[0]:
@@ -337,7 +385,7 @@ def process_and_visualize(text, model_name, progress=gr.Progress()):
         entities_viz = create_custom_entity_viz(json_data, text, type_col="entity_type")
 
     progress(0.8, desc="Building knowledge graph...")
-    graph_html = create_graph(json_data, model_name)
+    graph_html = create_graph(json_data, model_name, graph_file)
 
     node_count = len(json_data["nodes"])
    edge_count = len(json_data["edges"])
@@ -383,7 +431,7 @@ def generate_first_example():
     model_name = MODEL_LIST[0] if MODEL_LIST else None
 
     # Initialize the LLMGraph model
-    model = LLMGraph()
+    model = LLMGraph(working_dir=WORKING_DIR)
     asyncio.run(model.initialize_rag())
 
     # Extract data
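What the change above buys: `fuzzy_text_hash` makes the per-document cache key stable under trivial edits, so resubmitting a lightly reformatted text maps to the same `doc_id` and reuses the existing `sample/<doc_id>/graph_chunk_entity_relation.graphml` instead of re-running LightRAG extraction. A quick sketch of the expected behavior, assuming `fuzzy_text_hash` from the diff above is in scope (the sample strings are illustrative):

```python
a = fuzzy_text_hash("Marie Curie won the Nobel Prize.")
b = fuzzy_text_hash("  marie curie won the Nobel Prize!! ")
c = fuzzy_text_hash("Marie Curie won two Nobel Prizes.")

assert a == b  # case, punctuation, and whitespace differences collapse to one doc_id
assert a != c  # a real wording change yields a new doc_id and a fresh working dir
```

Note the trade-off: MD5 here is a cache key, not a security boundary, and distinct texts that differ only in punctuation or casing will intentionally share a cache entry.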
llm_graph.py CHANGED
@@ -1,6 +1,5 @@
 import os
 import time
-# import shutil
 
 import numpy as np
 import networkx as nx
@@ -28,9 +27,6 @@ AZURE_OPENAI_ENDPOINT = os.environ["AZURE_OPENAI_ENDPOINT"]
 AZURE_EMBEDDING_DEPLOYMENT = os.environ["AZURE_EMBEDDING_DEPLOYMENT"]
 AZURE_EMBEDDING_API_VERSION = os.environ["AZURE_EMBEDDING_API_VERSION"]
 
-WORKING_DIR = "./sample"
-GRAPHML_FILE = WORKING_DIR + "/graph_chunk_entity_relation.graphml"
-
 MODEL_LIST = [
     "EmergentMethods/Phi-3-mini-128k-instruct-graph",
     "OpenAI/GPT-4.1-mini",
@@ -53,7 +49,7 @@ class LLMGraph:
 
         if self.rag is None:
             self.rag = LightRAG(
-                working_dir=WORKING_DIR,
+                working_dir=self.working_dir,
                 llm_model_func=self._llm_model_func,
                 embedding_func=EmbeddingFunc(
                     embedding_dim=embedding_dimension,
@@ -79,7 +75,7 @@ class LLMGraph:
 
         # return True
 
-    def __init__(self):
+    def __init__(self, working_dir):
         """
         Initialize the Phi3InstructGraph with a specified model.
         """
@@ -91,6 +87,7 @@ class LLMGraph:
         )
 
         self.rag = None  # Lazy loading of RAG instance
+        self.working_dir = working_dir
 
     def _generate(self, messages):
         """
@@ -133,7 +130,7 @@ class LLMGraph:
 
         return messages
 
-    def extract(self, text, model_name=MODEL_LIST[0]):
+    def extract(self, text, model_name=MODEL_LIST[0], graph_file=""):
         """
         Extract knowledge graph in structured format from text.
         """
@@ -145,15 +142,17 @@ class LLMGraph:
             json_graph = self._generate(messages)
             return json_graph
         else:
+            assert graph_file, "Graph file path cannot be empty or None."
+
             # Use LightRAG with Azure OpenAI
             self.rag.insert(text)  # Insert the text into the RAG storage
 
-            # Wait for GRAPHML_FILE to be created
-            while not os.path.exists(GRAPHML_FILE):
+            # Wait for the graph file to be created
+            while not os.path.exists(graph_file):
                 time.sleep(0.1)  # Sleep for 0.1 seconds before checking again
 
             # Extract dict format of the knowledge graph
-            G = nx.read_graphml(GRAPHML_FILE)
+            G = nx.read_graphml(graph_file)
 
             # Convert the graph to node-link data format
             dict_graph = nx.node_link_data(G, edges="edges")
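For reference, `nx.node_link_data(G, edges="edges")` at the end of `extract` is what flattens the GraphML graph into the JSON-style dict the UI consumes; the `edges` keyword (available in recent NetworkX releases) names the link list `"edges"` instead of the legacy `"links"`. A minimal sketch with illustrative node and edge attributes:

```python
import networkx as nx

G = nx.Graph()
G.add_node("Marie Curie", entity_type="person")
G.add_node("Physics", entity_type="field")
G.add_edge("Marie Curie", "Physics", description="pioneered research in")

data = nx.node_link_data(G, edges="edges")
# data is shaped like:
# {"directed": False, "multigraph": False, "graph": {},
#  "nodes": [{"entity_type": "person", "id": "Marie Curie"},
#            {"entity_type": "field", "id": "Physics"}],
#  "edges": [{"description": "pioneered research in",
#             "source": "Marie Curie", "target": "Physics"}]}
```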