vietexob committed on
Commit a110d08 · 1 Parent(s): 4931ab5

Major bug fixed
Files changed (3):
  1. CLAUDE.md +15 -2
  2. app.py +82 -34
  3. llm_graph.py +9 -10
CLAUDE.md CHANGED
@@ -10,7 +10,10 @@ This is a Text2Graph application that extracts knowledge graphs from natural lan
 
 - **app.py**: Main Gradio application with UI components, visualization logic, and caching
 - **llm_graph.py**: Core LLMGraph class that handles model selection and knowledge graph extraction
+- **visualize.py**: Standalone script for visualizing GraphML files from LightRAG output
+- **data/**: Contains sample texts in multiple languages and system prompt templates
 - **cache/**: Directory for caching visualization data (first example is pre-cached for performance)
+- **sample/**: Working directory for LightRAG processing, cleared and recreated on each run
 
 ## Key Components
 
@@ -46,11 +49,14 @@ AZURE_EMBEDDING_API_VERSION=<embedding_api_version>
 # Install dependencies
 pip install -r requirements.txt
 
-# Run the Gradio app
+# Run the Gradio app locally (default port 7860)
 python app.py
 
 # Test model extraction directly
 python llm_graph.py
+
+# Visualize existing GraphML files (requires sample/ directory with GraphML file)
+python visualize.py
 ```
 
 ## Key Dependencies
@@ -87,4 +93,11 @@ The application expects JSON output with this schema:
 - First example is automatically cached for performance on startup
 - Cache files stored in `cache/` directory as pickle files
 - Working directory `sample/` is cleared and recreated on each run
-- GraphML files generated by LightRAG for Azure OpenAI model backend
+- GraphML files generated by LightRAG for Azure OpenAI model backend
+
+## Environment Configuration
+
+- Uses `.env` file for API keys and endpoints (see Environment Setup section)
+- Designed for Hugging Face Spaces deployment (see README.md frontmatter)
+- SpaCy model loading is handled automatically by the application
+- No additional configuration files (package.json, pyproject.toml, etc.) required
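Side note on the caching behavior documented above: the first-example cache is a plain pickle round-trip. A minimal sketch of that pattern, with illustrative helper names (`save_example_cache` / `load_example_cache` are not functions in app.py, and the payload layout is not the app's exact cache schema):

```python
import os
import pickle

CACHE_DIR = "./cache"
EXAMPLE_CACHE_FILE = os.path.join(CACHE_DIR, "first_example_cache.pkl")

def save_example_cache(payload):
    # Persist any picklable object (e.g. pre-rendered visualization data)
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(EXAMPLE_CACHE_FILE, "wb") as f:
        pickle.dump(payload, f)

def load_example_cache():
    # Return the cached payload, or None if the first example was never cached
    if not os.path.exists(EXAMPLE_CACHE_FILE):
        return None
    with open(EXAMPLE_CACHE_FILE, "rb") as f:
        return pickle.load(f)
```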
app.py CHANGED
@@ -1,15 +1,17 @@
 import os
+import re
 import time
 import spacy
 import shutil
 import pickle
 import random
-import hashlib
 
+import hashlib
 import logging
 import asyncio
 import warnings
 import rapidjson
+import unicodedata
 
 import gradio as gr
 import networkx as nx
@@ -29,9 +31,6 @@ SUBTITLE = "✨ Extract and visualize knowledge graphs from texts in any languag
 MIN_CHARS = 20
 MAX_CHARS = 3500
 
-# Keep track of all processed texts
-doc_ids = []
-
 # Basic CSS for styling
 CUSTOM_CSS = """
 .gradio-container {
@@ -43,7 +42,6 @@ CUSTOM_CSS = """
 CACHE_DIR = "./cache"
 WORKING_DIR = "./sample"
 EXAMPLE_CACHE_FILE = os.path.join(CACHE_DIR, "first_example_cache.pkl")
-GRAPHML_FILE = WORKING_DIR + "/graph_chunk_entity_relation.graphml"
 
 # Load the sample texts
 text_en_file1 = "./data/sample1_en.txt"
@@ -76,7 +74,7 @@ os.makedirs(WORKING_DIR, exist_ok=True)
 
 def get_random_light_color():
     """
-    Color utilities
+    Color utilities.
     """
 
     r = random.randint(140, 255)
@@ -87,7 +85,7 @@ def get_random_light_color():
 
 def handle_text(text=""):
     """
-    Text preprocessing
+    Text preprocessing.
     """
 
     # Catch empty text
@@ -96,9 +94,9 @@ def handle_text(text=""):
 
     return " ".join(text.split())
 
-def extract_kg(text="", model_name=MODEL_LIST[0], model=None):
+def extract_kg(text="", model_name=MODEL_LIST[0], model=None, graph_file=""):
     """
-    Extract knowledge graph from text
+    Extract knowledge graph from text.
     """
 
     # Catch empty text
@@ -109,7 +107,15 @@ def extract_kg(text="", model_name=MODEL_LIST[0], model=None):
 
     try:
         start_time = time.time()
-        result = model.extract(text, model_name)
+        if model_name == MODEL_LIST[1] and os.path.exists(graph_file):
+            # Load the graph directly from cache
+            logging.info(f"Loading graph from cache: {graph_file}")
+            G = nx.read_graphml(graph_file)
+
+            # Convert the graph to node-link data format
+            result = nx.node_link_data(G, edges="edges")
+        else:
+            result = model.extract(text, model_name, graph_file)
 
         end_time = time.time()
         duration = end_time - start_time
@@ -155,7 +161,7 @@ def find_token_indices(doc, substring, text):
 
 def create_custom_entity_viz(data, full_text, type_col="type"):
     """
-    Create custom entity visualization using spaCy's displacy
+    Create custom entity visualization using spaCy's displaCy.
     """
 
     nlp = spacy.blank("xx")
@@ -201,9 +207,9 @@ def create_custom_entity_viz(data, full_text, type_col="type"):
 
     return styled_html
 
-def create_graph(json_data, model_name=MODEL_LIST[0]):
+def create_graph(json_data, model_name=MODEL_LIST[0], graph_file=""):
     """
-    Create interactive knowledge graph using pyvis
+    Create interactive knowledge graph using Pyvis.
     """
 
     if model_name == MODEL_LIST[0]:
@@ -227,12 +233,12 @@ def create_graph(json_data, model_name=MODEL_LIST[0]):
         label = edge.get('label', 'related')
         G.add_edge(edge['from'], edge['to'], title=label, label=label)
     else:
-        G = nx.read_graphml(GRAPHML_FILE)
+        assert graph_file, "Graph file path cannot be empty or None."
+        G = nx.read_graphml(graph_file)
 
     # Create network visualization
     network = Network(
         width="100%",
-        # height="700px",
         height="100vh",
         notebook=False,
         bgcolor="#f8fafc",
@@ -281,9 +287,49 @@ def create_graph(json_data, model_name=MODEL_LIST[0]):
     allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
     allowpaymentrequest="" frameborder="0" srcdoc='{html}'></iframe>"""
 
+def fuzzy_text_hash(text, algorithm='md5'):
+    """
+    Generate a hash that treats nearly identical texts as the same.
+
+    This function normalizes text by:
+    - Converting to lowercase
+    - Removing punctuation and special characters
+    - Normalizing whitespace (multiple spaces become single space)
+    - Removing leading/trailing whitespace
+    - Normalizing Unicode characters
+
+    Args:
+        text (str): The input text to hash
+        algorithm (str): Hash algorithm to use ('md5', 'sha1', 'sha256', 'sha512')
+
+    Returns:
+        str: Hexadecimal hash string
+    """
+
+    # Normalize Unicode characters (decompose accented characters, etc.)
+    normalized = unicodedata.normalize('NFKD', text)
+
+    # Convert to lowercase
+    normalized = normalized.lower()
+
+    # Remove all punctuation and special characters, keep only alphanumeric and spaces
+    normalized = re.sub(r'[^\w\s]', '', normalized)
+
+    # Normalize whitespace: replace multiple whitespace chars with single space
+    normalized = re.sub(r'\s+', ' ', normalized)
+
+    # Strip leading and trailing whitespace
+    normalized = normalized.strip()
+
+    # Create hash
+    hash_obj = hashlib.new(algorithm)
+    hash_obj.update(normalized.encode('utf-8'))
+
+    return hash_obj.hexdigest()
+
 def process_and_visualize(text, model_name, progress=gr.Progress()):
     """
-    Process text and visualize knowledge graph and entities
+    Process text and visualize knowledge graph and entities.
     """
 
     if not text or not model_name:
@@ -310,25 +356,27 @@ def process_and_visualize(text, model_name, progress=gr.Progress()):
     if len(text) > MAX_CHARS:
         raise gr.Error(f"⚠️ Text is too long! Please provide no more than {MAX_CHARS} characters.")
 
-    if model_name == MODEL_LIST[1]:
-        # Compute the unique hash for the document
-        doc_id = hashlib.md5(text.strip().encode()).hexdigest()
-
-        if doc_id not in doc_ids:
-            doc_ids.append(doc_id)
-
-            # Clear the working directory if it exists
-            if os.path.exists(WORKING_DIR):
-                shutil.rmtree(WORKING_DIR)
-            os.makedirs(WORKING_DIR, exist_ok=True)
-
-    # Initialize the LLMGraph model
-    model = LLMGraph()
-    asyncio.run(model.initialize_rag())
+    # Compute the unique hash for the document
+    # doc_id = hashlib.md5(text.strip().encode()).hexdigest()
+    doc_id = fuzzy_text_hash(text.strip())
+    logging.info(f"Document ID: {doc_id}")
+
+    # Create a working directory based on the hash
+    my_working_dir = os.path.join(WORKING_DIR, doc_id)
+    graph_file = os.path.join(my_working_dir, "graph_chunk_entity_relation.graphml")
+
+    # Check if the working directory exists (the doc has been processed before)
+    if not os.path.exists(my_working_dir):
+        # Create the working directory
+        os.makedirs(my_working_dir, exist_ok=True)
+
+    # Initialize the LLMGraph model
+    model = LLMGraph(working_dir=my_working_dir)
+    asyncio.run(model.initialize_rag())
 
     # Continue with normal processing if cache fails
     progress(0, desc="Starting extraction...")
-    json_data = extract_kg(text, model_name, model)
+    json_data = extract_kg(text, model_name, model, graph_file)
 
     progress(0.5, desc="Creating entity visualization...")
     if model_name == MODEL_LIST[0]:
@@ -337,7 +385,7 @@ def process_and_visualize(text, model_name, progress=gr.Progress()):
         entities_viz = create_custom_entity_viz(json_data, text, type_col="entity_type")
 
     progress(0.8, desc="Building knowledge graph...")
-    graph_html = create_graph(json_data, model_name)
+    graph_html = create_graph(json_data, model_name, graph_file)
 
     node_count = len(json_data["nodes"])
    edge_count = len(json_data["edges"])
@@ -383,7 +431,7 @@ def generate_first_example():
     model_name = MODEL_LIST[0] if MODEL_LIST else None
 
     # Initialize the LLMGraph model
-    model = LLMGraph()
+    model = LLMGraph(working_dir=WORKING_DIR)
     asyncio.run(model.initialize_rag())
 
     # Extract data
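What the change above buys: `fuzzy_text_hash` makes the per-document cache key stable under trivial edits, so resubmitting a lightly reformatted text maps to the same `doc_id` and reuses the existing `sample/<doc_id>/graph_chunk_entity_relation.graphml` instead of re-running LightRAG extraction. A quick sketch of the expected behavior, assuming `fuzzy_text_hash` from the diff above is in scope (the sample strings are illustrative):

```python
a = fuzzy_text_hash("Marie Curie won the Nobel Prize.")
b = fuzzy_text_hash("  marie curie won the Nobel Prize!! ")
c = fuzzy_text_hash("Marie Curie won two Nobel Prizes.")

assert a == b  # case, punctuation, and whitespace differences collapse to one doc_id
assert a != c  # a real wording change yields a new doc_id and a fresh working dir
```

Note the trade-off: MD5 here is a cache key, not a security boundary, and distinct texts that differ only in punctuation or casing will intentionally share a cache entry.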
llm_graph.py CHANGED
@@ -1,6 +1,5 @@
 import os
 import time
-# import shutil
 
 import numpy as np
 import networkx as nx
@@ -28,9 +27,6 @@ AZURE_OPENAI_ENDPOINT = os.environ["AZURE_OPENAI_ENDPOINT"]
 AZURE_EMBEDDING_DEPLOYMENT = os.environ["AZURE_EMBEDDING_DEPLOYMENT"]
 AZURE_EMBEDDING_API_VERSION = os.environ["AZURE_EMBEDDING_API_VERSION"]
 
-WORKING_DIR = "./sample"
-GRAPHML_FILE = WORKING_DIR + "/graph_chunk_entity_relation.graphml"
-
 MODEL_LIST = [
     "EmergentMethods/Phi-3-mini-128k-instruct-graph",
     "OpenAI/GPT-4.1-mini",
@@ -53,7 +49,7 @@ class LLMGraph:
 
         if self.rag is None:
             self.rag = LightRAG(
-                working_dir=WORKING_DIR,
+                working_dir=self.working_dir,
                 llm_model_func=self._llm_model_func,
                 embedding_func=EmbeddingFunc(
                     embedding_dim=embedding_dimension,
@@ -79,7 +75,7 @@ class LLMGraph:
 
         # return True
 
-    def __init__(self):
+    def __init__(self, working_dir):
         """
         Initialize the Phi3InstructGraph with a specified model.
         """
@@ -91,6 +87,7 @@ class LLMGraph:
         )
 
         self.rag = None  # Lazy loading of RAG instance
+        self.working_dir = working_dir
 
     def _generate(self, messages):
         """
@@ -133,7 +130,7 @@ class LLMGraph:
 
         return messages
 
-    def extract(self, text, model_name=MODEL_LIST[0]):
+    def extract(self, text, model_name=MODEL_LIST[0], graph_file=""):
         """
         Extract knowledge graph in structured format from text.
         """
@@ -145,15 +142,17 @@ class LLMGraph:
             json_graph = self._generate(messages)
             return json_graph
         else:
+            assert graph_file, "Graph file path cannot be empty or None."
+
             # Use LightRAG with Azure OpenAI
             self.rag.insert(text)  # Insert the text into the RAG storage
 
-            # Wait for GRAPHML_FILE to be created
-            while not os.path.exists(GRAPHML_FILE):
+            # Wait for the graph file to be created
+            while not os.path.exists(graph_file):
                 time.sleep(0.1)  # Sleep for 0.1 seconds before checking again
 
             # Extract dict format of the knowledge graph
-            G = nx.read_graphml(GRAPHML_FILE)
+            G = nx.read_graphml(graph_file)
 
             # Convert the graph to node-link data format
             dict_graph = nx.node_link_data(G, edges="edges")
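For reference, `nx.node_link_data(G, edges="edges")` at the end of `extract` is what flattens the GraphML graph into the JSON-style dict the UI consumes; the `edges` keyword (available in recent NetworkX releases) names the link list `"edges"` instead of the legacy `"links"`. A minimal sketch with illustrative node and edge attributes:

```python
import networkx as nx

G = nx.Graph()
G.add_node("Marie Curie", entity_type="person")
G.add_node("Physics", entity_type="field")
G.add_edge("Marie Curie", "Physics", description="pioneered research in")

data = nx.node_link_data(G, edges="edges")
# data is shaped like:
# {"directed": False, "multigraph": False, "graph": {},
#  "nodes": [{"entity_type": "person", "id": "Marie Curie"},
#            {"entity_type": "field", "id": "Physics"}],
#  "edges": [{"description": "pioneered research in",
#             "source": "Marie Curie", "target": "Physics"}]}
```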