Spaces: Running on Zero
Multiple updates
- README.md +2 -2
- app.py +117 -68
- app-backup.py → app_old.py +0 -0
- phi3_instruct_graph.py +32 -24
README.md
CHANGED

@@ -1,6 +1,6 @@
 ---
-title:
-emoji:
+title: Text2Graph
+emoji: π
 colorFrom: purple
 colorTo: pink
 sdk: gradio
app.py
CHANGED

@@ -1,19 +1,20 @@
 # import spaces
+import os
+import spacy
+import pickle
+import random
+import rapidjson
 import gradio as gr
+import networkx as nx
+
 from phi3_instruct_graph import Phi3InstructGraph
-import rapidjson
 from pyvis.network import Network
-import networkx as nx
-import spacy
 from spacy import displacy
 from spacy.tokens import Span
-import random
-import os
-import pickle

 # Constants
-TITLE = "π
-SUBTITLE = "✨ Extract and visualize knowledge graphs from
+TITLE = "π Text2Graph: Extract Knowledge Graphs from Natural Language"
+SUBTITLE = "✨ Extract and visualize knowledge graphs from texts in any language!"

 # Basic CSS for styling
 CUSTOM_CSS = """
@@ -29,40 +30,63 @@ EXAMPLE_CACHE_FILE = os.path.join(CACHE_DIR, "first_example_cache.pkl")
 # Create cache directory if it doesn't exist
 os.makedirs(CACHE_DIR, exist_ok=True)

-# Color utilities
 def get_random_light_color():
+    """
+    Color utilities
+    """
+
     r = random.randint(140, 255)
     g = random.randint(140, 255)
     b = random.randint(140, 255)
+
     return f"#{r:02x}{g:02x}{b:02x}"

+def handle_text(text=""):
+    """
+    Text preprocessing
+    """
+
+    # Catch empty text
+    if not text:
+        return ""
+
     return " ".join(text.split())

-#
+#
 # @spaces.GPU
-def
+def extract_kg(text=""):
+    """
+    Extract knowledge graph from text
+    """
+
+    # Catch empty text
+    if not text:
+        raise gr.Error("⚠️ Text must be provided!")
     try:
-        model = Phi3InstructGraph()
+        model = Phi3InstructGraph()
         result = model.extract(text)
         return rapidjson.loads(result)
     except Exception as e:
-        raise gr.Error(f"Extraction error: {str(e)}")
+        raise gr.Error(f"β Extraction error: {str(e)}")

 def find_token_indices(doc, substring, text):
+    """
+    Find token indices for a given substring in the text
+    based on the provided spaCy doc.
+    """
+
     result = []
+    start_idx = text.find(substring)

-    while
+    while start_idx != -1:
+        end_idx = start_idx + len(substring)
         start_token = None
         end_token = None

         for token in doc:
-            if token.idx ==
+            if token.idx == start_idx:
                 start_token = token.i
-            if token.idx + len(token) ==
+            if token.idx + len(token) == end_idx:
                 end_token = token.i + 1

         if start_token is not None and end_token is not None:
@@ -72,35 +96,41 @@ def find_token_indices(doc, substring, text):
             })

         # Search for next occurrence
+        start_idx = text.find(substring, end_idx)

     return result

 def create_custom_entity_viz(data, full_text):
+    """
+    Create custom entity visualization using spaCy's displacy
+    """
+
     nlp = spacy.blank("xx")
     doc = nlp(full_text)
     spans = []
     colors = {}
+
     for node in data["nodes"]:
         entity_spans = find_token_indices(doc, node["id"], full_text)
+
+        for entity in entity_spans:
+            start = entity["start"]
+            end = entity["end"]

             if start < len(doc) and end <= len(doc):
                 # Check for overlapping spans
                 overlapping = any(s.start < end and start < s.end for s in spans)
+
                 if not overlapping:
                     node_type = node.get("type", "Entity")
                     span = Span(doc, start, end, label=node_type)
                     spans.append(span)
+
                     if node_type not in colors:
                         colors[node_type] = get_random_light_color()

     doc.set_ents(spans, default="unmodified")
     doc.spans["sc"] = spans
     options = {
         "colors": colors,
         "ents": list(colors.keys()),
@@ -111,24 +141,30 @@ def create_custom_entity_viz(data, full_text):
     html = displacy.render(doc, style="span", options=options)
     # Add custom styling to the entity visualization
     styled_html = f"""
-    <div style="padding: 20px; border-radius: 12px; background-color:
+    <div style="padding: 20px; border-radius: 12px; background-color: gray; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);">
     {html}
     </div>
     """
+
     return styled_html

 def create_graph(json_data):
+    """
+    Create interactive knowledge graph using pyvis
+    """
+
     G = nx.Graph()

-    # Add nodes with tooltips
+    # Add nodes with tooltips and error handling for missing keys
     for node in json_data['nodes']:
         # Get node type with fallback
+        type = node.get("type", "Entity")
+
         # Get detailed type with fallback
-        detailed_type = node.get("detailed_type",
+        detailed_type = node.get("detailed_type", type)

         # Use node ID and type info for the tooltip
-        G.add_node(node['id'], title=f"{
+        G.add_node(node['id'], title=f"{type}: {detailed_type}")

     # Add edges with labels
     for edge in json_data['edges']:
@@ -138,18 +174,18 @@ def create_graph(json_data):
         G.add_edge(edge['from'], edge['to'], title=label, label=label)

     # Create network visualization
+    network = Network(
         width="100%",
         height="700px",
-        directed=
+        directed=False,
         notebook=False,
         bgcolor="#f8fafc",
         font_color="#1e293b"
     )

     # Configure network display
+    network.from_nx(G)
+    network.barnes_hut(
         gravity=-3000,
         central_gravity=0.3,
         spring_length=50,
@@ -159,21 +195,21 @@ def create_graph(json_data):
     )

     # Customize edge appearance
-    for edge in
+    for edge in network.edges:
         edge['width'] = 2
-        edge['arrows'] = {'to': {'enabled':
+        edge['arrows'] = {'to': {'enabled': False, 'type': 'arrow'}}
         edge['color'] = {'color': '#6366f1', 'highlight': '#4f46e5'}
         edge['font'] = {'size': 12, 'color': '#4b5563', 'face': 'Arial'}

     # Customize node appearance
-    for node in
+    for node in network.nodes:
         node['color'] = {'background': '#e0e7ff', 'border': '#6366f1', 'highlight': {'background': '#c7d2fe', 'border': '#4f46e5'}}
         node['font'] = {'size': 14, 'color': '#1e293b'}
         node['shape'] = 'dot'
         node['size'] = 25

     # Generate HTML with iframe to isolate styles
-    html =
+    html = network.generate_html()
     html = html.replace("'", '"')

     return f"""<iframe style="width: 100%; height: 700px; margin: 0 auto; border-radius: 12px; box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -4px rgba(0, 0, 0, 0.1);"
@@ -183,6 +219,10 @@ def create_graph(json_data):
             allowpaymentrequest="" frameborder="0" srcdoc='{html}'></iframe>"""

 def process_and_visualize(text, progress=gr.Progress()):
+    """
+    Process text and visualize knowledge graph and entities
+    """
+
     if not text:
         raise gr.Error("⚠️ Text must be provided!")

@@ -200,10 +240,10 @@ def process_and_visualize(text, progress=gr.Progress()):
             return cache_data["graph_html"], cache_data["entities_viz"], cache_data["json_data"], cache_data["stats"]
         except Exception as e:
             print(f"Cache loading error: {str(e)}")
+
+    # Continue with normal processing if cache fails
     progress(0, desc="Starting extraction...")
-    json_data =
+    json_data = extract_kg(text)

     progress(0.5, desc="Creating entity visualization...")
     entities_viz = create_custom_entity_viz(json_data, text)
@@ -232,33 +272,40 @@ def process_and_visualize(text, progress=gr.Progress()):
     progress(1.0, desc="Complete!")
     return graph_html, entities_viz, json_data, stats

-# Example texts
+# Example texts
 EXAMPLES = [
-    [handle_text("""The family of Azerbaijan President Ilham Aliyev leads a charmed, glamorous life, thanks in part to financial interests in almost every sector of the economy.
+    [handle_text("""The family of Azerbaijan President Ilham Aliyev leads a charmed, glamorous life, thanks in part to financial interests in almost every sector of the economy.
+    His wife, Mehriban, comes from the privileged and powerful Pashayev family that owns banks, insurance and construction companies,
+    a television station and a line of cosmetics. She has led the Heydar Aliyev Foundation, Azerbaijan's pre-eminent charity behind the construction of schools,
+    hospitals and the country's major sports complex. Their eldest daughter, Leyla, editor of Baku magazine, and her sister, Arzu,
+    have financial stakes in a firm that won rights to mine for gold in the western village of Chovdar and Azerfon, the country's largest mobile phone business.
+    Arzu is also a significant shareholder in SW Holding, which controls nearly every operation related to Azerbaijan Airlines ("Azal"), from meals to airport taxis.
+    Both sisters and brother Heydar own property in Dubai valued at roughly $75 million in 2010;
+    Heydar is the legal owner of nine luxury mansions in Dubai purchased for some $44 million.""")],
+
+    [handle_text("""Legendary rock band Aerosmith has officially announced their retirement from touring after 54 years,
+    citing lead singer Steven Tyler's unrecoverable vocal cord injury.
+    The decision comes after months of unsuccessful treatment for Tyler's fractured larynx, which he suffered in September 2023.""")],

     [handle_text("""Pop star Justin Timberlake, 43, had his driver's license suspended by a New York judge during a virtual
-    pleaded not guilty to the charges.""")],
+    court hearing on August 2, 2024. The suspension follows Timberlake's arrest for driving while intoxicated (DWI)
+    in Sag Harbor on June 18. Timberlake, who is currently on tour in Europe, pleaded not guilty to the charges.""")],
 ]

-# Function to preprocess the first example when the app starts
 def generate_first_example_cache():
-    """
+    """
+    Generate cache for the first example if it doesn't exist when the app starts
+    """
+
     if not os.path.exists(EXAMPLE_CACHE_FILE):
         print("Generating cache for first example...")
+
         try:
             text = EXAMPLES[0][0]
-            # if model:
+            model = Phi3InstructGraph()
+
             # Extract data
-            json_data =
+            json_data = extract_kg(text, model)
             entities_viz = create_custom_entity_viz(json_data, text)
             graph_html = create_graph(json_data)

@@ -267,17 +314,18 @@ def generate_first_example_cache():
             stats = f"π Extracted {node_count} entities and {edge_count} relationships"

             # Save to cache
+            cached_data = {
                 "graph_html": graph_html,
                 "entities_viz": entities_viz,
                 "json_data": json_data,
                 "stats": stats
             }
+
             with open(EXAMPLE_CACHE_FILE, 'wb') as f:
-                pickle.dump(
+                pickle.dump(cached_data, f)
             print("First example cache generated successfully")
+
+            return cached_data
         except Exception as e:
             print(f"Error generating first example cache: {str(e)}")
     else:
@@ -291,6 +339,10 @@ def generate_first_example_cache():
     return None

 def create_ui():
+    """
+    Create the Gradio UI
+    """
+
     # Try to generate/load the first example cache
     first_example_cache = generate_first_example_cache()

@@ -299,9 +351,6 @@ def create_ui():
         gr.Markdown(f"# {TITLE}")
         gr.Markdown(f"{SUBTITLE}")

-        with gr.Row():
-            gr.Markdown("π **Multilingual Support Available**")
-
         # Main content area - redesigned layout
         with gr.Row():
             # Left panel - Input controls
@@ -381,10 +430,10 @@ def create_ui():

         # Footer
         gr.Markdown("---")
-        gr.Markdown("π **Instructions:** Enter text in any language
-        gr.Markdown("π οΈ Powered by Phi-3
+        gr.Markdown("π **Instructions:** Enter text in any language and click 'Extract & Visualize' to generate a knowledge graph.")
+        gr.Markdown("π οΈ Powered by [Phi-3-mini-128k-instruct-graph](https://huggingface.co/EmergentMethods/Phi-3-mini-128k-instruct-graph)")

     return demo

 demo = create_ui()
-demo.launch(share=False)
+demo.launch(share=False)
app-backup.py → app_old.py
RENAMED

File without changes
phi3_instruct_graph.py
CHANGED

@@ -16,10 +16,17 @@ client = InferenceClient(

 class Phi3InstructGraph:
     def __init__(self, model = "EmergentMethods/Phi-3-mini-4k-instruct-graph"):
+        """
+        Initialize the Phi3InstructGraph with a specified model.
+        """
+
         self.model_path = model

     def _generate(self, messages):
+        """
+        Generate a response from the model based on the provided messages.
+        """
+
         # Use the chat_completion method
         response = client.chat_completion(
             messages=messages,
@@ -31,6 +38,10 @@ class Phi3InstructGraph:
         return generated_text

     def _get_messages(self, text):
+        """
+        Construct the message list for the chat model.
+        """
+
         context = dedent("""\n
         A chat between a curious user and an artificial intelligence Assistant. The Assistant is an expert at identifying entities and relationships in text. The Assistant responds in JSON output only.

@@ -66,31 +77,28 @@ class Phi3InstructGraph:
         -------Text end-------
         """)

-        if self.model_path == "EmergentMethods/Phi-3-medium-128k-instruct-graph":
-            {
-                "role": "user",
-                "content": user_message
-            }
-        ]
-        return messages
-
+        # if self.model_path == "EmergentMethods/Phi-3-medium-128k-instruct-graph":
+        messages = [
+            {
+                "role": "system",
+                "content": context
+            },
+            {
+                "role": "user",
+                "content": user_message
+            }
+        ]
+        # else:
+        #     # TODO: update for other models
+
+        return messages

     def extract(self, text):
+        """
+        Extract knowledge graph from text
+        """
+
         messages = self._get_messages(text)
         generated_text = self._generate(messages)
+
         return generated_text