Spaces: Running

Abhipsha Das committed: initial spaces deploy

Files changed:
- README.md +12 -7
- app.py +14 -0
- config.py +445 -0
- requirements.txt +8 -0
- scripts/__init__.py +6 -0
- scripts/__pycache__/__init__.cpython-311.pyc +0 -0
- scripts/__pycache__/create_db.cpython-311.pyc +0 -0
- scripts/__pycache__/run_db_interface.cpython-311.pyc +0 -0
- scripts/__pycache__/run_db_interface_improved.cpython-311.pyc +0 -0
- scripts/create_db.py +246 -0
- scripts/run_db_interface.py +704 -0
- scripts/run_db_interface_basic.py +361 -0
- scripts/run_db_interface_js.py +0 -0
README.md
CHANGED
@@ -1,14 +1,19 @@
 ---
-title: Surveyor
-emoji:
-colorFrom:
+title: Surveyor
+emoji: π
+colorFrom: blue
 colorTo: green
 sdk: gradio
-sdk_version:
+sdk_version: 4.40.0
 app_file: app.py
 pinned: false
-license: openrail
-short_description: Interface for exploring scientific concepts with KGs
 ---
 
-
+# Surveyor
+
+An interactive interface for querying and visualizing scientific paper databases with concept co-occurrence graphs.
+## Features
+- Interactive concept co-occurrence graphs
+- SQL query interface with pre-built queries
+- Support for multiple scientific domains
+- Graph filtering and highlighting
app.py
ADDED
@@ -0,0 +1,14 @@
import os
import sys

# Add the project root directory to Python path
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)

from scripts.run_db_interface import create_demo

demo = create_demo()

if __name__ == "__main__":
    demo.launch()
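For local development, the same entry point can be launched directly. A minimal sketch, assuming the dependencies pinned in requirements.txt are installed; the server_name/server_port values are conventional Spaces defaults, not taken from this commit:

```python
# Launch Surveyor locally; on Spaces this is unnecessary, since the
# `app_file: app.py` front-matter in README.md starts the app.
from app import demo

demo.launch(server_name="0.0.0.0", server_port=7860)  # assumed local settings
```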
config.py
ADDED
@@ -0,0 +1,445 @@
DEFAULT_MODEL_ID = "Meta-Llama-3-70B-Instruct"
DEFAULT_INTERFACE_MODEL_ID = "NumbersStation/nsql-llama-2-7B"
DEFAULT_KIND = "json"
DEFAULT_TEMPERATURE = 0.6
DEFAULT_TOP_P = 0.95
DEFAULT_FEW_SHOT_NUM = 3
DEFAULT_FEW_SHOT_SELECTION = "random"
DEFAULT_SAVE_INTERVAL = 3
DEFAULT_RES_DIR = "data/results"
DEFAULT_LOG_DIR = "logs"
DEFAULT_TABLES_DIR = "data/databases"

COOCCURRENCE_QUERY = """
WITH concept_pairs AS (
    SELECT p1.concept AS concept1, p2.concept AS concept2, p1.paper_id, p1.tag_type
    FROM predictions p1
    JOIN predictions p2 ON p1.paper_id = p2.paper_id AND p1.concept < p2.concept
    WHERE p1.tag_type = p2.tag_type
)
SELECT concept1, concept2, tag_type, COUNT(DISTINCT paper_id) AS co_occurrences
FROM concept_pairs
GROUP BY concept1, concept2, tag_type
HAVING co_occurrences > 5
ORDER BY co_occurrences DESC;
"""

canned_queries = [
    (
        "Modalities in Physics and Astronomy papers",
        """
        SELECT DISTINCT LOWER(concept) AS concept
        FROM predictions
        JOIN (
            SELECT paper_id, url
            FROM papers
            WHERE primary_category LIKE '%physics.space-ph%'
            OR primary_category LIKE '%astro-ph.%'
        ) AS paper_ids
        ON predictions.paper_id = paper_ids.paper_id
        WHERE predictions.tag_type = 'modality'
        """,
    ),
    (
        "Datasets in Evolutionary Biology that use PDEs",
        """
        WITH pde_predictions AS (
            SELECT paper_id, concept AS pde_concept, tag_type AS pde_tag_type
            FROM predictions
            WHERE tag_type IN ('method', 'model')
            AND (
                LOWER(concept) LIKE '%pde%'
                OR LOWER(concept) LIKE '%partial differential equation%'
            )
        )
        SELECT DISTINCT
            papers.paper_id,
            papers.url,
            LOWER(p_dataset.concept) AS dataset,
            pde_predictions.pde_concept AS pde_related_concept,
            pde_predictions.pde_tag_type AS pde_related_type
        FROM papers
        JOIN pde_predictions ON papers.paper_id = pde_predictions.paper_id
        LEFT JOIN predictions p_dataset ON papers.paper_id = p_dataset.paper_id
        WHERE papers.primary_category LIKE '%q-bio.PE%'
        AND (p_dataset.tag_type = 'dataset' OR p_dataset.tag_type IS NULL)
        ORDER BY papers.paper_id, dataset, pde_related_concept;
        """,
    ),
    (
        "Trends in objects of study in Cosmology since 2019",
        """
        SELECT
            substr(papers.updated_on, 2, 4) as year,
            predictions.concept as object,
            COUNT(DISTINCT papers.paper_id) as paper_count
        FROM
            papers
        JOIN
            predictions ON papers.paper_id = predictions.paper_id
        WHERE
            predictions.tag_type = 'object'
            AND CAST(SUBSTR(papers.updated_on, 2, 4) AS INTEGER) >= 2019
        GROUP BY
            year, object
        ORDER BY
            year DESC, paper_count DESC;
        """,
    ),
    (
        "New datasets in fluid dynamics since 2020",
        """
        WITH ranked_datasets AS (
            SELECT
                p.paper_id,
                p.url,
                pred.concept AS dataset,
                p.updated_on,
                ROW_NUMBER() OVER (PARTITION BY pred.concept ORDER BY p.updated_on ASC) AS rn
            FROM
                papers p
            JOIN
                predictions pred ON p.paper_id = pred.paper_id
            WHERE
                pred.tag_type = 'dataset'
                AND p.primary_category LIKE '%physics.flu-dyn%'
                AND CAST(SUBSTR(p.updated_on, 2, 4) AS INTEGER) >= 2020
        )
        SELECT
            paper_id,
            url,
            dataset,
            updated_on
        FROM
            ranked_datasets
        WHERE
            rn = 1
        ORDER BY
            updated_on ASC
        """,
    ),
    (
        "Evolutionary biology datasets that use spatiotemporal dynamics",
        """
        WITH evo_bio_papers AS (
            SELECT paper_id
            FROM papers
            WHERE primary_category LIKE '%q-bio.PE%'
        ),
        spatiotemporal_keywords AS (
            SELECT 'spatio-temporal' AS keyword
            UNION SELECT 'spatiotemporal'
            UNION SELECT 'spatio-temporal'
            UNION SELECT 'spatial and temporal'
            UNION SELECT 'space-time'
            UNION SELECT 'geographic distribution'
            UNION SELECT 'phylogeograph'
            UNION SELECT 'biogeograph'
            UNION SELECT 'dispersal'
            UNION SELECT 'migration'
            UNION SELECT 'range expansion'
            UNION SELECT 'population dynamics'
        )
        SELECT DISTINCT
            p.paper_id,
            p.updated_on,
            p.abstract,
            d.concept AS dataset,
            GROUP_CONCAT(DISTINCT stk.keyword) AS spatiotemporal_keywords_found
        FROM
            evo_bio_papers ebp
        JOIN
            papers p ON ebp.paper_id = p.paper_id
        JOIN
            predictions d ON p.paper_id = d.paper_id
        JOIN
            predictions st ON p.paper_id = st.paper_id
        JOIN
            spatiotemporal_keywords stk
        WHERE
            d.tag_type = 'dataset'
            AND st.tag_type = 'modality'
            AND LOWER(st.concept) LIKE '%' || stk.keyword || '%'
        GROUP BY
            p.paper_id, p.updated_on, p.abstract, d.concept
        ORDER BY
            p.updated_on DESC
        """,
    ),
    (
        "What percentage of papers use only galaxy or spectra, or both or neither?",
        """
        WITH paper_modalities AS (
            SELECT
                p.paper_id,
                MAX(CASE WHEN LOWER(pred.concept) LIKE '%imag%' THEN 1 ELSE 0 END) AS uses_galaxy_images,
                MAX(CASE WHEN LOWER(pred.concept) LIKE '%spectr%' THEN 1 ELSE 0 END) AS uses_spectra
            FROM
                papers p
            LEFT JOIN
                predictions pred ON p.paper_id = pred.paper_id
            WHERE
                p.primary_category LIKE '%astro-ph%'
                AND pred.tag_type = 'modality'
            GROUP BY
                p.paper_id
        ),
        categorized_papers AS (
            SELECT
                CASE
                    WHEN uses_galaxy_images = 1 AND uses_spectra = 1 THEN 'Both'
                    WHEN uses_galaxy_images = 1 THEN 'Only Galaxy Images'
                    WHEN uses_spectra = 1 THEN 'Only Spectra'
                    ELSE 'Neither'
                END AS category,
                COUNT(*) AS paper_count
            FROM
                paper_modalities
            GROUP BY
                CASE
                    WHEN uses_galaxy_images = 1 AND uses_spectra = 1 THEN 'Both'
                    WHEN uses_galaxy_images = 1 THEN 'Only Galaxy Images'
                    WHEN uses_spectra = 1 THEN 'Only Spectra'
                    ELSE 'Neither'
                END
        )
        SELECT
            category,
            paper_count,
            ROUND(CAST(paper_count AS FLOAT) / (SELECT SUM(paper_count) FROM categorized_papers) * 100, 2) AS percentage
        FROM
            categorized_papers
        ORDER BY
            paper_count DESC
        """,
    ),
    (
        "What are all the next highest data modalities after images and spectra?",
        """
        SELECT
            LOWER(concept) AS modality,
            COUNT(DISTINCT paper_id) AS usage_count
        FROM
            predictions
        WHERE
            tag_type = 'modality'
            AND LOWER(concept) NOT LIKE '%imag%'
            AND LOWER(concept) NOT LIKE '%spectr%'
        GROUP BY
            LOWER(concept)
        ORDER BY
            usage_count DESC
        """,
    ),
    (
        "If we include the next biggest data modality, how much does coverage change?",
        """
        WITH modality_counts AS (
            SELECT
                LOWER(concept) AS modality,
                COUNT(DISTINCT paper_id) AS usage_count
            FROM
                predictions
            WHERE
                tag_type = 'modality'
                AND LOWER(concept) NOT LIKE '%imag%'
                AND LOWER(concept) NOT LIKE '%spectr%'
            GROUP BY
                LOWER(concept)
            ORDER BY
                usage_count DESC
            LIMIT 1
        ),
        paper_modalities AS (
            SELECT
                p.paper_id,
                MAX(CASE WHEN LOWER(pred.concept) LIKE '%imag%' THEN 1 ELSE 0 END) AS uses_galaxy_images,
                MAX(CASE WHEN LOWER(pred.concept) LIKE '%spectr%' THEN 1 ELSE 0 END) AS uses_spectra,
                MAX(CASE WHEN LOWER(pred.concept) LIKE (SELECT '%' || modality || '%' FROM modality_counts) THEN 1 ELSE 0 END) AS uses_third_modality
            FROM
                papers p
            LEFT JOIN
                predictions pred ON p.paper_id = pred.paper_id
            WHERE
                p.primary_category LIKE '%astro-ph%'
                AND pred.tag_type = 'modality'
            GROUP BY
                p.paper_id
        ),
        coverage_before AS (
            SELECT
                SUM(CASE WHEN uses_galaxy_images = 1 OR uses_spectra = 1 THEN 1 ELSE 0 END) AS covered_papers,
                COUNT(*) AS total_papers
            FROM
                paper_modalities
        ),
        coverage_after AS (
            SELECT
                SUM(CASE WHEN uses_galaxy_images = 1 OR uses_spectra = 1 OR uses_third_modality = 1 THEN 1 ELSE 0 END) AS covered_papers,
                COUNT(*) AS total_papers
            FROM
                paper_modalities
        )
        SELECT
            (SELECT modality FROM modality_counts) AS third_modality,
            ROUND(CAST(covered_papers AS FLOAT) / total_papers * 100, 2) AS coverage_before_percent,
            ROUND(CAST((SELECT covered_papers FROM coverage_after) AS FLOAT) / total_papers * 100, 2) AS coverage_after_percent,
            ROUND(CAST((SELECT covered_papers FROM coverage_after) AS FLOAT) / total_papers * 100, 2) -
            ROUND(CAST(covered_papers AS FLOAT) / total_papers * 100, 2) AS coverage_increase_percent
        FROM
            coverage_before
        """,
    ),
    (
        "Coverage if we select the next 5 highest modalities?",
        """
        WITH ranked_modalities AS (
            SELECT
                LOWER(concept) AS modality,
                COUNT(DISTINCT paper_id) AS usage_count,
                ROW_NUMBER() OVER (ORDER BY COUNT(DISTINCT paper_id) DESC) AS rank
            FROM
                predictions
            WHERE
                tag_type = 'modality'
                AND LOWER(concept) NOT LIKE '%imag%'
                AND LOWER(concept) NOT LIKE '%spectr%'
            GROUP BY
                LOWER(concept)
        ),
        paper_modalities AS (
            SELECT
                p.paper_id,
                MAX(CASE WHEN LOWER(pred.concept) LIKE '%imag%' THEN 1 ELSE 0 END) AS uses_images,
                MAX(CASE WHEN LOWER(pred.concept) LIKE '%spectr%' THEN 1 ELSE 0 END) AS uses_spectra,
                MAX(CASE WHEN rm.rank = 1 THEN 1 ELSE 0 END) AS uses_modality_1,
                MAX(CASE WHEN rm.rank = 2 THEN 1 ELSE 0 END) AS uses_modality_2,
                MAX(CASE WHEN rm.rank = 3 THEN 1 ELSE 0 END) AS uses_modality_3,
                MAX(CASE WHEN rm.rank = 4 THEN 1 ELSE 0 END) AS uses_modality_4,
                MAX(CASE WHEN rm.rank = 5 THEN 1 ELSE 0 END) AS uses_modality_5
            FROM
                papers p
            LEFT JOIN
                predictions pred ON p.paper_id = pred.paper_id
            LEFT JOIN
                ranked_modalities rm ON LOWER(pred.concept) = rm.modality
            WHERE
                p.primary_category LIKE '%astro-ph%'
                AND pred.tag_type = 'modality'
            GROUP BY
                p.paper_id
        ),
        cumulative_coverage AS (
            SELECT
                'Images and Spectra' AS modalities,
                0 AS added_modality_rank,
                SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 THEN 1 ELSE 0 END) AS covered_papers,
                COUNT(*) AS total_papers
            FROM
                paper_modalities

            UNION ALL

            SELECT
                'Images, Spectra, and Modality 1' AS modalities,
                1 AS added_modality_rank,
                SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 THEN 1 ELSE 0 END) AS covered_papers,
                COUNT(*) AS total_papers
            FROM
                paper_modalities

            UNION ALL

            SELECT
                'Images, Spectra, Modality 1, and 2' AS modalities,
                2 AS added_modality_rank,
                SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 THEN 1 ELSE 0 END) AS covered_papers,
                COUNT(*) AS total_papers
            FROM
                paper_modalities

            UNION ALL

            SELECT
                'Images, Spectra, Modality 1, 2, and 3' AS modalities,
                3 AS added_modality_rank,
                SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 OR uses_modality_3 = 1 THEN 1 ELSE 0 END) AS covered_papers,
                COUNT(*) AS total_papers
            FROM
                paper_modalities

            UNION ALL

            SELECT
                'Images, Spectra, Modality 1, 2, 3, and 4' AS modalities,
                4 AS added_modality_rank,
                SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 OR uses_modality_3 = 1 OR uses_modality_4 = 1 THEN 1 ELSE 0 END) AS covered_papers,
                COUNT(*) AS total_papers
            FROM
                paper_modalities

            UNION ALL

            SELECT
                'Images, Spectra, Modality 1, 2, 3, 4, and 5' AS modalities,
                5 AS added_modality_rank,
                SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 OR uses_modality_3 = 1 OR uses_modality_4 = 1 OR uses_modality_5 = 1 THEN 1 ELSE 0 END) AS covered_papers,
                COUNT(*) AS total_papers
            FROM
                paper_modalities
        )
        SELECT
            cc.modalities,
            COALESCE(rm.modality, 'N/A') AS added_modality,
            rm.usage_count AS added_modality_usage,
            ROUND(CAST(cc.covered_papers AS FLOAT) / cc.total_papers * 100, 2) AS coverage_percent,
            ROUND(CAST(cc.covered_papers AS FLOAT) / cc.total_papers * 100, 2) -
            LAG(ROUND(CAST(cc.covered_papers AS FLOAT) / cc.total_papers * 100, 2), 1, 0) OVER (ORDER BY cc.added_modality_rank) AS coverage_increase_percent
        FROM
            cumulative_coverage cc
        LEFT JOIN
            ranked_modalities rm ON cc.added_modality_rank = rm.rank
        ORDER BY
            cc.added_modality_rank
        """,
    ),
    (
        "List all papers",
        "SELECT paper_id, abstract AS abstract_preview, authors, primary_category FROM papers",
    ),
    (
        "Count papers by category",
        "SELECT primary_category, COUNT(*) as paper_count FROM papers GROUP BY primary_category ORDER BY paper_count DESC",
    ),
    (
        "Top authors with most papers",
        """
        WITH author_papers AS (
            SELECT json_each.value AS author
            FROM papers, json_each(papers.authors)
        )
        SELECT author, COUNT(*) as paper_count
        FROM author_papers
        GROUP BY author
        ORDER BY paper_count DESC
        """,
    ),
    (
        "Papers with 'quantum' in abstract",
        "SELECT paper_id, abstract AS abstract_preview FROM papers WHERE abstract LIKE '%quantum%'",
    ),
    (
        "Most common concepts",
        "SELECT concept, COUNT(*) as concept_count FROM predictions GROUP BY concept ORDER BY concept_count DESC",
    ),
    (
        "Papers with multiple authors",
        """
        SELECT paper_id, json_array_length(authors) as author_count, authors
        FROM papers
        WHERE json_array_length(authors) > 1
        ORDER BY author_count DESC
        """,
    ),
]
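The canned queries above are plain (description, SQL) pairs, so they can also be run outside the interface. A minimal sketch, assuming a populated SQLite file at data/databases/arxiv.db (a hypothetical path) with the papers/predictions schema that scripts/create_db.py creates:

```python
import sqlite3

import pandas as pd

from config import COOCCURRENCE_QUERY, canned_queries

# Hypothetical path; any database built by scripts/create_db.py works here.
conn = sqlite3.connect("data/databases/arxiv.db")

# Concept pairs co-occurring in more than five papers, grouped by tag type.
edges = pd.read_sql_query(COOCCURRENCE_QUERY, conn)
print(edges.head())

# canned_queries maps cleanly onto a dict keyed by description.
sql = dict(canned_queries)["Count papers by category"]
print(pd.read_sql_query(sql, conn))

conn.close()
```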
requirements.txt
ADDED
@@ -0,0 +1,8 @@
gradio==4.40.0
networkx==3.3
pandas==2.2.2
plotly==5.23.0
tabulate==0.9.0
fastapi==0.104.1
pydantic==2.5.3
uvicorn==0.27.1
scripts/__init__.py
ADDED
@@ -0,0 +1,6 @@
import os
import sys

ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)
scripts/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (566 Bytes)

scripts/__pycache__/create_db.cpython-311.pyc
ADDED
Binary file (14.6 kB)

scripts/__pycache__/run_db_interface.cpython-311.pyc
ADDED
Binary file (29 kB)

scripts/__pycache__/run_db_interface_improved.cpython-311.pyc
ADDED
Binary file (29.2 kB)
scripts/create_db.py
ADDED
@@ -0,0 +1,246 @@
import click
import json
import os
import sqlite3
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from config import DEFAULT_TABLES_DIR, DEFAULT_MODEL_ID, DEFAULT_INTERFACE_MODEL_ID
from src.processing.generate import get_sentences, generate_prediction
from src.utils.utils import load_model_and_tokenizer


class ArxivDatabase:
    def __init__(self, db_path, model_id=None):
        self.conn = None
        self.cursor = None
        self.db_path = db_path
        self.model_id = model_id if model_id else DEFAULT_INTERFACE_MODEL_ID
        self.model = None
        self.tokenizer = None
        self.is_db_empty = True
        self.paper_table = """CREATE TABLE IF NOT EXISTS papers
            (paper_id TEXT PRIMARY KEY, abstract TEXT, authors TEXT,
            primary_category TEXT, url TEXT, updated_on TEXT, sentence_count INTEGER)"""
        self.pred_table = """CREATE TABLE IF NOT EXISTS predictions
            (id INTEGER PRIMARY KEY AUTOINCREMENT, paper_id TEXT, sentence_index INTEGER,
            tag_type TEXT, concept TEXT,
            FOREIGN KEY (paper_id) REFERENCES papers(paper_id))"""

    # def init_db(self):
    #     self.cursor.execute(self.paper_table)
    #     self.cursor.execute(self.pred_table)

    #     print("Database and tables created successfully.")
    #     self.is_db_empty = self.is_empty()

    def init_db(self):
        self.conn = sqlite3.connect(self.db_path)
        self.cursor = self.conn.cursor()
        self.cursor.execute(self.paper_table)
        self.cursor.execute(self.pred_table)
        self.conn.commit()
        self.is_db_empty = self.is_empty()
        if not self.is_db_empty:
            print("Database already contains data.")
        else:
            print("Database and tables created successfully.")

    def is_empty(self):
        try:
            self.cursor.execute("SELECT COUNT(*) FROM papers")
            count = self.cursor.fetchone()[0]
            return count == 0
        except sqlite3.OperationalError:
            return True

    def get_connection(self):
        # Open a fresh connection to the same database file
        # (fixed: sqlite3.Connection has no .path attribute).
        return sqlite3.connect(self.db_path)

    def populate_db(self, data_path, pred_path):
        papers_info = self._insert_papers(data_path)
        self._insert_predictions(pred_path, papers_info)
        print("Database population completed.")

    def _insert_papers(self, data_path):
        papers_info = []
        seen_papers = set()
        with open(data_path, "r") as f:
            for line in f:
                paper = json.loads(line)
                if paper["id"] in seen_papers:
                    continue
                seen_papers.add(paper["id"])
                sentence_count = len(get_sentences(paper["id"])) + len(
                    get_sentences(paper["abstract"])
                )
                papers_info.append((paper["id"], sentence_count))
                self.cursor.execute(
                    """INSERT OR REPLACE INTO papers VALUES (?, ?, ?, ?, ?, ?, ?)""",
                    (
                        paper["id"],
                        paper["abstract"],
                        json.dumps(paper["authors"]),
                        json.dumps(paper["primary_category"]),
                        json.dumps(paper["url"]),
                        json.dumps(paper["updated"]),
                        sentence_count,
                    ),
                )
        print(f"Inserted {len(papers_info)} papers.")
        return papers_info

    def _insert_predictions(self, pred_path, papers_info):
        with open(pred_path, "r") as f:
            predictions = json.load(f)
        predicted_tags = predictions["predicted_tags"]

        k = 0
        papers_with_predictions = set()
        papers_without_predictions = []
        for paper_id, sentence_count in papers_info:
            paper_predictions = predicted_tags[k : k + sentence_count]

            has_predictions = False
            for sentence_index, pred in enumerate(paper_predictions):
                if pred:  # If the prediction is not an empty dictionary
                    has_predictions = True
                    for tag_type, concepts in pred.items():
                        for concept in concepts:
                            self.cursor.execute(
                                """INSERT INTO predictions (paper_id, sentence_index, tag_type, concept)
                                VALUES (?, ?, ?, ?)""",
                                (paper_id, sentence_index, tag_type, concept),
                            )
                else:
                    # Insert a null prediction to ensure the paper is counted
                    self.cursor.execute(
                        """INSERT INTO predictions (paper_id, sentence_index, tag_type, concept)
                        VALUES (?, ?, ?, ?)""",
                        (paper_id, sentence_index, "null", "null"),
                    )

            if has_predictions:
                papers_with_predictions.add(paper_id)
            else:
                papers_without_predictions.append(paper_id)

            k += sentence_count

        print(f"Inserted predictions for {len(papers_with_predictions)} papers.")
        print(f"Papers without any predictions: {len(papers_without_predictions)}")

        if k < len(predicted_tags):
            print(f"Warning: {len(predicted_tags) - k} predictions were not inserted.")

    def load_model(self):
        if self.model is None:
            try:
                self.model, self.tokenizer = load_model_and_tokenizer(self.model_id)
                return f"Model {self.model_id} loaded successfully."
            except Exception as e:
                return f"Error loading model: {str(e)}"
        else:
            return "Model is already loaded."

    def natural_language_to_sql(self, question):
        system_prompt = "You are an assistant who converts natural language questions to SQL queries to query a database of scientific papers."
        table = self.paper_table + "; " + self.pred_table
        prefix = (
            f"[INST] Write SQLite query to answer the following question given the database schema. Please wrap your code answer using "
            f"```: Schema: {table} Question: {question}[/INST] Here is the SQLite query to answer to the question: {question}: ``` "
        )

        sql_query = generate_prediction(
            self.model, self.tokenizer, prefix, question, "sql", system_prompt
        )

        sql_query = sql_query.split("```")[1]

        return sql_query

    def execute_query(self, sql_query):
        try:
            self.cursor.execute(sql_query)
            results = self.cursor.fetchall()
            return results if results else []
        except sqlite3.Error as e:
            return [(f"An error occurred: {e}",)]

    def query_db(self, question, is_sql):
        if self.is_db_empty:
            return "The database is empty. Please populate it with data first."

        try:
            if is_sql:
                sql_query = question.strip()
            else:
                nl_to_sql = self.natural_language_to_sql(question)
                sql_query = nl_to_sql.replace("```sql", "").replace("```", "").strip()

            results = self.execute_query(sql_query)

            output = f"SQL Query: {sql_query}\n\nResults:\n"
            if isinstance(results, list):
                if len(results) > 0:
                    for row in results:
                        output += str(row) + "\n"
                else:
                    output += "No results found."
            else:
                output += str(results)  # In case of an error message

            return output
        except Exception as e:
            return f"An error occurred: {str(e)}"

    def close(self):
        self.conn.commit()
        self.conn.close()


def check_db_exists(db_path):
    return os.path.exists(db_path) and os.path.getsize(db_path) > 0


@click.command()
@click.option(
    "--data_path", help="Path to the data file containing the papers information."
)
@click.option("--pred_path", help="Path to the predictions file.")
@click.option("--db_name", default="arxiv.db", help="Name of the database to create.")
@click.option(
    "--force", is_flag=True, help="Force overwrite if database already exists"
)
def main(data_path, pred_path, db_name, force):
    ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    tables_dir = os.path.join(ROOT, DEFAULT_TABLES_DIR)
    os.makedirs(tables_dir, exist_ok=True)
    db_path = os.path.join(tables_dir, db_name)

    db_exists = check_db_exists(db_path)

    db = ArxivDatabase(db_path)
    db.init_db()

    if db_exists and not db.is_db_empty:
        if not force:
            print(f"Warning: The database '{db_name}' already exists and is not empty.")
            overwrite = input("Do you want to overwrite it? (y/N): ").lower().strip()
            if overwrite != "y":
                print("Operation cancelled.")
                db.close()
                return
        else:
            print(
                f"Warning: Overwriting existing database '{db_name}' due to --force flag."
            )

    db.populate_db(data_path, pred_path)
    db.close()

    print(f"Database created and populated at: {db_path}")


if __name__ == "__main__":
    main()
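A minimal programmatic sketch of the same flow main() drives; the input paths are hypothetical, and populate_db expects a JSON-lines papers file plus a predictions JSON containing a predicted_tags list:

```python
from scripts.create_db import ArxivDatabase

# Hypothetical inputs; the CLI equivalent is:
#   python scripts/create_db.py --data_path papers.jsonl --pred_path preds.json
db = ArxivDatabase("data/databases/arxiv.db")
db.init_db()  # creates the papers/predictions tables if they do not exist
if db.is_db_empty:
    db.populate_db("papers.jsonl", "preds.json")
db.close()
```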
scripts/run_db_interface.py
ADDED
@@ -0,0 +1,704 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import os
|
3 |
+
import json
|
4 |
+
import networkx as nx
|
5 |
+
import pandas as pd
|
6 |
+
import plotly.graph_objects as go
|
7 |
+
import re
|
8 |
+
import sys
|
9 |
+
import sqlite3
|
10 |
+
import tempfile
|
11 |
+
import time
|
12 |
+
import uvicorn
|
13 |
+
|
14 |
+
from contextlib import contextmanager
|
15 |
+
from fastapi import FastAPI, Request
|
16 |
+
from fastapi.middleware.cors import CORSMiddleware
|
17 |
+
from gradio.routes import mount_gradio_app
|
18 |
+
from plotly.subplots import make_subplots
|
19 |
+
from tabulate import tabulate
|
20 |
+
from typing import Optional
|
21 |
+
|
22 |
+
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
23 |
+
if ROOT_DIR not in sys.path:
|
24 |
+
sys.path.insert(0, ROOT_DIR)
|
25 |
+
|
26 |
+
from scripts.create_db import ArxivDatabase
|
27 |
+
from config import (
|
28 |
+
DEFAULT_TABLES_DIR,
|
29 |
+
DEFAULT_INTERFACE_MODEL_ID,
|
30 |
+
COOCCURRENCE_QUERY,
|
31 |
+
canned_queries,
|
32 |
+
)
|
33 |
+
|
34 |
+
app = FastAPI()
|
35 |
+
|
36 |
+
# Add CORS middleware
|
37 |
+
app.add_middleware(
|
38 |
+
CORSMiddleware,
|
39 |
+
allow_origins=["*"],
|
40 |
+
allow_credentials=True,
|
41 |
+
allow_methods=["*"],
|
42 |
+
allow_headers=["*"],
|
43 |
+
)
|
44 |
+
|
45 |
+
db: Optional[ArxivDatabase] = None
|
46 |
+
|
47 |
+
last_update_time = 0
|
48 |
+
update_delay = 0.5 # Delay in seconds
|
49 |
+
|
50 |
+
|
51 |
+
def truncate_or_wrap_text(text, max_length=50, wrap=False):
|
52 |
+
"""Truncate text to a maximum length, adding ellipsis if truncated, or wrap if specified."""
|
53 |
+
if wrap:
|
54 |
+
return "\n".join(
|
55 |
+
text[i : i + max_length] for i in range(0, len(text), max_length)
|
56 |
+
)
|
57 |
+
return text[:max_length] + "..." if len(text) > max_length else text
|
58 |
+
|
59 |
+
|
60 |
+
def format_url(url):
|
61 |
+
"""Format URL to be more compact in the table."""
|
62 |
+
return url.split("/")[-1] if url.startswith("http") else url
|
63 |
+
|
64 |
+
|
65 |
+
def get_db_path():
|
66 |
+
"""Get the database directory path based on environment"""
|
67 |
+
# First try local path
|
68 |
+
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
69 |
+
tables_dir = os.path.join(ROOT, DEFAULT_TABLES_DIR)
|
70 |
+
|
71 |
+
if not os.path.exists(tables_dir):
|
72 |
+
# If running on Spaces, try the root directory
|
73 |
+
tables_dir = os.path.join(ROOT, "data", "databases")
|
74 |
+
if not os.path.exists(tables_dir):
|
75 |
+
print(f"No database directory found")
|
76 |
+
return None
|
77 |
+
|
78 |
+
print(f"Using database directory: {tables_dir}")
|
79 |
+
return tables_dir
|
80 |
+
|
81 |
+
|
82 |
+
def get_available_databases():
|
83 |
+
"""Get available databases from either local path or Hugging Face cache."""
|
84 |
+
tables_dir = get_db_path()
|
85 |
+
if not tables_dir:
|
86 |
+
return []
|
87 |
+
|
88 |
+
files = os.listdir(tables_dir)
|
89 |
+
print(f"All files found: {files}")
|
90 |
+
|
91 |
+
# Include all files except .md files
|
92 |
+
databases = [f for f in files if not f.endswith(".md")]
|
93 |
+
print(f"Database files: {databases}")
|
94 |
+
|
95 |
+
return databases
|
96 |
+
|
97 |
+
|
98 |
+
def query_db(query, is_sql, limit=None, wrap=False):
|
99 |
+
global db
|
100 |
+
if db is None:
|
101 |
+
return pd.DataFrame({"Error": ["Please load a database first."]})
|
102 |
+
|
103 |
+
try:
|
104 |
+
with sqlite3.connect(db.db_path) as conn:
|
105 |
+
cursor = conn.cursor()
|
106 |
+
|
107 |
+
query = " ".join(query.strip().split("\n")).rstrip(";")
|
108 |
+
|
109 |
+
if limit is not None:
|
110 |
+
if "LIMIT" in query.upper():
|
111 |
+
# Replace existing LIMIT clause
|
112 |
+
query = re.sub(
|
113 |
+
r"LIMIT\s+\d+", f"LIMIT {limit}", query, flags=re.IGNORECASE
|
114 |
+
)
|
115 |
+
else:
|
116 |
+
query += f" LIMIT {limit}"
|
117 |
+
|
118 |
+
cursor.execute(query)
|
119 |
+
|
120 |
+
column_names = [description[0] for description in cursor.description]
|
121 |
+
|
122 |
+
results = cursor.fetchall()
|
123 |
+
|
124 |
+
df = pd.DataFrame(results, columns=column_names)
|
125 |
+
|
126 |
+
for column in df.columns:
|
127 |
+
if df[column].dtype == "object":
|
128 |
+
df[column] = df[column].apply(
|
129 |
+
lambda x: (
|
130 |
+
format_url(x)
|
131 |
+
if column == "url"
|
132 |
+
else truncate_or_wrap_text(x, wrap=wrap)
|
133 |
+
)
|
134 |
+
)
|
135 |
+
|
136 |
+
return df
|
137 |
+
|
138 |
+
except sqlite3.Error as e:
|
139 |
+
return pd.DataFrame({"Error": [f"Database error: {str(e)}"]})
|
140 |
+
except Exception as e:
|
141 |
+
return pd.DataFrame({"Error": [f"An unexpected error occurred: {str(e)}"]})
|
142 |
+
|
143 |
+
|
144 |
+
def generate_concept_cooccurrence_graph(db_path, tag_type=None):
|
145 |
+
conn = sqlite3.connect(db_path)
|
146 |
+
|
147 |
+
query = COOCCURRENCE_QUERY
|
148 |
+
if tag_type and tag_type != "All":
|
149 |
+
query = query.replace(
|
150 |
+
"WHERE p1.tag_type = p2.tag_type",
|
151 |
+
f"WHERE p1.tag_type = p2.tag_type AND p1.tag_type = '{tag_type}'",
|
152 |
+
)
|
153 |
+
|
154 |
+
df = pd.read_sql_query(query, conn)
|
155 |
+
conn.close()
|
156 |
+
|
157 |
+
G = nx.from_pandas_edgelist(df, "concept1", "concept2", "co_occurrences")
|
158 |
+
pos = nx.spring_layout(G, k=0.5, iterations=50)
|
159 |
+
|
160 |
+
edge_trace = go.Scatter(
|
161 |
+
x=[], y=[], line=dict(width=0.5, color="#888"), hoverinfo="none", mode="lines"
|
162 |
+
)
|
163 |
+
|
164 |
+
node_trace = go.Scatter(
|
165 |
+
x=[],
|
166 |
+
y=[],
|
167 |
+
mode="markers",
|
168 |
+
hoverinfo="text",
|
169 |
+
marker=dict(
|
170 |
+
showscale=True,
|
171 |
+
colorscale="YlGnBu",
|
172 |
+
size=10,
|
173 |
+
colorbar=dict(
|
174 |
+
thickness=15,
|
175 |
+
title="Node Connections",
|
176 |
+
xanchor="left",
|
177 |
+
titleside="right",
|
178 |
+
),
|
179 |
+
),
|
180 |
+
)
|
181 |
+
|
182 |
+
def update_traces(selected_node=None, depth=0):
|
183 |
+
nonlocal edge_trace, node_trace
|
184 |
+
|
185 |
+
if selected_node and depth > 0:
|
186 |
+
nodes_to_show = set([selected_node])
|
187 |
+
frontier = set([selected_node])
|
188 |
+
for _ in range(depth):
|
189 |
+
new_frontier = set()
|
190 |
+
for node in frontier:
|
191 |
+
new_frontier.update(G.neighbors(node))
|
192 |
+
nodes_to_show.update(new_frontier)
|
193 |
+
frontier = new_frontier
|
194 |
+
sub_G = G.subgraph(nodes_to_show)
|
195 |
+
else:
|
196 |
+
sub_G = G
|
197 |
+
|
198 |
+
edge_x, edge_y = [], []
|
199 |
+
for edge in sub_G.edges():
|
200 |
+
x0, y0 = pos[edge[0]]
|
201 |
+
x1, y1 = pos[edge[1]]
|
202 |
+
edge_x.extend([x0, x1, None])
|
203 |
+
edge_y.extend([y0, y1, None])
|
204 |
+
|
205 |
+
edge_trace.x = edge_x
|
206 |
+
edge_trace.y = edge_y
|
207 |
+
|
208 |
+
node_x, node_y = [], []
|
209 |
+
for node in sub_G.nodes():
|
210 |
+
x, y = pos[node]
|
211 |
+
node_x.append(x)
|
212 |
+
node_y.append(y)
|
213 |
+
|
214 |
+
node_trace.x = node_x
|
215 |
+
node_trace.y = node_y
|
216 |
+
|
217 |
+
node_adjacencies = []
|
218 |
+
node_text = []
|
219 |
+
for node in sub_G.nodes():
|
220 |
+
adjacencies = list(G.adj[node])
|
221 |
+
node_adjacencies.append(len(adjacencies))
|
222 |
+
node_text.append(f"{node}<br># of connections: {len(adjacencies)}")
|
223 |
+
|
224 |
+
node_trace.marker.color = node_adjacencies
|
225 |
+
node_trace.text = node_text
|
226 |
+
|
227 |
+
update_traces()
|
228 |
+
|
229 |
+
fig = go.Figure(
|
230 |
+
data=[edge_trace, node_trace],
|
231 |
+
layout=go.Layout(
|
232 |
+
title=f'Concept Co-occurrence Network {f"({tag_type})" if tag_type and tag_type != "All" else ""}',
|
233 |
+
titlefont_size=16,
|
234 |
+
showlegend=False,
|
235 |
+
hovermode="closest",
|
236 |
+
margin=dict(b=20, l=5, r=5, t=40),
|
237 |
+
annotations=[
|
238 |
+
dict(
|
239 |
+
text="",
|
240 |
+
showarrow=False,
|
241 |
+
xref="paper",
|
242 |
+
yref="paper",
|
243 |
+
x=0.005,
|
244 |
+
y=-0.002,
|
245 |
+
)
|
246 |
+
],
|
247 |
+
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
|
248 |
+
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
|
249 |
+
),
|
250 |
+
)
|
251 |
+
|
252 |
+
fig.update_layout(
|
253 |
+
updatemenus=[
|
254 |
+
dict(
|
255 |
+
type="buttons",
|
256 |
+
direction="left",
|
257 |
+
buttons=[
|
258 |
+
dict(
|
259 |
+
args=[{"visible": [True, True]}],
|
260 |
+
label="Full Graph",
|
261 |
+
method="update",
|
262 |
+
),
|
263 |
+
dict(
|
264 |
+
args=[
|
265 |
+
{
|
266 |
+
"visible": [True, True],
|
267 |
+
"xaxis.range": [-1, 1],
|
268 |
+
"yaxis.range": [-1, 1],
|
269 |
+
}
|
270 |
+
],
|
271 |
+
label="Core View",
|
272 |
+
method="relayout",
|
273 |
+
),
|
274 |
+
dict(
|
275 |
+
args=[
|
276 |
+
{
|
277 |
+
"visible": [True, True],
|
278 |
+
"xaxis.range": [-0.2, 0.2],
|
279 |
+
"yaxis.range": [-0.2, 0.2],
|
280 |
+
}
|
281 |
+
],
|
282 |
+
label="Detailed View",
|
283 |
+
method="relayout",
|
284 |
+
),
|
285 |
+
],
|
286 |
+
pad={"r": 10, "t": 10},
|
287 |
+
showactive=True,
|
288 |
+
x=0.11,
|
289 |
+
xanchor="left",
|
290 |
+
y=1.1,
|
291 |
+
yanchor="top",
|
292 |
+
),
|
293 |
+
]
|
294 |
+
)
|
295 |
+
|
296 |
+
return fig, G, pos, update_traces
|
297 |
+
|
298 |
+
|
299 |
+
def load_database_with_graphs(db_name):
|
300 |
+
"""Load database from either local path or Hugging Face cache."""
|
301 |
+
global db
|
302 |
+
tables_dir = get_db_path()
|
303 |
+
if not tables_dir:
|
304 |
+
return f"No database directory found.", None
|
305 |
+
|
306 |
+
db_path = os.path.join(tables_dir, db_name)
|
307 |
+
if not os.path.exists(db_path):
|
308 |
+
return f"Database {db_name} does not exist.", None
|
309 |
+
|
310 |
+
db = ArxivDatabase(db_path)
|
311 |
+
db.init_db()
|
312 |
+
|
313 |
+
if db.is_db_empty:
|
314 |
+
return (
|
315 |
+
f"Database loaded from {db_path}, but it is empty. Please populate it with data.",
|
316 |
+
None,
|
317 |
+
)
|
318 |
+
|
319 |
+
graph, _, _, _ = generate_concept_cooccurrence_graph(db_path)
|
320 |
+
return f"Database loaded from {db_path}", graph
|
321 |
+
|
322 |
+
|
323 |
+
css = """
|
324 |
+
#selected-query {
|
325 |
+
max-height: 100px;
|
326 |
+
overflow-y: auto;
|
327 |
+
white-space: pre-wrap;
|
328 |
+
word-break: break-word;
|
329 |
+
}
|
330 |
+
"""
|
331 |
+
|
332 |
+
|
333 |
+
def create_demo():
|
334 |
+
with gr.Blocks() as demo:
|
335 |
+
gr.Markdown("# ArXiv Database Query Interface")
|
336 |
+
|
337 |
+
with gr.Row():
|
338 |
+
db_dropdown = gr.Dropdown(
|
339 |
+
choices=get_available_databases(),
|
340 |
+
label="Select Database",
|
341 |
+
value=get_available_databases(),
|
342 |
+
)
|
343 |
+
# load_db_btn = gr.Button("Load Database", size="sm")
|
344 |
+
status = gr.Textbox(label="Status")
|
345 |
+
|
346 |
+
with gr.Row():
|
347 |
+
graph_output = gr.Plot(label="Concept Co-occurrence Graph")
|
348 |
+
|
349 |
+
with gr.Row():
|
350 |
+
tag_type_dropdown = gr.Dropdown(
|
351 |
+
choices=[
|
352 |
+
"All",
|
353 |
+
"model",
|
354 |
+
"task",
|
355 |
+
"dataset",
|
356 |
+
"field",
|
357 |
+
"modality",
|
358 |
+
"method",
|
359 |
+
"object",
|
360 |
+
"property",
|
361 |
+
"instrument",
|
362 |
+
],
|
363 |
+
label="Select Tag Type",
|
364 |
+
value="All",
|
365 |
+
)
|
366 |
+
highlight_input = gr.Textbox(label="Highlight Concepts (comma-separated)")
|
367 |
+
|
368 |
+
with gr.Row():
|
369 |
+
node_dropdown = gr.Dropdown(label="Select Node", choices=[])
|
370 |
+
depth_slider = gr.Slider(
|
371 |
+
minimum=0, maximum=5, step=1, value=0, label="Connection Depth"
|
372 |
+
)
|
373 |
+
update_graph_button = gr.Button("Update Graph")
|
374 |
+
|
375 |
+
with gr.Row():
|
376 |
+
wrap_checkbox = gr.Checkbox(label="Wrap long text", value=False)
|
377 |
+
canned_query_dropdown = gr.Dropdown(
|
378 |
+
choices=[q[0] for q in canned_queries], label="Select Query", scale=3
|
379 |
+
)
|
380 |
+
limit_input = gr.Number(
|
381 |
+
label="Limit", value=10000, step=1, minimum=1, scale=1
|
382 |
+
)
|
383 |
+
selected_query = gr.Textbox(
|
384 |
+
label="Selected Query",
|
385 |
+
interactive=False,
|
386 |
+
scale=2,
|
387 |
+
show_label=True,
|
388 |
+
show_copy_button=True,
|
389 |
+
elem_id="selected-query",
|
390 |
+
)
|
391 |
+
canned_query_submit = gr.Button("Submit Query", size="sm", scale=1)
|
392 |
+
|
393 |
+
with gr.Row():
|
394 |
+
sql_input = gr.Textbox(label="Custom SQL Query", lines=3, scale=4)
|
395 |
+
sql_submit = gr.Button("Submit Custom SQL", size="sm", scale=1)
|
396 |
+
|
397 |
+
# with gr.Row():
|
398 |
+
# nl_query_input = gr.Textbox(
|
399 |
+
# label="Natural Language Query", lines=2, scale=4
|
400 |
+
# )
|
401 |
+
# nl_query_submit = gr.Button("Convert to SQL", size="sm", scale=1)
|
402 |
+
|
403 |
+
output = gr.DataFrame(label="Results", wrap=True)
|
404 |
+
|
405 |
+
with gr.Row():
|
406 |
+
copy_button = gr.Button("Copy as Markdown")
|
407 |
+
download_button = gr.Button("Download as CSV")
|
408 |
+
|
409 |
+
def debounced_update_graph(
|
410 |
+
db_name, tag_type, highlight_concepts, selected_node, depth
|
411 |
+
):
|
412 |
+
global last_update_time
|
413 |
+
|
414 |
+
current_time = time.time()
|
415 |
+
if current_time - last_update_time < update_delay:
|
416 |
+
return None, [] # Return early if not enough time has passed
|
417 |
+
|
418 |
+
last_update_time = current_time
|
419 |
+
|
420 |
+
if not db_name:
|
421 |
+
return None, []
|
422 |
+
|
423 |
+
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
424 |
+
db_path = os.path.join(ROOT, DEFAULT_TABLES_DIR, db_name)
|
425 |
+
fig, G, pos, update_traces = generate_concept_cooccurrence_graph(
|
426 |
+
db_path, tag_type
|
427 |
+
)
|
428 |
+
|
429 |
+
if isinstance(selected_node, list):
|
430 |
+
selected_node = selected_node[0] if selected_node else None
|
431 |
+
|
432 |
+
highlight_nodes = (
|
433 |
+
[node.strip() for node in highlight_concepts.split(",")]
|
434 |
+
if highlight_concepts
|
435 |
+
else []
|
436 |
+
)
|
437 |
+
primary_node = highlight_nodes[0] if highlight_nodes else None
|
438 |
+
|
439 |
+
if primary_node and primary_node in G.nodes():
|
440 |
+
# Apply node selection and depth filter
|
441 |
+
nodes_to_show = set([primary_node])
|
442 |
+
if depth > 0:
|
443 |
+
frontier = set([primary_node])
|
444 |
+
for _ in range(depth):
|
445 |
+
new_frontier = set()
|
446 |
+
for node in frontier:
|
447 |
+
new_frontier.update(G.neighbors(node))
|
448 |
+
nodes_to_show.update(new_frontier)
|
449 |
+
frontier = new_frontier
|
450 |
+
|
451 |
+
sub_G = G.subgraph(nodes_to_show)
|
452 |
+
|
453 |
+
# Update traces with the filtered graph
|
454 |
+
edge_x, edge_y = [], []
|
455 |
+
for edge in sub_G.edges():
|
456 |
+
x0, y0 = pos[edge[0]]
|
457 |
+
x1, y1 = pos[edge[1]]
|
458 |
+
edge_x.extend([x0, x1, None])
|
459 |
+
edge_y.extend([y0, y1, None])
|
460 |
+
|
461 |
+
fig.data[0].x = edge_x
|
462 |
+
fig.data[0].y = edge_y
|
463 |
+
|
464 |
+
node_x, node_y = [], []
|
465 |
+
for node in sub_G.nodes():
|
466 |
+
x, y = pos[node]
|
467 |
+
node_x.append(x)
|
468 |
+
node_y.append(y)
|
469 |
+
|
470 |
+
fig.data[1].x = node_x
|
471 |
+
fig.data[1].y = node_y
|
472 |
+
|
473 |
+
# Color nodes based on their distance from the primary node and highlight status
|
474 |
+
node_colors = []
|
475 |
+
node_sizes = []
|
476 |
+
for node in sub_G.nodes():
|
477 |
+
if node in highlight_nodes:
|
478 |
+
node_colors.append(
|
479 |
+
"rgba(255,0,0,1)"
|
480 |
+
) # Red for highlighted nodes
|
481 |
+
node_sizes.append(15)
|
482 |
+
else:
|
483 |
+
distance = nx.shortest_path_length(
|
484 |
+
sub_G, source=primary_node, target=node
|
485 |
+
)
|
486 |
+
intensity = max(0, 1 - (distance / (depth + 1)))
|
487 |
+
node_colors.append(f"rgba(0,0,255,{intensity})")
|
488 |
+
node_sizes.append(10)
|
489 |
+
|
490 |
+
fig.data[1].marker.color = node_colors
|
491 |
+
fig.data[1].marker.size = node_sizes
|
492 |
+
|
493 |
+
# Update node text
|
494 |
+
node_text = [
|
495 |
+
f"{node}<br># of connections: {len(list(G.neighbors(node)))}"
|
496 |
+
for node in sub_G.nodes()
|
497 |
+
]
|
498 |
+
fig.data[1].text = node_text
|
499 |
+
|
500 |
+
# Get connected nodes for dropdown
|
501 |
+
connected_nodes = sorted(list(G.neighbors(primary_node)))
|
502 |
+
else:
|
503 |
+
# If no primary node or it's not in the graph, show the full graph
|
504 |
+
connected_nodes = sorted(list(G.nodes()))
|
505 |
+
|
506 |
+
return fig, connected_nodes
|
507 |
+
|
508 |
+
def update_node_dropdown(highlight_concepts):
|
509 |
+
if not highlight_concepts or not db:
|
510 |
+
return gr.Dropdown(choices=[])
|
511 |
+
|
512 |
+
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
513 |
+
db_path = os.path.join(ROOT, DEFAULT_TABLES_DIR, db.db_path)
|
514 |
+
_, G, _, _ = generate_concept_cooccurrence_graph(db_path)
|
515 |
+
|
516 |
+
primary_node = highlight_concepts.split(",")[0].strip()
|
517 |
+
if primary_node in G.nodes():
|
518 |
+
connected_nodes = sorted(list(G.neighbors(primary_node)))
|
519 |
+
return gr.Dropdown(choices=connected_nodes)
|
520 |
+
else:
|
521 |
+
return gr.Dropdown(choices=[])
|
522 |
+
|
523 |
+
def update_selected_query(query_description):
|
524 |
+
for desc, sql in canned_queries:
|
525 |
+
if desc == query_description:
|
526 |
+
return sql
|
527 |
+
return ""
|
528 |
+
|
529 |
+
def submit_canned_query(query_description, limit, wrap):
|
530 |
+
for desc, sql in canned_queries:
|
531 |
+
if desc == query_description:
|
532 |
+
return query_db(sql, True, limit, wrap)
|
533 |
+
return pd.DataFrame({"Error": ["Selected query not found."]})
|
534 |
+
|
535 |
+
def copy_as_markdown(df):
|
536 |
+
return df.to_markdown()
|
537 |
+
|
538 |
+
def download_as_csv(df):
|
539 |
+
if df is None or df.empty:
|
540 |
+
return None
|
541 |
+
|
542 |
+
with tempfile.NamedTemporaryFile(
|
543 |
+
mode="w", delete=False, suffix=".csv"
|
544 |
+
) as temp_file:
|
545 |
+
df.to_csv(temp_file.name, index=False)
|
546 |
+
temp_file_path = temp_file.name
|
547 |
+
|
548 |
+
return temp_file_path
|
549 |
+
|
550 |
+
# def nl_to_sql(nl_query):
|
551 |
+
# # Placeholder function for natural language to SQL conversion
|
552 |
+
# return f"SELECT * FROM papers WHERE abstract LIKE '%{nl_query}%' LIMIT 10;"
|
553 |
+
|
554 |
+
db_dropdown.change(
|
555 |
+
load_database_with_graphs,
|
556 |
+
inputs=[db_dropdown],
|
557 |
+
outputs=[status, graph_output],
|
558 |
+
)
|
559 |
+
|
560 |
+
# db_dropdown.change(
|
561 |
+
# debounced_update_graph,
|
562 |
+
# inputs=[db_dropdown, tag_type_dropdown, highlight_input, node_dropdown, depth_slider],
|
563 |
+
# outputs=[graph_output, node_dropdown],
|
564 |
+
# )
|
565 |
+
|
566 |
+
tag_type_dropdown.change(
|
567 |
+
debounced_update_graph,
|
568 |
+
inputs=[
|
569 |
+
db_dropdown,
|
570 |
+
tag_type_dropdown,
|
571 |
+
highlight_input,
|
572 |
+
node_dropdown,
|
573 |
+
depth_slider,
|
574 |
+
],
|
575 |
+
outputs=[graph_output, node_dropdown],
|
576 |
+
)
|
577 |
+
|
578 |
+
highlight_input.change(
|
579 |
+
update_node_dropdown,
|
580 |
+
inputs=[highlight_input],
|
581 |
+
outputs=[node_dropdown],
|
582 |
+
)
|
583 |
+
# node_dropdown.change(
|
584 |
+
# debounced_update_graph,
|
585 |
+
# inputs=[db_dropdown, tag_type_dropdown, highlight_input, node_dropdown, depth_slider],
|
586 |
+
# outputs=[graph_output, node_dropdown],
|
587 |
+
# )
|
588 |
+
|
589 |
+
# depth_slider.change(
|
590 |
+
# debounced_update_graph,
|
591 |
+
# inputs=[db_dropdown, tag_type_dropdown, highlight_input, node_dropdown, depth_slider],
|
592 |
+
# outputs=[graph_output, node_dropdown],
|
593 |
+
# )
|
594 |
+
update_graph_button.click(
|
595 |
+
debounced_update_graph,
|
596 |
+
inputs=[
|
597 |
+
db_dropdown,
|
598 |
+
tag_type_dropdown,
|
599 |
+
highlight_input,
|
600 |
+
node_dropdown,
|
601 |
+
depth_slider,
|
602 |
+
],
|
603 |
+
outputs=[graph_output, node_dropdown],
|
604 |
+
)
|
605 |
+
canned_query_dropdown.change(
|
606 |
+
update_selected_query,
|
607 |
+
inputs=[canned_query_dropdown],
|
608 |
+
outputs=[selected_query],
|
609 |
+
)
|
610 |
+
canned_query_submit.click(
|
611 |
+
submit_canned_query,
|
612 |
+
inputs=[canned_query_dropdown, limit_input, wrap_checkbox],
|
613 |
+
outputs=output,
|
614 |
+
)
|
615 |
+
sql_submit.click(
|
616 |
+
query_db,
|
617 |
+
inputs=[sql_input, gr.Checkbox(value=True), limit_input, wrap_checkbox],
|
618 |
+
outputs=output,
|
619 |
+
)
|
620 |
+
copy_button.click(
|
621 |
+
copy_as_markdown,
|
622 |
+
inputs=[output],
|
623 |
+
outputs=[gr.Textbox(label="Markdown Output", show_copy_button=True)],
|
624 |
+
)
|
625 |
+
download_button.click(
|
626 |
+
download_as_csv, inputs=[output], outputs=[gr.File(label="CSV Output")]
|
627 |
+
)
|
628 |
+
# nl_query_submit.click(nl_to_sql, inputs=[nl_query_input], outputs=[sql_input])
|
629 |
+
|
630 |
+
return demo
|
631 |
+
|
632 |
+
|
633 |
+
demo = create_demo()
|
634 |
+
|
635 |
+
def close_db():
|
636 |
+
global db
|
637 |
+
if db is not None:
|
638 |
+
db.close()
|
639 |
+
db = None
|
640 |
+
|
641 |
+
|
642 |
+
def launch():
    print("Launching Gradio app...", flush=True)
    shared_demo = demo.launch(share=True, prevent_thread_lock=True)

    # In Gradio 4.x, launch() returns (app, local_url, share_url), so the
    # URLs sit at indices 1 and 2; fall back gracefully for other shapes.
    if isinstance(shared_demo, tuple):
        if len(shared_demo) >= 3:
            local_url, share_url = shared_demo[1], shared_demo[2]
        else:
            local_url, share_url = shared_demo[0], "N/A"
    else:
        local_url = getattr(shared_demo, "local_url", "N/A")
        share_url = getattr(shared_demo, "share_url", "N/A")

    print(f"Local URL: {local_url}", flush=True)
    print(f"Shareable link: {share_url}", flush=True)

    print("Gradio app launched.", flush=True)

    # Keep the script running
    demo.block_thread()


if __name__ == "__main__":
    launch()

# Mount the Gradio app
# app = mount_gradio_app(app, demo, path="/")

# print(f"Shareable link: {demo.share_url}")

# @app.exception_handler(Exception)
# async def exception_handler(request: Request, exc: Exception):
#     print(f"An error occurred: {str(exc)}")
#     return {"error": str(exc)}

# @contextmanager
# def get_db_connection():
#     global db
#     conn = db.conn.cursor().connection
#     try:
#         yield conn
#     finally:
#         conn.close()

# @app.on_event("startup")
# async def startup_event():
#     global db
#     ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
#     db_path = os.path.join(ROOT, DEFAULT_TABLES_DIR, get_available_databases()[0])  # Use the first available database
#     db = ArxivDatabase(db_path)
#     db.init_db()

# @app.on_event("shutdown")
# async def shutdown_event():
#     if db is not None:
#         db.close()


# if __name__ == "__main__":
#     uvicorn.run(app, host="0.0.0.0", port=7860)
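All of the graph controls above are routed through debounced_update_graph, which is defined earlier in this file and not shown in this hunk. As a minimal sketch of one way such a wrapper can be built (the 0.5 s window and the wrapped update_graph name are assumptions, not the file's actual values):

import time

def debounce(wait_seconds):
    """Drop calls arriving within wait_seconds of the last accepted call,
    re-serving the previous result instead. Minimal sketch only; the real
    debounced_update_graph in this file may work differently."""
    def decorator(fn):
        state = {"last_call": 0.0, "last_result": None}
        def wrapped(*args, **kwargs):
            now = time.monotonic()
            if now - state["last_call"] < wait_seconds:
                return state["last_result"]
            state["last_call"] = now
            state["last_result"] = fn(*args, **kwargs)
            return state["last_result"]
        return wrapped
    return decorator

# Hypothetical usage: debounced_update_graph = debounce(0.5)(update_graph)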
scripts/run_db_interface_basic.py
ADDED
@@ -0,0 +1,361 @@
import gradio as gr
import os
import json
import networkx as nx
import pandas as pd
import plotly.graph_objects as go
import re
import sys
import sqlite3
import time
import uvicorn

from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from gradio.routes import mount_gradio_app
from plotly.subplots import make_subplots
from tabulate import tabulate
from typing import Optional


ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)

from scripts.create_db import ArxivDatabase
from config import (
    DEFAULT_TABLES_DIR,
    DEFAULT_INTERFACE_MODEL_ID,
    COOCCURRENCE_QUERY,
    canned_queries,
)

app = FastAPI()

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

db: Optional[ArxivDatabase] = None


def truncate_or_wrap_text(text, max_length=50, wrap=False):
    """Truncate text to a maximum length, adding ellipsis if truncated, or wrap if specified."""
    if wrap:
        return "\n".join(
            text[i : i + max_length] for i in range(0, len(text), max_length)
        )
    return text[:max_length] + "..." if len(text) > max_length else text


def format_url(url):
    """Format URL to be more compact in the table."""
    return url.split("/")[-1] if url.startswith("http") else url
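# A quick, self-contained check of the two helpers above; illustrative only
# (the sample strings and arXiv ID are made up), never called by the app.
def _format_helpers_demo():
    assert truncate_or_wrap_text("x" * 120) == "x" * 50 + "..."
    assert truncate_or_wrap_text("x" * 120, wrap=True).count("\n") == 2
    assert format_url("https://arxiv.org/abs/2401.00001") == "2401.00001"
    assert format_url("not a url") == "not a url"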
def get_available_databases():
    ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    tables_dir = os.path.join(ROOT, DEFAULT_TABLES_DIR)
    return [f for f in os.listdir(tables_dir) if f.endswith(".db")]


def query_db(query, is_sql, limit=None, wrap=False):
    global db
    if db is None:
        return pd.DataFrame({"Error": ["Please load a database first."]})

    try:
        cursor = db.conn.cursor()

        query = " ".join(query.strip().split("\n")).rstrip(";")

        if limit is not None:
            if "LIMIT" in query.upper():
                # Replace existing LIMIT clause
                query = re.sub(
                    r"LIMIT\s+\d+", f"LIMIT {limit}", query, flags=re.IGNORECASE
                )
            else:
                query += f" LIMIT {limit}"

        cursor.execute(query)

        column_names = [description[0] for description in cursor.description]

        results = cursor.fetchall()

        df = pd.DataFrame(results, columns=column_names)

        for column in df.columns:
            if df[column].dtype == "object":
                df[column] = df[column].apply(
                    lambda x: (
                        format_url(x)
                        if column == "url"
                        else truncate_or_wrap_text(x, wrap=wrap)
                    )
                )

        return df

    except sqlite3.Error as e:
        return pd.DataFrame({"Error": [f"Database error: {str(e)}"]})
    except Exception as e:
        return pd.DataFrame({"Error": [f"An unexpected error occurred: {str(e)}"]})
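# Illustrative sketch of query_db's LIMIT handling above; the table name is
# an invented placeholder. Note the regex rewrite assumes LIMIT only appears
# as the clause keyword, not inside a string literal.
def _limit_rewrite_demo():
    q = "SELECT title FROM papers LIMIT 5"
    rewritten = re.sub(r"LIMIT\s+\d+", "LIMIT 100", q, flags=re.IGNORECASE)
    assert rewritten == "SELECT title FROM papers LIMIT 100"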
def generate_concept_cooccurrence_graph(db_path):
    conn = sqlite3.connect(db_path)
    df = pd.read_sql_query(COOCCURRENCE_QUERY, conn)
    conn.close()

    G = nx.from_pandas_edgelist(df, "concept1", "concept2", "co_occurrences")
    pos = nx.spring_layout(G)

    edge_x = []
    edge_y = []
    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        line=dict(width=0.5, color="#888"),
        hoverinfo="none",
        mode="lines",
    )

    node_x = [pos[node][0] for node in G.nodes()]
    node_y = [pos[node][1] for node in G.nodes()]

    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        mode="markers",
        hoverinfo="text",
        marker=dict(
            showscale=True,
            colorscale="YlGnBu",
            size=10,
            colorbar=dict(
                thickness=15,
                title="Node Connections",
                xanchor="left",
                titleside="right",
            ),
        ),
    )

    node_adjacencies = []
    node_text = []
    for node, adjacencies in G.adjacency():
        node_adjacencies.append(len(adjacencies))
        node_text.append(f"{node}<br># of connections: {len(adjacencies)}")

    node_trace.marker.color = node_adjacencies
    node_trace.text = node_text

    fig = go.Figure(
        data=[edge_trace, node_trace],
        layout=go.Layout(
            title="Concept Co-occurrence Network",
            titlefont_size=16,
            showlegend=False,
            hovermode="closest",
            margin=dict(b=20, l=5, r=5, t=40),
            annotations=[
                dict(
                    text="",
                    showarrow=False,
                    xref="paper",
                    yref="paper",
                    x=0.005,
                    y=-0.002,
                )
            ],
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        ),
    )
    return fig
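# Illustrative sketch only: COOCCURRENCE_QUERY is defined in config.py and
# not shown in this hunk. The function above only requires that it return
# concept1 / concept2 / co_occurrences columns; a self-join of roughly this
# shape would satisfy that contract (table and column names below are
# assumptions, not the project's actual schema).
_EXAMPLE_COOCCURRENCE_QUERY = """
SELECT a.concept AS concept1,
       b.concept AS concept2,
       COUNT(*)  AS co_occurrences
FROM paper_concepts AS a
JOIN paper_concepts AS b
  ON a.paper_id = b.paper_id AND a.concept < b.concept
GROUP BY a.concept, b.concept
"""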
# def load_database_with_graphs(db_name):
#     global db
#     ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
#     db_path = os.path.join(ROOT, DEFAULT_TABLES_DIR, db_name)
#     if not os.path.exists(db_path):
#         return f"Database {db_name} does not exist.", None
#     db = ArxivDatabase(db_path)
#     db.init_db()
#     if db.is_db_empty:
#         return (
#             f"Database loaded from {db_path}, but it is empty. Please populate it with data.",
#             None,
#         )

#     # Generate graph
#     graph = generate_concept_cooccurrence_graph(db_path)

#     return f"Database loaded from {db_path}", graph


def load_database_with_graphs(db_name):
    global db
    ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    db_path = os.path.join(ROOT, DEFAULT_TABLES_DIR, db_name)
    if not os.path.exists(db_path):
        return f"Database {db_name} does not exist.", None

    # Reuse the open connection when the same database is requested again.
    if db is None or db.db_path != db_path:
        db = ArxivDatabase(db_path)
        db.init_db()

    if db.is_db_empty:
        return (
            f"Database loaded from {db_path}, but it is empty. Please populate it with data.",
            None,
        )

    graph = generate_concept_cooccurrence_graph(db_path)
    return f"Database loaded from {db_path}", graph


css = """
#selected-query {
    max-height: 100px;
    overflow-y: auto;
    white-space: pre-wrap;
    word-break: break-word;
}
"""


def create_demo():
    with gr.Blocks(css=css) as demo:
        gr.Markdown("# ArXiv Database Query Interface")

        with gr.Row():
            db_dropdown = gr.Dropdown(
                choices=get_available_databases(), label="Select Database"
            )
            load_db_btn = gr.Button("Load Database", size="sm")
            status = gr.Textbox(label="Status")

        with gr.Row():
            graph_output = gr.Plot(label="Concept Co-occurrence Graph")

        with gr.Row():
            wrap_checkbox = gr.Checkbox(label="Wrap long text", value=False)
            canned_query_dropdown = gr.Dropdown(
                choices=[q[0] for q in canned_queries], label="Select Query", scale=3
            )
            limit_input = gr.Number(
                label="Limit", value=10000, step=1, minimum=1, scale=1
            )
            selected_query = gr.Textbox(
                label="Selected Query",
                interactive=False,
                scale=2,
                show_label=True,
                show_copy_button=True,
                elem_id="selected-query",
            )
            canned_query_submit = gr.Button("Submit Query", size="sm", scale=1)

        with gr.Row():
            sql_input = gr.Textbox(label="Custom SQL Query", lines=3, scale=4)
            sql_submit = gr.Button("Submit Custom SQL", size="sm", scale=1)

        output = gr.DataFrame(label="Results", wrap=True)

        def update_selected_query(query_description):
            for desc, sql in canned_queries:
                if desc == query_description:
                    return sql
            return ""

        def submit_canned_query(query_description, limit, wrap):
            for desc, sql in canned_queries:
                if desc == query_description:
                    return query_db(sql, True, limit, wrap)
            return pd.DataFrame({"Error": ["Selected query not found."]})

        load_db_btn.click(
            load_database_with_graphs,
            inputs=[db_dropdown],
            outputs=[status, graph_output],
        )
        canned_query_dropdown.change(
            update_selected_query,
            inputs=[canned_query_dropdown],
            outputs=[selected_query],
        )
        canned_query_submit.click(
            submit_canned_query,
            inputs=[canned_query_dropdown, limit_input, wrap_checkbox],
            outputs=output,
        )
        sql_submit.click(
            query_db,
            inputs=[sql_input, gr.Checkbox(value=True), limit_input, wrap_checkbox],
            outputs=output,
        )

    return demo


demo = create_demo()


def close_db():
    global db
    if db is not None:
        db.close()
        db = None


# def launch():
#     print("Launching Gradio app...", flush=True)
#     demo.launch(share=True)
#     print(
#         "Gradio app launched. If you don't see a URL above, there might be network restrictions.",
#         flush=True,
#     )

#     close_db()

# if __name__ == "__main__":
#     launch()

# Mount the Gradio app
app = mount_gradio_app(app, demo, path="/")


@app.exception_handler(Exception)
async def exception_handler(request: Request, exc: Exception):
    print(f"An error occurred: {str(exc)}")
    return {"error": str(exc)}


@app.on_event("startup")
async def startup_event():
    # You can initialize the database here if needed
    pass


@app.on_event("shutdown")
async def shutdown_event():
    close_db()


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
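A side note on the FastAPI hooks in this file: @app.on_event("startup") and @app.on_event("shutdown") still work on 4.x-era stacks but are deprecated in newer FastAPI releases in favor of a lifespan handler. A sketch of the equivalent wiring, assuming FastAPI >= 0.93:

from contextlib import asynccontextmanager
from fastapi import FastAPI

@asynccontextmanager
async def lifespan(app: FastAPI):
    # startup work (e.g. opening the database) goes before the yield
    yield
    # shutdown work runs once the server stops serving
    close_db()

# app = FastAPI(lifespan=lifespan)  # then mount_gradio_app(app, demo, path="/")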
scripts/run_db_interface_js.py
ADDED
File without changes