DEFAULT_MODEL_ID = "Meta-Llama-3-70B-Instruct" DEFAULT_INTERFACE_MODEL_ID = "NumbersStation/nsql-llama-2-7B" DEFAULT_KIND = "json" DEFAULT_TEMPERATURE = 0.6 DEFAULT_TOP_P = 0.95 DEFAULT_FEW_SHOT_NUM = 3 DEFAULT_FEW_SHOT_SELECTION = "random" DEFAULT_SAVE_INTERVAL = 3 DEFAULT_RES_DIR = "data/results" DEFAULT_LOG_DIR = "logs" DEFAULT_TABLES_DIR = "data/databases" COOCCURRENCE_QUERY = """ WITH concept_pairs AS ( SELECT p1.concept AS concept1, p2.concept AS concept2, p1.paper_id, p1.tag_type FROM predictions p1 JOIN predictions p2 ON p1.paper_id = p2.paper_id AND p1.concept < p2.concept WHERE p1.tag_type = p2.tag_type ) SELECT concept1, concept2, tag_type, COUNT(DISTINCT paper_id) AS co_occurrences FROM concept_pairs GROUP BY concept1, concept2, tag_type HAVING co_occurrences > 5 ORDER BY co_occurrences DESC; """ canned_queries = [ ( "Modalities in Physics and Astronomy papers", """ SELECT DISTINCT LOWER(concept) AS concept FROM predictions JOIN ( SELECT paper_id, url FROM papers WHERE primary_category LIKE '%physics.space-ph%' OR primary_category LIKE '%astro-ph.%' ) AS paper_ids ON predictions.paper_id = paper_ids.paper_id WHERE predictions.tag_type = 'modality' """, ), ( "Datasets in Evolutionary Biology that use PDEs", """ WITH pde_predictions AS ( SELECT paper_id, concept AS pde_concept, tag_type AS pde_tag_type FROM predictions WHERE tag_type IN ('method', 'model') AND ( LOWER(concept) LIKE '%pde%' OR LOWER(concept) LIKE '%partial differential equation%' ) ) SELECT DISTINCT papers.paper_id, papers.url, LOWER(p_dataset.concept) AS dataset, pde_predictions.pde_concept AS pde_related_concept, pde_predictions.pde_tag_type AS pde_related_type FROM papers JOIN pde_predictions ON papers.paper_id = pde_predictions.paper_id LEFT JOIN predictions p_dataset ON papers.paper_id = p_dataset.paper_id WHERE papers.primary_category LIKE '%q-bio.PE%' AND (p_dataset.tag_type = 'dataset' OR p_dataset.tag_type IS NULL) ORDER BY papers.paper_id, dataset, pde_related_concept; """, ), ( "Trends in objects of study in Cosmology since 2019", """ SELECT substr(papers.updated_on, 2, 4) as year, predictions.concept as object, COUNT(DISTINCT papers.paper_id) as paper_count FROM papers JOIN predictions ON papers.paper_id = predictions.paper_id WHERE predictions.tag_type = 'object' AND CAST(SUBSTR(papers.updated_on, 2, 4) AS INTEGER) >= 2019 GROUP BY year, object ORDER BY year DESC, paper_count DESC; """, ), ( "New datasets in fluid dynamics since 2020", """ WITH ranked_datasets AS ( SELECT p.paper_id, p.url, pred.concept AS dataset, p.updated_on, ROW_NUMBER() OVER (PARTITION BY pred.concept ORDER BY p.updated_on ASC) AS rn FROM papers p JOIN predictions pred ON p.paper_id = pred.paper_id WHERE pred.tag_type = 'dataset' AND p.primary_category LIKE '%physics.flu-dyn%' AND CAST(SUBSTR(p.updated_on, 2, 4) AS INTEGER) >= 2020 ) SELECT paper_id, url, dataset, updated_on FROM ranked_datasets WHERE rn = 1 ORDER BY updated_on ASC """, ), ( "Evolutionary biology datasets that use spatiotemporal dynamics", """ WITH evo_bio_papers AS ( SELECT paper_id FROM papers WHERE primary_category LIKE '%q-bio.PE%' ), spatiotemporal_keywords AS ( SELECT 'spatio-temporal' AS keyword UNION SELECT 'spatiotemporal' UNION SELECT 'spatio-temporal' UNION SELECT 'spatial and temporal' UNION SELECT 'space-time' UNION SELECT 'geographic distribution' UNION SELECT 'phylogeograph' UNION SELECT 'biogeograph' UNION SELECT 'dispersal' UNION SELECT 'migration' UNION SELECT 'range expansion' UNION SELECT 'population dynamics' ) SELECT DISTINCT p.paper_id, p.updated_on, p.abstract, d.concept AS dataset, GROUP_CONCAT(DISTINCT stk.keyword) AS spatiotemporal_keywords_found FROM evo_bio_papers ebp JOIN papers p ON ebp.paper_id = p.paper_id JOIN predictions d ON p.paper_id = d.paper_id JOIN predictions st ON p.paper_id = st.paper_id JOIN spatiotemporal_keywords stk WHERE d.tag_type = 'dataset' AND st.tag_type = 'modality' AND LOWER(st.concept) LIKE '%' || stk.keyword || '%' GROUP BY p.paper_id, p.updated_on, p.abstract, d.concept ORDER BY p.updated_on DESC """, ), ( "What percentage of papers use only galaxy or spectra, or both or neither?", """ WITH paper_modalities AS ( SELECT p.paper_id, MAX(CASE WHEN LOWER(pred.concept) LIKE '%imag%' THEN 1 ELSE 0 END) AS uses_galaxy_images, MAX(CASE WHEN LOWER(pred.concept) LIKE '%spectr%' THEN 1 ELSE 0 END) AS uses_spectra FROM papers p LEFT JOIN predictions pred ON p.paper_id = pred.paper_id WHERE p.primary_category LIKE '%astro-ph%' AND pred.tag_type = 'modality' GROUP BY p.paper_id ), categorized_papers AS ( SELECT CASE WHEN uses_galaxy_images = 1 AND uses_spectra = 1 THEN 'Both' WHEN uses_galaxy_images = 1 THEN 'Only Galaxy Images' WHEN uses_spectra = 1 THEN 'Only Spectra' ELSE 'Neither' END AS category, COUNT(*) AS paper_count FROM paper_modalities GROUP BY CASE WHEN uses_galaxy_images = 1 AND uses_spectra = 1 THEN 'Both' WHEN uses_galaxy_images = 1 THEN 'Only Galaxy Images' WHEN uses_spectra = 1 THEN 'Only Spectra' ELSE 'Neither' END ) SELECT category, paper_count, ROUND(CAST(paper_count AS FLOAT) / (SELECT SUM(paper_count) FROM categorized_papers) * 100, 2) AS percentage FROM categorized_papers ORDER BY paper_count DESC """, ), ( "What are all the next highest data modalities after images and spectra?", """ SELECT LOWER(concept) AS modality, COUNT(DISTINCT paper_id) AS usage_count FROM predictions WHERE tag_type = 'modality' AND LOWER(concept) NOT LIKE '%imag%' AND LOWER(concept) NOT LIKE '%spectr%' GROUP BY LOWER(concept) ORDER BY usage_count DESC """, ), ( "If we include the next biggest data modality, how much does coverage change?", """ WITH modality_counts AS ( SELECT LOWER(concept) AS modality, COUNT(DISTINCT paper_id) AS usage_count FROM predictions WHERE tag_type = 'modality' AND LOWER(concept) NOT LIKE '%imag%' AND LOWER(concept) NOT LIKE '%spectr%' GROUP BY LOWER(concept) ORDER BY usage_count DESC LIMIT 1 ), paper_modalities AS ( SELECT p.paper_id, MAX(CASE WHEN LOWER(pred.concept) LIKE '%imag%' THEN 1 ELSE 0 END) AS uses_galaxy_images, MAX(CASE WHEN LOWER(pred.concept) LIKE '%spectr%' THEN 1 ELSE 0 END) AS uses_spectra, MAX(CASE WHEN LOWER(pred.concept) LIKE (SELECT '%' || modality || '%' FROM modality_counts) THEN 1 ELSE 0 END) AS uses_third_modality FROM papers p LEFT JOIN predictions pred ON p.paper_id = pred.paper_id WHERE p.primary_category LIKE '%astro-ph%' AND pred.tag_type = 'modality' GROUP BY p.paper_id ), coverage_before AS ( SELECT SUM(CASE WHEN uses_galaxy_images = 1 OR uses_spectra = 1 THEN 1 ELSE 0 END) AS covered_papers, COUNT(*) AS total_papers FROM paper_modalities ), coverage_after AS ( SELECT SUM(CASE WHEN uses_galaxy_images = 1 OR uses_spectra = 1 OR uses_third_modality = 1 THEN 1 ELSE 0 END) AS covered_papers, COUNT(*) AS total_papers FROM paper_modalities ) SELECT (SELECT modality FROM modality_counts) AS third_modality, ROUND(CAST(covered_papers AS FLOAT) / total_papers * 100, 2) AS coverage_before_percent, ROUND(CAST((SELECT covered_papers FROM coverage_after) AS FLOAT) / total_papers * 100, 2) AS coverage_after_percent, ROUND(CAST((SELECT covered_papers FROM coverage_after) AS FLOAT) / total_papers * 100, 2) - ROUND(CAST(covered_papers AS FLOAT) / total_papers * 100, 2) AS coverage_increase_percent FROM coverage_before """, ), ( "Coverage if we select the next 5 highest modalities?", """ WITH ranked_modalities AS ( SELECT LOWER(concept) AS modality, COUNT(DISTINCT paper_id) AS usage_count, ROW_NUMBER() OVER (ORDER BY COUNT(DISTINCT paper_id) DESC) AS rank FROM predictions WHERE tag_type = 'modality' AND LOWER(concept) NOT LIKE '%imag%' AND LOWER(concept) NOT LIKE '%spectr%' GROUP BY LOWER(concept) ), paper_modalities AS ( SELECT p.paper_id, MAX(CASE WHEN LOWER(pred.concept) LIKE '%imag%' THEN 1 ELSE 0 END) AS uses_images, MAX(CASE WHEN LOWER(pred.concept) LIKE '%spectr%' THEN 1 ELSE 0 END) AS uses_spectra, MAX(CASE WHEN rm.rank = 1 THEN 1 ELSE 0 END) AS uses_modality_1, MAX(CASE WHEN rm.rank = 2 THEN 1 ELSE 0 END) AS uses_modality_2, MAX(CASE WHEN rm.rank = 3 THEN 1 ELSE 0 END) AS uses_modality_3, MAX(CASE WHEN rm.rank = 4 THEN 1 ELSE 0 END) AS uses_modality_4, MAX(CASE WHEN rm.rank = 5 THEN 1 ELSE 0 END) AS uses_modality_5 FROM papers p LEFT JOIN predictions pred ON p.paper_id = pred.paper_id LEFT JOIN ranked_modalities rm ON LOWER(pred.concept) = rm.modality WHERE p.primary_category LIKE '%astro-ph%' AND pred.tag_type = 'modality' GROUP BY p.paper_id ), cumulative_coverage AS ( SELECT 'Images and Spectra' AS modalities, 0 AS added_modality_rank, SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 THEN 1 ELSE 0 END) AS covered_papers, COUNT(*) AS total_papers FROM paper_modalities UNION ALL SELECT 'Images, Spectra, and Modality 1' AS modalities, 1 AS added_modality_rank, SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 THEN 1 ELSE 0 END) AS covered_papers, COUNT(*) AS total_papers FROM paper_modalities UNION ALL SELECT 'Images, Spectra, Modality 1, and 2' AS modalities, 2 AS added_modality_rank, SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 THEN 1 ELSE 0 END) AS covered_papers, COUNT(*) AS total_papers FROM paper_modalities UNION ALL SELECT 'Images, Spectra, Modality 1, 2, and 3' AS modalities, 3 AS added_modality_rank, SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 OR uses_modality_3 = 1 THEN 1 ELSE 0 END) AS covered_papers, COUNT(*) AS total_papers FROM paper_modalities UNION ALL SELECT 'Images, Spectra, Modality 1, 2, 3, and 4' AS modalities, 4 AS added_modality_rank, SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 OR uses_modality_3 = 1 OR uses_modality_4 = 1 THEN 1 ELSE 0 END) AS covered_papers, COUNT(*) AS total_papers FROM paper_modalities UNION ALL SELECT 'Images, Spectra, Modality 1, 2, 3, 4, and 5' AS modalities, 5 AS added_modality_rank, SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 OR uses_modality_3 = 1 OR uses_modality_4 = 1 OR uses_modality_5 = 1 THEN 1 ELSE 0 END) AS covered_papers, COUNT(*) AS total_papers FROM paper_modalities ) SELECT cc.modalities, COALESCE(rm.modality, 'N/A') AS added_modality, rm.usage_count AS added_modality_usage, ROUND(CAST(cc.covered_papers AS FLOAT) / cc.total_papers * 100, 2) AS coverage_percent, ROUND(CAST(cc.covered_papers AS FLOAT) / cc.total_papers * 100, 2) - LAG(ROUND(CAST(cc.covered_papers AS FLOAT) / cc.total_papers * 100, 2), 1, 0) OVER (ORDER BY cc.added_modality_rank) AS coverage_increase_percent FROM cumulative_coverage cc LEFT JOIN ranked_modalities rm ON cc.added_modality_rank = rm.rank ORDER BY cc.added_modality_rank """, ), ( "List all papers", "SELECT paper_id, abstract AS abstract_preview, authors, primary_category FROM papers", ), ( "Count papers by category", "SELECT primary_category, COUNT(*) as paper_count FROM papers GROUP BY primary_category ORDER BY paper_count DESC", ), ( "Top authors with most papers", """ WITH author_papers AS ( SELECT json_each.value AS author FROM papers, json_each(papers.authors) ) SELECT author, COUNT(*) as paper_count FROM author_papers GROUP BY author ORDER BY paper_count DESC """, ), ( "Papers with 'quantum' in abstract", "SELECT paper_id, abstract AS abstract_preview FROM papers WHERE abstract LIKE '%quantum%'", ), ( "Most common concepts", "SELECT concept, COUNT(*) as concept_count FROM predictions GROUP BY concept ORDER BY concept_count DESC", ), ( "Papers with multiple authors", """ SELECT paper_id, json_array_length(authors) as author_count, authors FROM papers WHERE json_array_length(authors) > 1 ORDER BY author_count DESC """, ), ]