File size: 2,175 Bytes
7ed183c
0170414
7ed183c
 
 
 
 
 
 
 
 
 
 
 
0170414
7ed183c
 
 
 
 
 
 
 
0170414
7ed183c
 
 
 
0170414
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File: orchestrator/provenance.py
from datetime import datetime, timezone

from sqlalchemy import Column, String, Integer, DateTime, ForeignKey, create_engine
from sqlalchemy.orm import declarative_base, relationship, sessionmaker

Base = declarative_base()

class Paper(Base):
    """ORM model for one row of the ``papers`` table.

    A paper is keyed by a string ``id`` and carries free-text ``title``,
    ``authors`` and ``abstract`` columns. ``runs`` lists the ``Run`` rows
    whose ``paper_id`` points at this paper.
    """
    __tablename__ = 'papers'
    # String primary key: the identifier is supplied by the caller rather
    # than generated by the database.
    id = Column(String, primary_key=True)
    title = Column(String)
    # Stored as a single string; the format used for multiple authors is not
    # defined by this model.
    authors = Column(String)
    abstract = Column(String)
    # Filled in with the current UTC time when a row is inserted without an
    # explicit value. datetime.utcnow() is deprecated since Python 3.12, so
    # take an aware "now" and drop tzinfo to keep storing the same naive UTC
    # value as before.
    fetched_at = Column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )
    # Two-way link with Run.paper.
    runs = relationship('Run', back_populates='paper')

class Run(Base):
    """ORM model for one row of the ``runs`` table.

    Each run has an auto-incrementing integer ``id``, references a paper
    through ``paper_id`` and records a ``cell_index`` together with the
    ``output`` text that was produced.
    """
    __tablename__ = 'runs'
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Foreign key to papers.id; nullable, since no constraint says otherwise.
    paper_id = Column(String, ForeignKey('papers.id'))
    # NOTE(review): presumably the position of the executed cell within a
    # notebook - confirm against the code that writes runs.
    cell_index = Column(Integer)
    output = Column(String)
    # Filled in with the current UTC time when a row is inserted without an
    # explicit value. datetime.utcnow() is deprecated since Python 3.12, so
    # take an aware "now" and drop tzinfo to keep storing the same naive UTC
    # value as before.
    executed_at = Column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )
    # Two-way link with Paper.runs.
    paper = relationship('Paper', back_populates='runs')

def init_db(db_url: str):
    """Create the tables registered on ``Base`` and return a session factory.

    Args:
        db_url: Database URL passed straight to ``create_engine``.

    Returns:
        A ``sessionmaker`` bound to the new engine; call it to open a session.
    """
    db_engine = create_engine(db_url)
    # Emit CREATE TABLE for every model declared on Base (papers, runs).
    Base.metadata.create_all(db_engine)
    session_factory = sessionmaker(bind=db_engine)
    return session_factory

# File: scripts/ingest.py
import sys
import yaml
from orchestrator.client import MCPClient

"""
Usage:
    python ingest.py "search query"
"""
def _join_authors(authors):
    """Return *authors* as one comma-separated string for the insert metadata.

    Search results are not guaranteed to carry a list of strings, so a missing
    or empty value becomes ``''``, a bare string is passed through unchanged
    (joining it would split it into single characters), and any other
    iterable has each entry converted with ``str`` before joining.
    """
    if not authors:
        return ''
    if isinstance(authors, str):
        return authors
    return ','.join(str(author) for author in authors)


def main():
    """Search the configured sources for a query and insert the hits into Chroma.

    The query is read from ``sys.argv[1]`` and the MCP server settings from
    ``config.yaml`` in the current working directory. Both searches and every
    insert are best-effort: an error is printed and processing continues.

    Exits with status 1 when no query is given or when at least one insert
    failed, so callers can still detect a partial ingest.
    """
    if len(sys.argv) < 2:
        print('Please provide a search query.')
        sys.exit(1)
    query = sys.argv[1]

    # Use a context manager so the config file handle is closed promptly
    # instead of being left to the garbage collector.
    with open('config.yaml', encoding='utf-8') as config_file:
        cfg = yaml.safe_load(config_file)
    servers = cfg['mcp_servers']
    web = MCPClient(servers['web_search'])
    pubmed = MCPClient(servers['pubmed'])
    chroma = MCPClient(servers['chroma'])

    print(f'Ingesting papers for query: {query}')
    papers = []
    # A failing source must not prevent the other one from contributing.
    try:
        papers += web.call('web_search.search', {'q': query}) or []
    except Exception as e:
        print('Web search error:', e)
    try:
        papers += pubmed.call('metatool.query', {'source': 'PubMed', 'q': query}) or []
    except Exception as e:
        print('PubMed error:', e)

    failed = 0
    for paper in papers:
        pid = paper.get('id')
        meta = {
            'title': paper.get('title'),
            'authors': _join_authors(paper.get('authors')),
        }
        # `or ''` also covers an explicit None abstract, not just a missing key.
        payload = {'id': pid, 'text': paper.get('abstract') or '', 'metadata': meta}
        # One bad record should not abort the remaining inserts; this mirrors
        # the best-effort handling of the searches above.
        try:
            chroma.call('chroma.insert', payload)
        except Exception as e:
            failed += 1
            print(f'Chroma insert error for {pid!r}:', e)

    if failed:
        print(f'Finished with {failed} failed insert(s).')
        sys.exit(1)
    print('Done ingesting!')


if __name__ == '__main__':
    main()