# techdocuments/processor.py
# Source: upload by lonardonifabio (commit 7c17b1d, verified)
import os
import json
from datetime import datetime, timezone

import requests

from llama_runner import summarize_text
# GitHub contents API for the repository root; returns one JSON entry per file.
REPO_API = "https://api.github.com/repos/lonardonifabio/Artificial-Intelligence-and-Data-Science/contents/"
# Local scratch directory where downloaded PDFs are written before text extraction.
DOC_FOLDER = "docs_temp/"
# Persistent store: a JSON list of summarized-document records ("titolo", "link", ...).
STORE_FILE = "document_store.json"
# Ensure the download directory exists at import time (idempotent).
os.makedirs(DOC_FOLDER, exist_ok=True)
def get_github_files():
    """Return the GitHub API entries for PDF files in the repository root.

    Queries the repo contents API and keeps only entries whose ``name``
    ends with ``.pdf``.

    Returns:
        list[dict]: GitHub content entries (one dict per PDF file).

    Raises:
        requests.HTTPError: if the API request returns an error status.
        requests.Timeout: if the API does not respond within 30 seconds.
    """
    # Timeout prevents an unresponsive API from hanging the pipeline forever.
    response = requests.get(REPO_API, timeout=30)
    # Fail loudly here: an error payload is a dict, not a list, and would
    # otherwise raise a confusing error inside the comprehension below.
    response.raise_for_status()
    return [f for f in response.json() if f["name"].endswith(".pdf")]
def process_new_documents():
    """Download, summarize, and index PDFs not yet present in the store.

    Fetches the repository's PDF list, skips files whose name already
    appears as ``titolo`` in the JSON store, and for each new file:
    downloads it to ``DOC_FOLDER``, extracts its text, summarizes it via
    ``summarize_text``, and appends the enriched record to ``STORE_FILE``.

    Side effects: writes PDFs under ``DOC_FOLDER`` and rewrites
    ``STORE_FILE`` (creating it as an empty list on first run).
    """
    # Deferred third-party import, hoisted out of the per-file loop so it
    # executes once per call instead of once per document.
    from PyPDF2 import PdfReader

    files = get_github_files()

    # Titles already summarized; seed an empty store on first run so the
    # append step below always finds a valid JSON list.
    try:
        with open(STORE_FILE, "r") as f:
            processed = {d["titolo"] for d in json.load(f)}
    except FileNotFoundError:
        processed = set()
        with open(STORE_FILE, "w") as f:
            json.dump([], f)

    new_docs = []
    for file in files:
        if file["name"] in processed:
            continue

        # Download the PDF; timeout + status check stop us from saving and
        # "summarizing" an HTML error page or hanging on a dead connection.
        r = requests.get(file["download_url"], timeout=60)
        r.raise_for_status()
        pdf_path = os.path.join(DOC_FOLDER, file["name"])
        with open(pdf_path, "wb") as f:
            f.write(r.content)

        # Raw text extraction; extract_text() is called once per page (the
        # original called it twice) and pages with no text are skipped.
        reader = PdfReader(pdf_path)
        text = "\n".join(
            page_text
            for page in reader.pages
            if (page_text := page.extract_text())
        )

        result = summarize_text(text)
        result["titolo"] = file["name"]
        result["link"] = file["html_url"]
        # Timezone-aware UTC timestamp (adds "+00:00"); datetime.utcnow()
        # is deprecated and produced a naive value.
        result["data_caricamento"] = datetime.now(timezone.utc).isoformat()
        new_docs.append(result)

    # Append all new records to the persistent store in a single write.
    if new_docs:
        with open(STORE_FILE, "r") as f:
            data = json.load(f)
        data.extend(new_docs)
        with open(STORE_FILE, "w") as f:
            json.dump(data, f, indent=2)