# techdocuments/processor.py
# Source: upload by lonardonifabio (commit 7c17b1d, verified)
import os
import json
from datetime import datetime, timezone

import requests

from llama_runner import summarize_text
# GitHub contents API for the repository root; returns one JSON entry per file.
REPO_API = "https://api.github.com/repos/lonardonifabio/Artificial-Intelligence-and-Data-Science/contents/"
# Local scratch directory where downloaded PDFs are written before text extraction.
DOC_FOLDER = "docs_temp/"
# Persistent store: a JSON list of summarized-document records ("titolo", "link", ...).
STORE_FILE = "document_store.json"
# Ensure the download directory exists at import time (idempotent).
os.makedirs(DOC_FOLDER, exist_ok=True)
def get_github_files():
    """Return the GitHub API entries for PDF files in the repository root.

    Queries the repo contents API and keeps only entries whose ``name``
    ends with ``.pdf``.

    Returns:
        list[dict]: GitHub content entries (one dict per PDF file).

    Raises:
        requests.HTTPError: if the API request returns an error status.
        requests.Timeout: if the API does not respond within 30 seconds.
    """
    # Timeout prevents an unresponsive API from hanging the pipeline forever.
    response = requests.get(REPO_API, timeout=30)
    # Fail loudly here: an error payload is a dict, not a list, and would
    # otherwise raise a confusing error inside the comprehension below.
    response.raise_for_status()
    return [f for f in response.json() if f["name"].endswith(".pdf")]
def process_new_documents():
    """Download, summarize, and index PDFs not yet present in the store.

    Fetches the repository's PDF list, skips files whose name already
    appears as ``titolo`` in the JSON store, and for each new file:
    downloads it to ``DOC_FOLDER``, extracts its text, summarizes it via
    ``summarize_text``, and appends the enriched record to ``STORE_FILE``.

    Side effects: writes PDFs under ``DOC_FOLDER`` and rewrites
    ``STORE_FILE`` (creating it as an empty list on first run).
    """
    # Deferred third-party import, hoisted out of the per-file loop so it
    # executes once per call instead of once per document.
    from PyPDF2 import PdfReader

    files = get_github_files()

    # Titles already summarized; seed an empty store on first run so the
    # append step below always finds a valid JSON list.
    try:
        with open(STORE_FILE, "r") as f:
            processed = {d["titolo"] for d in json.load(f)}
    except FileNotFoundError:
        processed = set()
        with open(STORE_FILE, "w") as f:
            json.dump([], f)

    new_docs = []
    for file in files:
        if file["name"] in processed:
            continue

        # Download the PDF; timeout + status check stop us from saving and
        # "summarizing" an HTML error page or hanging on a dead connection.
        r = requests.get(file["download_url"], timeout=60)
        r.raise_for_status()
        pdf_path = os.path.join(DOC_FOLDER, file["name"])
        with open(pdf_path, "wb") as f:
            f.write(r.content)

        # Raw text extraction; extract_text() is called once per page (the
        # original called it twice) and pages with no text are skipped.
        reader = PdfReader(pdf_path)
        text = "\n".join(
            page_text
            for page in reader.pages
            if (page_text := page.extract_text())
        )

        result = summarize_text(text)
        result["titolo"] = file["name"]
        result["link"] = file["html_url"]
        # Timezone-aware UTC timestamp (adds "+00:00"); datetime.utcnow()
        # is deprecated and produced a naive value.
        result["data_caricamento"] = datetime.now(timezone.utc).isoformat()
        new_docs.append(result)

    # Append all new records to the persistent store in a single write.
    if new_docs:
        with open(STORE_FILE, "r") as f:
            data = json.load(f)
        data.extend(new_docs)
        with open(STORE_FILE, "w") as f:
            json.dump(data, f, indent=2)