Spaces:
Runtime error
Runtime error
| from dotenv import load_dotenv | |
| from langchain import OpenAI | |
| from langchain.chains.summarize import load_summarize_chain | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from load import parse_document | |
# Directory holding the raw source documents for the Cixiidae family.
DOCUMENT_PATH = "data/raw/cixiidae"

# Read variables from a local .env file before the LLM client is built.
# NOTE(review): presumably this is where the OpenAI API key comes from - confirm.
load_dotenv()

# temperature=0 requests the least random completions from the model.
llm = OpenAI(temperature=0)
def summarize(raw_documents):
    """Split documents into chunks and summarize them with a map-reduce chain.

    Args:
        raw_documents: Documents to summarize. They are handed to
            ``RecursiveCharacterTextSplitter.split_documents`` and the
            resulting chunks must expose ``page_content``.

    Returns:
        The summary produced by the chain, or an empty string when there is
        nothing to summarize (no input documents, or splitting produced no
        chunks).
    """
    # Nothing to summarize: avoid indexing into an empty chunk list below.
    if not raw_documents:
        return ""

    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n"], chunk_size=6000, chunk_overlap=300
    )
    docs = text_splitter.split_documents(raw_documents)
    if not docs:
        return ""

    # Diagnostic only: report how the input was chunked.
    num_docs = len(docs)
    num_tokens_first_doc = llm.get_num_tokens(docs[0].page_content)
    print(
        f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens"
    )

    summary_chain = load_summarize_chain(llm=llm, chain_type="map_reduce")
    return summary_chain.run(docs)
def main():
    """Summarize one hard-coded PDF and save the summary as a text file.

    Parses ``data/raw/cixiidae/<name>.pdf`` with ``parse_document``, passes
    the result to ``summarize``, prints the summary, and writes it to
    ``data/processed/cixiidae/<name>-summary.txt``.
    """
    name = "Fulgoroidea2008-FulgoromorphaSeychellesPreliminaryChecklis-Holzinger-LöckerLöcker"

    # Python f-strings interpolate with "{name}". A JavaScript-style "${name}"
    # would leave a literal "$" in the path (or, in a plain string, the whole
    # placeholder), so neither file would be the intended one.
    raw_documents = parse_document(f"data/raw/cixiidae/{name}.pdf")
    output = summarize(raw_documents)
    print(output)

    # Explicit UTF-8 so non-ASCII text in the summary is written the same way
    # regardless of the platform's default encoding.
    with open(
        f"data/processed/cixiidae/{name}-summary.txt",
        "w",
        encoding="utf-8",
    ) as f:
        f.write(output)
# Run the summarization only when executed as a script, not when imported.
if __name__ == "__main__":
    main()