import os
import textwrap

import streamlit as st
import torch
import transformers
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from langchain.llms import HuggingFacePipeline, OpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, PyPDFLoader, DirectoryLoader
from langchain.document_loaders import GoogleDriveLoader

#from datasets import load_dataset
#dataset = load_dataset("heyal/carbon_data")
def create_vectorstore(embedding, texts, db_name='chromadb'):
    "Embed the text chunks, persist them to a directory, and return the vector store."
    persist_directory = db_name
    print("Creating vector store.")
    vectordb = Chroma.from_documents(documents=texts,
                                     embedding=embedding,
                                     persist_directory=persist_directory)
    return vectordb
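# Note: the persisted store can be reloaded on a later run without
# re-embedding; a minimal sketch, assuming the same embedding object
# and the default db_name:
#   vectordb = Chroma(persist_directory='chromadb', embedding_function=embedding)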
#"Load and chunk from documents to small text chunks."
def load_chunk(data_dir):
loader = DirectoryLoader(data_dir , glob="./*.pdf", loader_cls=PyPDFLoader)
#loader = GoogleDriveLoader(folder_id = data_dir, glob="./*.pdf", loader_cls=PyPDFLoader, credentials_path='googlecreds.json')
documents = loader.load()
#documents = dataset
print(f"{len(documents)} documents are loaded.")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
chunk_overlap=20,
length_function = len,
separators=["\n\n", "\n", " ", ""])
text_chunks = text_splitter.split_documents(documents)
print(f"{len(text_chunks)} are splitted from documents.")
return text_chunks
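# Quick sanity check of the chunking, as a sketch (assumes PDFs exist
# under data_dir):
#   chunks = load_chunk(data_dir)
#   print(chunks[0].metadata['source'])    # originating PDF path
#   print(chunks[0].page_content[:200])    # first 200 characters of the chunk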
def format_result(text, width=100):
    "Wrap each line of the text to the given width for readability."
    lines = text.split('\n')
    wrapped_lines = '\n'.join([textwrap.fill(line, width=width) for line in lines])
    return wrapped_lines
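# Example: format_result('x' * 250) returns the same text re-wrapped so that
# no line exceeds 100 characters.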
def postprocess_response(llm_response):
    "Format and print the LLM response: query, result, and semantic search sources."
    print(f"Query : {format_result(llm_response['query'])} \n")
    print(f"Result : {format_result(llm_response['result'])} \n")
    print('=' * 90)
    print('\nRetrieved docs (text chunks from PDFs): \n\n')
    for source in llm_response["source_documents"]:
        print(f"Source PDF : {source.metadata['source']} \n\n")
        print(format_result(source.page_content))
        print('-' * 90)
def postprocess_response_in_app(llm_response):
    "Render only the answer text inside the Streamlit app."
    st.write(format_result(llm_response['result']))
def init_embedding(model_name: str):
    "Initialize the text embedding model."
    embeddings = HuggingFaceEmbeddings(model_name=model_name,
                                       model_kwargs={"device": "cuda"})
    return embeddings
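# Usage sketch: HuggingFaceEmbeddings exposes embed_query / embed_documents,
# e.g. init_embedding("all-mpnet-base-v2").embed_query("carbon credits")
# returns a 768-dimensional vector for this model.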
def init_LLM(model_name: str):
    "Initialize the LLM for text generation."
    llm = HuggingFacePipeline.from_model_id(model_id=model_name,
                                            task="text2text-generation",
                                            device=0,
                                            model_kwargs={"temperature": 0,
                                                          "max_length": 512})
    return llm
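# Usage sketch, assuming a seq2seq model id such as the flan-t5-large
# referenced below:
#   llm = init_LLM("google/flan-t5-large")
#   print(llm("What is a carbon credit?"))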
#llm_model_id = "google/flan-t5-large"
#for embeddings
text_model_id = "all-mpnet-base-v2"
text_embeddings = init_embedding(text_model_id)
#llm_model = init_LLM(llm_model_id)
# Read the OpenAI API key from the environment; never hardcode secrets in source.
API = os.environ.get('OPENAI_API_KEY')
llm_model = OpenAI(temperature=0.7, openai_api_key=API)
def generate_context(llm_model, vectordb, query: str, top_k: int):
    "Answer the query with context retrieved from the vector store."
    # fetch similar docs using similarity search
    retriever = vectordb.as_retriever(search_kwargs={"k": top_k})
    # generate an answer grounded in the retrieved docs
    qa_chain = RetrievalQA.from_chain_type(llm=llm_model,
                                           chain_type="stuff",
                                           retriever=retriever,
                                           return_source_documents=True)
    results = qa_chain(query)
    return results
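# The returned dict carries 'query', 'result', and 'source_documents'
# (because return_source_documents=True); a usage sketch:
#   results = generate_context(llm_model, vectordb, "What is a carbon credit?", 3)
#   print(results['result'])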
# Streamlit app
st.title("Omdena-Transitry Carbon Project Demo")
st.write("Mounting Google Drive")
from google.colab import drive
drive.mount('/content/drive/')
st.write("Loading documents")
data_dir = '/content/drive/My Drive/carbon_data'
#data_dir = 'https://drive.google.com/drive/folders/1sSZGhGzXw6oqC8sxKtPwIuaDvx_PfMlh'
#data_dir = '1sSZGhGzXw6oqC8sxKtPwIuaDvx_PfMlh'
texts = load_chunk(data_dir)
st.write("Creating vector store")
vectordb = create_vectorstore(text_embeddings, texts)
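# Optionally flush the store to disk so later runs can reload it instead of
# re-embedding (sketch):
#   vectordb.persist()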
user_question = st.text_input(
    "Enter your question: ",
    placeholder="Cyanobacteria can perform photosynthesis; are they considered plants?",
)
#query = f"Can I develop a project whose purpose is to increase biodiversity? if so, how could biodiversity result in carbon credits?"
query = user_question
if query:
    results = generate_context(llm_model, vectordb, query, 3)
    postprocess_response_in_app(results)
    # also log the full response, including the source chunks, to the console
    postprocess_response(results)