import os
import textwrap

import streamlit as st
import torch
import transformers
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

from langchain.llms import HuggingFacePipeline, OpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, PyPDFLoader, DirectoryLoader
from langchain.document_loaders import GoogleDriveLoader
#from datasets import load_dataset
#dataset = load_dataset("heyal/carbon_data")

def create_vectorstore(embedding, texts, db_name='chromadb') -> Chroma:
  "Embed the text chunks, store them in a persistent Chroma directory, and return the vector store."

  persist_directory = db_name
  print("Creating vector store.")
  vectordb = Chroma.from_documents(documents=texts,
                                   embedding=embedding,
                                   persist_directory=persist_directory)

  return vectordb
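
# Note (not part of the original flow, assumes the older langchain/Chroma API): the store is
# only flushed to disk after an explicit vectordb.persist() call, and it can then be reloaded
# without re-embedding via Chroma(persist_directory=db_name, embedding_function=embedding).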

#"Load and chunk from documents to small text chunks."  
def load_chunk(data_dir):
  
  loader = DirectoryLoader(data_dir, glob="./*.pdf", loader_cls=PyPDFLoader)
  #loader = GoogleDriveLoader(folder_id = data_dir, glob="./*.pdf", loader_cls=PyPDFLoader, credentials_path='googlecreds.json')
  documents = loader.load()
  #documents = dataset
  print(f"{len(documents)} documents are loaded.")

  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                                 chunk_overlap=20,
                                                 length_function = len,
                                                 separators=["\n\n", "\n", " ", ""])
  

  text_chunks = text_splitter.split_documents(documents)
  print(f"{len(text_chunks)} are splitted from documents.")

  return text_chunks


def format_result(text, width=100):
  "Format to readable text form"

  lines = text.split('\n')
  wrapped_lines = '\n'.join([textwrap.fill(line, width=width) for line in lines])
  
  return wrapped_lines
  

def postprocess_response(llm_response):
  "Print the query, the LLM answer, and the retrieved source chunks."
  
  print(f"Query  : {format_result(llm_response['query'])} \n")
  print(f"Result : {format_result(llm_response['result'])} \n")
  print('=' *90)
  print('\nRetrieved docs (text chunks from PDFs): \n\n')
  for source in llm_response["source_documents"]:
    print(f"Source PDF : {source.metadata['source']} \n\n")
    print(format_result(source.page_content))
    
    print('-' *90)


def postprocess_response_in_app(llm_response):
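  "Render the LLM answer inside the Streamlit app."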
  st.write(format_result(llm_response['result']))




def init_embedding(model_name: str):
  "Initialize the text embedding model."

  embeddings = HuggingFaceEmbeddings(model_name=model_name,
                                     model_kwargs={"device": "cuda"})
  return embeddings


def init_LLM(model_name: str):
  "Initialize the LLM used for text generation."

  llm = HuggingFacePipeline.from_model_id(model_id=model_name,
                                          task="text2text-generation",
                                          device=0,
                                          model_kwargs={"temperature": 0,
                                                        "max_length": 512})
  return llm

#llm_model_id = "google/flan-t5-large"

# sentence-transformers model used for the text embeddings
text_model_id = "all-mpnet-base-v2"

text_embeddings = init_embedding(text_model_id)
#llm_model = init_LLM(llm_model_id)
API = os.environ.get("OPENAI_API_KEY")  # read the key from the environment rather than hardcoding a secret
llm_model = OpenAI(temperature=0.7, openai_api_key=API)

def generate_context(llm_model, vectordb, query: str, top_k: int):
  "Retrieve the top_k most relevant chunks for the query and generate an answer."

  # fetch similar docs using similarity search
  retriever = vectordb.as_retriever(search_kwargs={"k": top_k})

  # generate an answer from the retrieved docs ("stuff" packs them into a single prompt)
  qa_chain = RetrievalQA.from_chain_type(llm=llm_model,
                                         chain_type="stuff", 
                                         retriever=retriever,
                                         return_source_documents=True)
  
  results = qa_chain(query)

  return results


#app
st.title("Omdena-Transitry Carbon Project Demo")

st.write("Mounting Google drive")

from google.colab import drive
drive.mount('/content/drive/')
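# NOTE: drive.mount only works when this script runs inside Google Colab; running the
# Streamlit app elsewhere would require making the PDFs available locally instead.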

st.write("Loading documents")
data_dir = '/content/drive/My Drive/carbon_data'
#data_dir = 'https://drive.google.com/drive/folders/1sSZGhGzXw6oqC8sxKtPwIuaDvx_PfMlh'
#data_dir = '1sSZGhGzXw6oqC8sxKtPwIuaDvx_PfMlh'
texts = load_chunk(data_dir)

st.write("Creating vector store")
vectordb = create_vectorstore(text_embeddings, texts)

user_question = st.text_input(
    "Enter Your Question: ",
    placeholder="Cyanobacteria can perform photosynthesis; are they considered plants?",
)

#query = f"Can I develop a project whose purpose is to increase biodiversity? if so, how could biodiversity result in carbon credits?"

query = user_question

if query:
  results = generate_context(llm_model, vectordb, query, 3)
  postprocess_response(results)         # log the query, answer, and sources to the console
  postprocess_response_in_app(results)  # show the answer in the Streamlit UI
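
# Launch note (assumes this file is saved as app.py): start the demo with
#   streamlit run app.py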