#DocArrayInMemorySearch is a document index provided by DocArray that stores documents in memory.
#It is a good starting point for small datasets where you may not want to launch a database server.
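#A minimal sketch of that in-memory alternative (assumes the current
#langchain_community API; split_docs and embedding_fn are placeholders,
#and this path is not used in this app):
#   from langchain_community.vectorstores import DocArrayInMemorySearch
#   mem_db = DocArrayInMemorySearch.from_documents(split_docs, embedding_fn)
#   hits = mem_db.similarity_search("your query", k=4)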

# import libraries
import os
import streamlit as st
import requests
from bs4 import BeautifulSoup
#from langchain.indexes import VectorstoreIndexCreator      #Logic for creating indexes.
#from langchain.vectorstores import DocArrayInMemorySearch  #document index provided by Docarray that stores documents in memory.
from sentence_transformers import SentenceTransformer
from langchain_community.llms import HuggingFaceEndpoint
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain.chains import RetrievalQA

#import vertexai
#from langchain.llms import VertexAI
#from langchain.embeddings import VertexAIEmbeddings

#vertexai.init(project=PROJECT, location=LOCATION)        #GCP PROJECT ID, LOCATION as region.

#The PaLM 2 for Text (text-bison, text-unicorn) foundation models are optimized for a variety of natural language 
#tasks such as sentiment analysis, entity extraction, and content creation. The types of content that the PaLM 2 for
#Text models can create include document summaries, answers to questions, and labels that classify content.
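#A rough usage sketch with the commented-out VertexAI wrapper below (the
#prompt is illustrative and this path is not exercised by the app):
#   palm = VertexAI(model_name="text-bison@001")
#   print(palm("Summarize this document in one sentence: ..."))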

#HuggingFaceEndpoint expects a Hugging Face token (HUGGINGFACEHUB_API_TOKEN) in the environment.
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.2", temperature=0.3)
#model = SentenceTransformer("all-MiniLM-L6-v2")

#llm = VertexAI(model_name="text-bison@001",max_output_tokens=256,temperature=0.1,top_p=0.8,top_k=40,verbose=True,)
    
#embeddings = VertexAIEmbeddings()
#embeddings = model.encode(sentences)

#The function below scrapes the paragraph text from the webpage link provided by the user and saves it to a text file.
def get_text(url):
    # Send a GET request to the URL
    response = requests.get(url, timeout=30)

    # Create a BeautifulSoup object with the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the elements containing the text to scrape; here, all <p> tags
    paragraphs = soup.find_all("p")

    # Write each paragraph's text to a temporary file
    os.makedirs("text", exist_ok=True)
    with open(os.path.join("text", "temp.txt"), "w", encoding="utf-8") as file:
        for paragraph in paragraphs:
            file.write(paragraph.get_text() + "\n")

@st.cache_resource
def create_langchain_index(input_text):
    print("--indexing---")
    get_text(input_text)
    loader = TextLoader(os.path.join("text", "temp.txt"), encoding="utf-8")
    documents = loader.load()
    # split it into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)
    # create the open-source embedding function
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # embed the chunks into a Chroma index persisted on disk
    db = Chroma.from_documents(documents=docs, embedding=embeddings, persist_directory="chroma_db")
    return db
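#The persisted index can be reloaded later without re-embedding, e.g.
#(a sketch, assuming the same embedding function is supplied):
#   db = Chroma(persist_directory="chroma_db", embedding_function=embeddings)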

# @st.cache_resource
# def get_basic_page_details(input_text,summary_query,tweet_query,ln_query):
#     index = create_langchain_index(input_text)
#     summary_response = index.query(summary_query)
#     tweet_response = index.query(tweet_query)
#     ln_response = index.query(ln_query)

#     return summary_response,tweet_response,ln_response


@st.cache_data
def get_response(input_text, query, _db):
    # the leading underscore tells st.cache_data not to hash the vector store argument
    print(f"--querying---{query}")
    retrieval_chain = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=_db.as_retriever())
    response = retrieval_chain.run(query)
    #response = index.query(query,llm=llm)
    return response
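#Note: chain.run() is deprecated in recent LangChain releases; a rough
#equivalent under the newer invoke API (an assumption, not verified here) is:
#   response = retrieval_chain.invoke({"query": query})["result"]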

#The below code is a simple flow to accept the webpage link and process the queries
#using the get_response function created above. Thanks to the cache, repeated
#queries on the same link are served without re-indexing or re-querying the model.

st.title('Webpage Question and Answering')


input_text = st.text_input("Provide the link to the webpage...")

summary_response = ""
tweet_response = ""
ln_response = ""
# if st.button("Load"):
if input_text:
    db = create_langchain_index(input_text)
    summary_query = "Write a 100-word summary of the document"
    summary_response = get_response(input_text, summary_query, db)

    tweet_query = "Write a Twitter tweet about the document"
    tweet_response = get_response(input_text, tweet_query, db)

    ln_query = "Write a LinkedIn post for the document"
    ln_response = get_response(input_text, ln_query, db)


    with st.expander('Page Summary'): 
        st.info(summary_response)

    with st.expander('Tweet'): 
        st.info(tweet_response)

    with st.expander('LinkedIn Post'): 
        st.info(ln_response)


question = st.text_input("Ask a question about the link you shared...")
if st.button("Ask"):
    if question and input_text:
        db = create_langchain_index(input_text)
        response = get_response(input_text, question, db)
        st.write(response)
    elif not input_text:
        st.warning("Please provide a webpage link first.")
    else:
        st.warning("Please enter a question.")