import os
import re

import requests
import streamlit as st
from bs4 import BeautifulSoup
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, WebBaseLoader, YoutubeLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import Replicate
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

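# Streamlit page setup; the Replicate API token is read from Streamlit secrets.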
st.set_page_config(page_title="πŸ¦œπŸ”— Ask a LLM to know more about me")
st.title('πŸ¦œπŸ”— Ask a LLM to know more about me')
os.environ["REPLICATE_API_TOKEN"] = st.secrets["REPLICATE_API_TOKEN"]

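# Returns True if the string contains a digit; used below to pick out Medium post links.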
def has_numbers(inputString):
    return any(char.isdigit() for char in inputString)

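# Build the retrieval QA chain once and cache it across Streamlit reruns.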
@st.cache_resource
def get_query_chain():
  model_name = "sentence-transformers/all-mpnet-base-v2"
  model_kwargs = {'device': 'cpu'}
  encode_kwargs = {'normalize_embeddings': False}
  hf = HuggingFaceEmbeddings(
      model_name=model_name,
      model_kwargs=model_kwargs,
      encode_kwargs=encode_kwargs
  )
  # Scrape the channel page for video ids, then load a transcript for each video.
  my_url = "https://www.youtube.com/@rrwithdeku8677/videos"
  r = requests.get(my_url)
  page = r.text
  pattern = r'watch\?v=([^"]+)'
  matches = re.findall(pattern, page, re.IGNORECASE)
  ids = [x.split('=')[-1] for x in matches]
  base_url = "https://www.youtube.com/watch?v="

  video_data=  []

  #TODO - Cache this and only do this if there is a new video
  for id in ids:
      loader = YoutubeLoader.from_youtube_url(
          base_url + id, add_video_info=True
      )
      print("got loader")
      data = loader.load()
      video_data.extend(data)
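  # Collect blog post links from the Medium profile page.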
  profile_url = "https://ayushtues.medium.com"
  response = requests.get(profile_url)
  soup = BeautifulSoup(response.content, 'html.parser')
  links = []
  for link in soup.findAll('a'):
      x = link.get('href')
      # Anchors without an href return None, so guard before calling startswith.
      if x and x.startswith('/') and has_numbers(x):
          links.append(x)
  # De-duplicate, strip tracking query params, and turn the paths into absolute URLs.
  links = list(set(links))
  links = [profile_url + x.split('?source')[0] for x in links]
  links += ["https://ayushtues.github.io/"]
  loader = WebBaseLoader(links)
  data = loader.load()
  video_data.extend(data)

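  # Download the resume PDF hosted in this Space's repo and load its pages.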
  url = 'https://huggingface.co/spaces/ayushtues/personal-assistant/resolve/main/resume.pdf'
  r = requests.get(url, stream=True)

  with open('resume.pdf', 'wb') as fd:
      for chunk in r.iter_content(2000):
          fd.write(chunk)

  loader = PyPDFLoader("resume.pdf")
  pages = loader.load()
  video_data.extend(pages)


  # Split every document into small chunks, embed them, and index them in FAISS.
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=0)
  all_splits = text_splitter.split_documents(video_data)
  vectorstore = FAISS.from_documents(documents=all_splits, embedding=hf)
  retriever = vectorstore.as_retriever()
  print("got retriever")
  template = """Use the following pieces of context to answer the question at the end. 
  If you don't know the answer, just say that you don't know, don't try to make up an answer. 
  Use three sentences maximum and keep the answer as concise as possible. 
  Always say "thanks for asking!" at the end of the answer. 
  {context}
  Question: {question}
  Helpful Answer:"""
  QA_CHAIN_PROMPT = PromptTemplate.from_template(template)
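  # Llama 2 13B chat served via Replicate; generation parameters are passed as `input`.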
  llm = Replicate(
      model="a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5",
      input={"temperature": 0.75, "max_length": 500, "top_p": 1},
  )
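  # Combine the LLM, retriever, and custom prompt into a RetrievalQA chain.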
  qa_chain = RetrievalQA.from_chain_type(
      llm,
      retriever=retriever,
      chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
  )

  return qa_chain

def generate_response(topic, query_chain):
  # Run the QA chain on the user's question and render the answer in the app.
  result = query_chain({"query": topic})
  st.info(result['result'])

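# Build (or fetch from the cache) the chain at app start-up.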
query_chain = get_query_chain()

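# Simple form: the user submits a question and the chain answers it.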
with st.form('myform'):
  topic_text = st.text_input('Enter keyword:', '')
  submitted = st.form_submit_button('Submit')
  if submitted:
    generate_response(topic_text, query_chain)