Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import UnstructuredPDFLoader, TextLoader # type: ignore | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_cohere import ChatCohere | |
| from langchain_core.messages import HumanMessage | |
| import dotenv | |
| from langchain_core.output_parsers import StrOutputParser | |
| # from langchain_community.vectorstores import Chroma | |
| from langchain.schema.runnable import RunnablePassthrough | |
| from langchain_cohere import CohereEmbeddings | |
| from langchain_core.prompts import PromptTemplate, ChatPromptTemplate | |
| from langchain.memory.summary_buffer import ConversationSummaryBufferMemory | |
| from langchain.chains import ConversationChain | |
| from langchain_core.prompts.chat import MessagesPlaceholder | |
| from langchain.agents import AgentExecutor, create_tool_calling_agent | |
| import os | |
| #from langchain_community.utilities import GoogleSearchAPIWrapper | |
| from langchain_core.tools import Tool | |
| from langchain_google_community import GoogleSearchAPIWrapper | |
| dotenv.load_dotenv() | |
| #file_path = ( "/home/hubsnippet/Downloads/papers/2205.11916v4.pdf") | |
| #load the file to memory | |
| #loader = PyPDFLoader(file_path) | |
| #load the file content to data variable | |
| #data = loader.load_and_split() | |
| # embed the file data in a vector store | |
| #print(data[0]) | |
def parse_document(docs: str, question: str) -> str:
    """Return the chunk of ``docs`` that best matches ``question``.

    The text is split into overlapping chunks, the chunks are embedded with
    Cohere and indexed in an in-memory FAISS store, and the single closest
    chunk to ``question`` is returned.

    Args:
        docs: Raw document text to search.
        question: Query used to rank the chunks.

    Returns:
        The text of the best-matching chunk, or an empty string when
        ``docs`` is empty / whitespace-only or nothing is retrieved.
    """
    # Nothing to index: return before building embeddings or an index.
    if not docs or not docs.strip():
        return ""

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    )
    chunks = text_splitter.split_text(docs)
    if not chunks:
        return ""

    embeddings = CohereEmbeddings(model="embed-english-light-v3.0")
    faiss_vs = FAISS.from_texts(chunks, embeddings)

    # Plain top-1 similarity retrieval; no chat model is involved in this
    # step, so none is constructed here.
    retriever = faiss_vs.as_retriever(search_kwargs={"k": 1})
    matches = retriever.invoke(question)
    return matches[0].page_content if matches else ""
# API credentials are read from the process environment; a missing variable
# yields None.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
GOOGLE_CSE_ID = os.environ.get("GOOGLE_CSE_ID")
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")

# Chat model shared by the search agent defined further down.
llm = ChatCohere(
    cohere_api_key=COHERE_API_KEY,
    model="command-r-plus",
)
# Message list for answering strictly from one documentation URL.
# Template variables: `url`, `question`, `agent_scratchpad`.
# The system text is built from adjacent literals (each ending in a space)
# instead of backslash continuations, so source indentation never leaks into
# the prompt.
prompt_template = [
    (
        "system",
        "You are a search engine for a corpus of documentation. "
        "you will be provided with a url {url} "
        "the url is your only source of information, so you should search the url pages by key words "
        "you should only ground your responses with the url. "
        "If {question} has no related content from the url, simply respond 'no related content to your question'",
    ),
    ("human", "{question}"),
    ("placeholder", "{agent_scratchpad}"),
    ("ai", ""),
]

# Prompt for the Google-search agent.
# Template variables: `question`, `agent_scratchpad`.
# Note the trailing space after the first sentence: adjacent string literals
# are concatenated verbatim, so without it the sentences run together.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful virtual assistant. "
            "You should only use the google_search_name agent tool to search for information when necessary.",
        ),
        ("human", "{question}"),
        ("placeholder", "{agent_scratchpad}"),
    ]
)

# Chat prompt built from the URL-grounded message list above.
prompt_text = ChatPromptTemplate.from_messages(prompt_template)
# Google search wrapper configured with the API key and search-engine id.
search = GoogleSearchAPIWrapper(
    google_cse_id=GOOGLE_CSE_ID,
    google_api_key=GOOGLE_API_KEY,
)

# Expose the search wrapper to the agent as a named tool.
tool = Tool(
    func=search.run,
    name="google_search_name",
    description="The model should use this tool when it needs more information from the internet.",
)

# Tool-calling agent built from the chat model, the search tool and `prompt`.
agent = create_tool_calling_agent(llm=llm, prompt=prompt, tools=[tool])

# Executor that runs the agent with the search tool, without verbose logging.
agent_executor = AgentExecutor(tools=[tool], agent=agent, verbose=False)
def parse_url(question: str) -> dict:
    """Run the search agent on ``question`` and return its result.

    Args:
        question: The user's question; bound to the prompt's ``{question}``
            variable.

    Returns:
        Whatever ``agent_executor.invoke`` returns, unmodified.
        NOTE(review): for a LangChain ``AgentExecutor`` this is a mapping
        (the answer text is expected under ``"output"``), not a plain
        string; confirm how callers consume it.
    """
    return agent_executor.invoke(input={"question": question})
| # message = HumanMessage(content="inurl: https://learn.microsoft.com 'what are cloud security best practices'") | |
| # print(parse_url(message)) | |