# -*- coding: utf-8 -*-
"""Citation.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/11coAx2hnXJPn0vGl9_cNPhMtmUzCrIbS

Citation-aware onboarding chatbot for Tufts Medicine: loads scraped
Workday/address pages and PDF handbooks, indexes them with llama_index,
and serves a Gradio chat UI that appends the source URL to every answer.
"""

# --- imports (consolidated; the notebook originally scattered/duplicated these) ---
import io
import os

import gradio as gr
import openai
import pandas as pd
import PyPDF2
import requests
from langchain.chat_models import ChatOpenAI
from llama_index import Document, LLMPredictor, ServiceContext, VectorStoreIndex
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index.memory import ChatMemoryBuffer
from llama_index.node_parser import SimpleNodeParser

# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
# Scraped Workday onboarding pages.
onb = pd.read_excel('All_workday (1).xlsx')

# bycels.mobi PDF URLs; indexed by URL, 'Text' column is filled in below.
bycels = pd.read_excel('TMC_bycels.xlsx')
bycels = bycels.set_index('URL')
bycels['Text'] = ''

# Workday URLs and page titles (used to attach a title to each document).
scrape_links = pd.read_excel('TMC_scrape_links.xlsx')

# Tufts Medicine addresses / directions pages.
adrs = pd.read_excel('directions_full.xlsx')

# Combine the two scraped-page tables under a common 'link' column.
# Renamed from `all`, which shadowed the builtin of the same name.
all_docs = pd.concat([
    onb.rename(columns={"web-scraper-start-url": 'link'}),
    adrs.rename(columns={'web-scraper-start-url': 'link'}),
])

# ---------------------------------------------------------------------------
# Download each PDF and extract its text
# ---------------------------------------------------------------------------
for url in bycels.index:
    # Timeout added so one dead link cannot hang the whole build.
    response = requests.get(url, timeout=30)
    pdf = PyPDF2.PdfReader(io.BytesIO(response.content))
    text = "\n".join(page.extract_text() for page in pdf.pages)
    # BUGFIX: was `bycels.loc[url]['Text'] = text` — chained indexing
    # assigns to a temporary copy, so the extracted text was silently lost.
    bycels.loc[url, 'Text'] = text

# ---------------------------------------------------------------------------
# Parameters
# ---------------------------------------------------------------------------
chunk_sizes = 512      # token chunk size for the node parser
chunk_overlaps = 64    # token overlap between consecutive chunks

# System prompt for the chat engine. The original literal contained a raw
# newline inside single quotes (a SyntaxError as flattened); the text itself
# is preserved exactly, including the embedded newline.
system_message = (
    'You are a Chatbot that helps new employees with onboarding tasks like '
    'setting up a direct deposit, logging into workday. Please give many '
    'precise and detailed instructions which answer the most recent question '
    'directly and never under any circumstance provide links or URLs. \n'
    'If the questions is not directly answered in the sources say: '
    '"Sorry I dont know the answer to that" and do not agree with the '
    'question unless there is direct evidence'
)


# ---------------------------------------------------------------------------
# Build llama_index documents
# ---------------------------------------------------------------------------
def title(t):
    """Return the page title for one ``(index, row)`` pair from ``all_docs``.

    Looks the row's 'link' up in ``scrape_links`` and returns the matching
    title, or the placeholder 'aghag' when no link matches.
    NOTE: the column name 'title ' (trailing space) matches the spreadsheet.
    """
    matches = list(scrape_links[t[1]['link'] == scrape_links['Link']]['title '])
    if len(matches) > 0:
        return matches[0]
    else:
        return 'aghag'


# One Document per scraped web page, with title and URL as metadata.
documents = [
    Document(
        text=t[1]['text'],
        metadata={"file_name": title(t), "URL": t[1]['link']},
    )
    for t in all_docs.iterrows()
]

# One Document per downloaded PDF (the index value t[0] is the URL).
pdfs = [
    Document(
        text=t[1]['Text'],
        metadata={"file_name": t[1]['Title'], "URL": t[0]},
    )
    for t in bycels.iterrows()
]

# ---------------------------------------------------------------------------
# Index + chat engine
# ---------------------------------------------------------------------------
openai.api_key = os.getenv("OpenAI_API_Key")

llm_predictor = LLMPredictor(
    llm=ChatOpenAI(temperature=0, model_name="gpt-4", streaming=True))

service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    node_parser=SimpleNodeParser(
        text_splitter=TokenTextSplitter(
            chunk_size=chunk_sizes, chunk_overlap=chunk_overlaps)))

nodes = service_context.node_parser.get_nodes_from_documents(documents + pdfs)
index2 = VectorStoreIndex(nodes)

chat_engine = index2.as_chat_engine(
    chat_mode="context",
    memory=ChatMemoryBuffer.from_defaults(token_limit=1500),
    system_prompt=system_message,
)


def ret_chat(user_input):
    """Ask the chat engine and return the answer with its source URL appended.

    Raises ValueError (via ``str.index``) if the top retrieved chunk does
    not contain a 'URL' marker followed by a blank line.
    """
    response = chat_engine.chat(user_input)
    source_text = response.sources[0].content
    url_and_rest = source_text[source_text.index('URL'):]
    just_url = url_and_rest[:url_and_rest.index('\n\n')]
    return str(response.response) + '\n\n Source: ' + str(just_url)


# ---------------------------------------------------------------------------
# Gradio UI, version 1 (stateless prototype — kept for parity with the
# notebook; launch() blocks until stopped, then the second UI below runs)
# ---------------------------------------------------------------------------
chat_history = []


def chat_interface(user_input):
    # Does not persist history: returns a fresh list each call.
    response = ret_chat(user_input)
    return chat_history + [(user_input, response)]


interface = gr.Interface(
    fn=chat_interface,
    inputs=gr.Textbox(label="How can you help me with Onboarding?"),
    outputs=gr.Chatbot(),
    title="Tufts Medicine Onboarding Helper",
    description="Onboarding Bot",
)
interface.launch(debug=True)

# ---------------------------------------------------------------------------
# Gradio UI, version 2 (accumulating history + basic auth)
# ---------------------------------------------------------------------------
chat_history = []


def chat_interface(user_input):
    global chat_history
    # BUGFIX: original called undefined `chat_gast(chat_history, user_input)`
    # (NameError on every message); route through ret_chat like version 1.
    response = ret_chat(user_input)
    chat_history.append((user_input, response))
    return chat_history


interface = gr.Interface(
    fn=chat_interface,
    inputs=gr.Textbox(label="How can you help me with Onboarding?"),
    outputs=gr.Chatbot(),
    live=True,
    title="Tufts Medicine Onboarding Helper",
    description="Onboarding Bot",
)
interface.launch(debug=True, auth=('username', 'password'))

# Known hallucination notes from manual testing:
# - lied about DUO
# - lied about meal stipend
# - car permits
# - made up mileage reimbursement
# - made up tuition reimbursement
# - 40 minute meal break
# - workday platform