# -*- coding: utf-8 -*-
# NOTE: the original export contained stray "Spaces:" / "Runtime error"
# artifact lines here, left over from the notebook-to-text conversion.
"""Citation.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/11coAx2hnXJPn0vGl9_cNPhMtmUzCrIbS
"""
import pandas as pd

# --- Data loading -------------------------------------------------------
# Workday onboarding pages previously scraped to Excel.
onb = pd.read_excel('All_workday (1).xlsx')

# bycels.mobi PDF URLs; indexed by URL so extracted text can be stored per-URL.
bycels = pd.read_excel('TMC_bycels.xlsx')
bycels = bycels.set_index('URL')
bycels['Text'] = ''

# Workday URLs and page titles (used later to look up a title per link).
scrape_links = pd.read_excel('TMC_scrape_links.xlsx')

# Tufts Medicine addresses / directions pages.
adrs = pd.read_excel('directions_full.xlsx')

# Combine onboarding and address pages under a common 'link' column.
# NOTE(review): `all` shadows the builtin all(); later cells depend on this
# exact name, so it is kept as-is.
all = pd.concat([
    onb.rename(columns={'web-scraper-start-url': 'link'}),
    adrs.rename(columns={'web-scraper-start-url': 'link'}),
])
import requests | |
import io | |
import PyPDF2 | |
# Download each bycels.mobi PDF and store its extracted text in the 'Text'
# column of the `bycels` frame (which is indexed by URL).
for url in bycels.index:
    # Timeout added so one dead URL cannot hang the whole notebook.
    response = requests.get(url, timeout=30)
    pdf_io_bytes = io.BytesIO(response.content)
    pdf = PyPDF2.PdfReader(pdf_io_bytes)
    # Join the text of every page with newlines.
    text = "\n".join(page.extract_text() for page in pdf.pages)
    # BUG FIX: `bycels.loc[url]['Text'] = text` is chained indexing — it
    # assigns into a temporary copy, so the extracted text was never saved
    # back to the frame. A single .loc[row, col] assignment writes in place.
    bycels.loc[url, 'Text'] = text
# --- Parameters ---------------------------------------------------------
chunk_sizes = 512     # tokens per text chunk fed to the splitter
chunk_overlaps = 64   # token overlap between consecutive chunks

# System prompt for the onboarding chatbot (kept verbatim).
system_message = (
    'You are a Chatbot that helps new employees with onboarding tasks like '
    'setting up a direct deposit, logging into workday. Please give many '
    'precise and detailed instructions which answer the most recent question '
    'directly and never under any circumstance provide links or URLs. If the '
    'questions is not directly answered in the sources say: "Sorry I dont '
    'know the answer to that" and do not agree with the question unless '
    'there is direct evidence'
)
from llama_index import Document | |
from llama_index.node_parser import SimpleNodeParser | |
from llama_index import VectorStoreIndex | |
from llama_index import LLMPredictor, VectorStoreIndex, ServiceContext | |
from langchain import OpenAI | |
import os | |
import openai | |
from llama_index import Document, ListIndex | |
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter | |
from llama_index import VectorStoreIndex, ServiceContext, LLMPredictor | |
from llama_index.query_engine import PandasQueryEngine, RetrieverQueryEngine, CitationQueryEngine | |
from llama_index.retrievers import RecursiveRetriever | |
from llama_index.schema import IndexNode | |
from llama_index import VectorStoreIndex, SimpleDirectoryReader | |
from llama_index.indices.query.query_transform.base import HyDEQueryTransform | |
from llama_index.query_engine.transform_query_engine import TransformQueryEngine | |
from langchain.chat_models import ChatOpenAI | |
from llama_hub.file.pymu_pdf.base import PyMuPDFReader | |
from pathlib import Path | |
from typing import List | |
from pathlib import Path | |
import gradio as gr | |
def title(t, links=None):
    """Look up the page title for one scraped row.

    Parameters
    ----------
    t : tuple
        An ``(index, row)`` pair as yielded by ``DataFrame.iterrows()``;
        ``t[1]['link']`` must hold the page URL.
    links : pandas.DataFrame, optional
        Lookup table with columns 'Link' and 'title ' (the trailing space in
        'title ' matches the scraped header). Defaults to the module-level
        ``scrape_links`` frame, preserving the original call signature.

    Returns
    -------
    str
        The first matching title, or the placeholder string 'aghag' when no
        row of `links` matches the URL.
    """
    if links is None:
        links = scrape_links
    matches = list(links[t[1]['link'] == links['Link']]['title '])
    # Truthiness check replaces `len(matches) > 0`.
    return matches[0] if matches else 'aghag'
# Wrap each scraped web page as a llama_index Document, tagging it with the
# resolved page title and its source URL (used later for citations).
documents = []
for idx, row in all.iterrows():
    documents.append(Document(
        text=row['text'],
        metadata={
            "file_name": title((idx, row)),
            "URL": row['link'],
        },
    ))

# Same wrapping for the extracted PDF texts; the frame index is the URL.
pdfs = []
for url, row in bycels.iterrows():
    pdfs.append(Document(
        text=row['Text'],
        metadata={
            "file_name": row['Title'],
            "URL": url,
        },
    ))
#documents = [Document(text=t) for t in fin]
# --- LLM / index construction ------------------------------------------
openai.api_key = os.getenv("OpenAI_API_Key")

# GPT-4, deterministic output (temperature 0), streamed responses.
llm_predictor = LLMPredictor(
    llm=ChatOpenAI(temperature=0, model_name="gpt-4", streaming=True),
)

# Split documents into overlapping token chunks before embedding.
splitter = TokenTextSplitter(chunk_size=chunk_sizes, chunk_overlap=chunk_overlaps)
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    node_parser=SimpleNodeParser(text_splitter=splitter),
)

# Parse web pages and PDFs into nodes and build the vector index over them.
nodes = service_context.node_parser.get_nodes_from_documents(documents + pdfs)
index2 = VectorStoreIndex(nodes)
from llama_index.memory import ChatMemoryBuffer

# Context-mode chat engine: retrieved chunks are injected into the prompt
# context, and a 1500-token rolling buffer retains recent conversation turns.
memory_buffer = ChatMemoryBuffer.from_defaults(token_limit=1500)
chat_engine = index2.as_chat_engine(
    chat_mode="context",
    memory=memory_buffer,
    system_prompt=system_message,
)
def ret_chat(user_input):
    """Ask the chat engine one question and return the answer with a source.

    Parameters
    ----------
    user_input : str
        The user's question.

    Returns
    -------
    str
        The model's answer followed by a ``Source:`` line containing the
        'URL ...' fragment extracted from the first retrieved chunk's text,
        or an empty source when none can be found.
    """
    response = chat_engine.chat(user_input)
    answer = str(response.response)

    # BUG FIX: the original did `sources[0]`, `.index('URL')` and
    # `.index('\n\n')` unconditionally, raising IndexError/ValueError whenever
    # there were no sources or the chunk text lacked those markers. Degrade
    # gracefully to an empty source instead of crashing the UI callback.
    just_url = ''
    if response.sources:
        source_text = response.sources[0].content
        start = source_text.find('URL')
        if start != -1:
            url_and_rest = source_text[start:]
            end = url_and_rest.find('\n\n')
            just_url = url_and_rest if end == -1 else url_and_rest[:end]
    return answer + '\n\n Source: ' + str(just_url)
chat_history = []

def chat_interface(user_input):
    """Gradio callback: answer `user_input` and return the full transcript.

    BUG FIX: the previous version returned ``chat_history + [(...)]`` without
    ever appending, so the module-level ``chat_history`` stayed empty and the
    displayed transcript only ever showed the current exchange. Append each
    turn, matching the second interface defined later in this file.
    """
    global chat_history
    response = ret_chat(user_input)
    chat_history.append((user_input, response))
    return chat_history
# First Gradio UI: question box in, chatbot transcript out.
interface = gr.Interface(
    fn=chat_interface,
    inputs=gr.Textbox(label="How can you help me with Onboarding?"),
    outputs=gr.Chatbot(),
    title="Tufts Medicine Onboarding Helper",
    description="Onboarding Bot",
)
interface.launch(debug=True)
import gradio as gr

chat_history = []

def chat_interface(user_input):
    """Gradio callback: answer `user_input`, record the turn, return history."""
    global chat_history
    # BUG FIX: the original called `chat_gast(chat_history, user_input)`,
    # which is defined nowhere in this file and raised NameError on the first
    # message. `ret_chat` is the chat helper defined above; presumably that
    # was the intended call — confirm against the notebook history.
    response = ret_chat(user_input)
    chat_history.append((user_input, response))
    return chat_history

# Second (live) UI, password-protected.
# NOTE(review): replace the hard-coded ('username', 'password') credentials
# before any real deployment.
interface = gr.Interface(
    fn=chat_interface,
    inputs=gr.Textbox(label="How can you help me with Onboarding?"),
    outputs=gr.Chatbot(),
    live=True,
    title="Tufts Medicine Onboarding Helper",
    description="Onboarding Bot",
)
interface.launch(debug=True, auth=('username', 'password'))
# Observed model failure modes during testing:
# - lied about DUO
# - lied about meal stipend
# - car permits
# - made up mileage reimbursement
# - made up tuition reimbursement
# - 40 minute meal break
# - Workday platform