# -*- coding: utf-8 -*-
"""Citation.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/11coAx2hnXJPn0vGl9_cNPhMtmUzCrIbS

Citation-aware onboarding chatbot for Tufts Medicine: loads scraped
Workday/address pages and PDF handbooks, indexes them with llama_index,
and serves a Gradio chat UI that appends the source URL to every answer.
"""

# --- imports (consolidated; the notebook originally scattered/duplicated these) ---
import io
import os

import gradio as gr
import openai
import pandas as pd
import PyPDF2
import requests
from langchain.chat_models import ChatOpenAI
from llama_index import Document, LLMPredictor, ServiceContext, VectorStoreIndex
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index.memory import ChatMemoryBuffer
from llama_index.node_parser import SimpleNodeParser

# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
# Scraped Workday onboarding pages.
onb = pd.read_excel('All_workday (1).xlsx')

# bycels.mobi PDF URLs; indexed by URL, 'Text' column is filled in below.
bycels = pd.read_excel('TMC_bycels.xlsx')
bycels = bycels.set_index('URL')
bycels['Text'] = ''

# Workday URLs and page titles (used to attach a title to each document).
scrape_links = pd.read_excel('TMC_scrape_links.xlsx')

# Tufts Medicine addresses / directions pages.
adrs = pd.read_excel('directions_full.xlsx')

# Combine the two scraped-page tables under a common 'link' column.
# Renamed from `all`, which shadowed the builtin of the same name.
all_docs = pd.concat([
    onb.rename(columns={"web-scraper-start-url": 'link'}),
    adrs.rename(columns={'web-scraper-start-url': 'link'}),
])

# ---------------------------------------------------------------------------
# Download each PDF and extract its text
# ---------------------------------------------------------------------------
for url in bycels.index:
    # Timeout added so one dead link cannot hang the whole build.
    response = requests.get(url, timeout=30)
    pdf = PyPDF2.PdfReader(io.BytesIO(response.content))
    text = "\n".join(page.extract_text() for page in pdf.pages)
    # BUGFIX: was `bycels.loc[url]['Text'] = text` — chained indexing
    # assigns to a temporary copy, so the extracted text was silently lost.
    bycels.loc[url, 'Text'] = text

# ---------------------------------------------------------------------------
# Parameters
# ---------------------------------------------------------------------------
chunk_sizes = 512      # token chunk size for the node parser
chunk_overlaps = 64    # token overlap between consecutive chunks

# System prompt for the chat engine. The original literal contained a raw
# newline inside single quotes (a SyntaxError as flattened); the text itself
# is preserved exactly, including the embedded newline.
system_message = (
    'You are a Chatbot that helps new employees with onboarding tasks like '
    'setting up a direct deposit, logging into workday. Please give many '
    'precise and detailed instructions which answer the most recent question '
    'directly and never under any circumstance provide links or URLs. \n'
    'If the questions is not directly answered in the sources say: '
    '"Sorry I dont know the answer to that" and do not agree with the '
    'question unless there is direct evidence'
)


# ---------------------------------------------------------------------------
# Build llama_index documents
# ---------------------------------------------------------------------------
def title(t):
    """Return the page title for one ``(index, row)`` pair from ``all_docs``.

    Looks the row's 'link' up in ``scrape_links`` and returns the matching
    title, or the placeholder 'aghag' when no link matches.
    NOTE: the column name 'title ' (trailing space) matches the spreadsheet.
    """
    matches = list(scrape_links[t[1]['link'] == scrape_links['Link']]['title '])
    if len(matches) > 0:
        return matches[0]
    else:
        return 'aghag'


# One Document per scraped web page, with title and URL as metadata.
documents = [
    Document(
        text=t[1]['text'],
        metadata={"file_name": title(t), "URL": t[1]['link']},
    )
    for t in all_docs.iterrows()
]

# One Document per downloaded PDF (the index value t[0] is the URL).
pdfs = [
    Document(
        text=t[1]['Text'],
        metadata={"file_name": t[1]['Title'], "URL": t[0]},
    )
    for t in bycels.iterrows()
]

# ---------------------------------------------------------------------------
# Index + chat engine
# ---------------------------------------------------------------------------
openai.api_key = os.getenv("OpenAI_API_Key")

llm_predictor = LLMPredictor(
    llm=ChatOpenAI(temperature=0, model_name="gpt-4", streaming=True))

service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    node_parser=SimpleNodeParser(
        text_splitter=TokenTextSplitter(
            chunk_size=chunk_sizes, chunk_overlap=chunk_overlaps)))

nodes = service_context.node_parser.get_nodes_from_documents(documents + pdfs)
index2 = VectorStoreIndex(nodes)

chat_engine = index2.as_chat_engine(
    chat_mode="context",
    memory=ChatMemoryBuffer.from_defaults(token_limit=1500),
    system_prompt=system_message,
)


def ret_chat(user_input):
    """Ask the chat engine and return the answer with its source URL appended.

    Raises ValueError (via ``str.index``) if the top retrieved chunk does
    not contain a 'URL' marker followed by a blank line.
    """
    response = chat_engine.chat(user_input)
    source_text = response.sources[0].content
    url_and_rest = source_text[source_text.index('URL'):]
    just_url = url_and_rest[:url_and_rest.index('\n\n')]
    return str(response.response) + '\n\n Source: ' + str(just_url)


# ---------------------------------------------------------------------------
# Gradio UI, version 1 (stateless prototype — kept for parity with the
# notebook; launch() blocks until stopped, then the second UI below runs)
# ---------------------------------------------------------------------------
chat_history = []


def chat_interface(user_input):
    # Does not persist history: returns a fresh list each call.
    response = ret_chat(user_input)
    return chat_history + [(user_input, response)]


interface = gr.Interface(
    fn=chat_interface,
    inputs=gr.Textbox(label="How can you help me with Onboarding?"),
    outputs=gr.Chatbot(),
    title="Tufts Medicine Onboarding Helper",
    description="Onboarding Bot",
)
interface.launch(debug=True)

# ---------------------------------------------------------------------------
# Gradio UI, version 2 (accumulating history + basic auth)
# ---------------------------------------------------------------------------
chat_history = []


def chat_interface(user_input):
    global chat_history
    # BUGFIX: original called undefined `chat_gast(chat_history, user_input)`
    # (NameError on every message); route through ret_chat like version 1.
    response = ret_chat(user_input)
    chat_history.append((user_input, response))
    return chat_history


interface = gr.Interface(
    fn=chat_interface,
    inputs=gr.Textbox(label="How can you help me with Onboarding?"),
    outputs=gr.Chatbot(),
    live=True,
    title="Tufts Medicine Onboarding Helper",
    description="Onboarding Bot",
)
interface.launch(debug=True, auth=('username', 'password'))

# Known hallucination notes from manual testing:
# - lied about DUO
# - lied about meal stipend
# - car permits
# - made up mileage reimbursement
# - made up tuition reimbursement
# - 40 minute meal break
# - workday platform