import logging
import sys

import gradio as gr
import torch
from dotenv import find_dotenv, load_dotenv
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.prompts.prompts import SimpleInputPrompt
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Optional: load environment variables (e.g. API keys) from a .env file instead of
# hard-coding secrets in the script. No OpenAI key is required here, because a local
# LLM and embedding model are configured below.
load_dotenv(find_dotenv())

# Load the source documents the assistant will answer questions about.
reader = SimpleDirectoryReader(
    input_files=[
        "[The Oxford Series in Electrical and Computer Engineering] Ding, Zhi; Lathi, Bhagwandas Pannalal - Modern digital and analog communication systems (2018;2019, Oxford University Press, USA)(Z-Lib.io).pdf",
        "Lin_K_IT data.pdf",
        "DR.Hadeer Info.pdf",
    ]
)
docs = reader.load_data()

system_prompt = """You are a communications engineering assistant. Your goal is to \
answer questions as accurately as possible based on the instructions and context \
provided. Use all relevant and necessary information provided, and write a minimum \
of 200 words."""

# This wraps the default prompts that are internal to llama-index.
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

llm = HuggingFaceLLM(
    context_window=2048,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.3, "do_sample": True},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    # bfloat16 reduces memory usage; remove this if your hardware lacks bfloat16 support
    model_kwargs={"torch_dtype": torch.bfloat16},
)

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Configure the global Settings (the ServiceContext API is deprecated) *before*
# building the index, so the local LLM and embedding model are actually used.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
Settings.num_output = 256  # matches max_new_tokens above
Settings.context_window = 2048

# Build the vector index, a standalone retriever, and the query engine.
index = VectorStoreIndex.from_documents(docs)
retriever = index.as_retriever(similarity_top_k=10)
query_engine = index.as_query_engine()


def predict(input, history):
    """Answer a chat message by querying the document index."""
    response = query_engine.query(input)
    return str(response)


gr.ChatInterface(predict).launch(share=True)
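# Quick sanity check (hypothetical example question): before relying on the Gradio UI,
# you can verify the pipeline end-to-end by running, e.g.:
#
#     print(query_engine.query("Summarize the Nyquist sampling criterion."))
#
# If this returns a sensible answer, the index, embeddings, and local LLM are wired
# together correctly.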