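# Study Assistant: a small retrieval-augmented QA app.
# Loads PDF/TXT files from ./study_materials, chunks and embeds them into a
# FAISS index, and answers questions with google/flan-t5-base via a Gradio
# chat interface.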
import os
import gradio as gr
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer

def load_documents(file_path="study_materials"):
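    """Load all PDF and TXT files from the study-materials folder as documents."""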
    documents = []
    # Create the folder on first run so a missing directory yields the
    # friendly "no study materials" error instead of a FileNotFoundError
    os.makedirs(file_path, exist_ok=True)
    for filename in os.listdir(file_path):
        path = os.path.join(file_path, filename)
        if filename.lower().endswith(".pdf"):
            loader = PyMuPDFLoader(path)
            documents.extend(loader.load())
        elif filename.lower().endswith(".txt"):
            loader = TextLoader(path)
            documents.extend(loader.load())
    return documents

def create_qa_system():
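    """Build the RetrievalQA chain: load docs, split, embed, index, then wire up the LLM."""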
    try:
        # Load documents
        documents = load_documents()
        if not documents:
            raise ValueError("📚 No study materials found")
        
        # Text splitting
        text_splitter = CharacterTextSplitter(
            chunk_size=1100,      # characters per chunk
            chunk_overlap=200,    # overlap so answers spanning chunks keep context
            separator="\n\n"      # prefer paragraph boundaries when splitting
        )
        texts = text_splitter.split_documents(documents)
        
        # Embeddings
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        
        # Vector store
        db = FAISS.from_documents(texts, embeddings)
        
        # LLM setup with proper LangChain wrapper
        tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
        pipe = pipeline(
            "text2text-generation",
            model="google/flan-t5-base",
            tokenizer=tokenizer,
            max_length=600,       # cap on generated sequence length
            temperature=0.7,      # mild sampling for less repetitive answers
            do_sample=True,
            top_k=50,
            device=-1             # CPU; set to 0 to use the first CUDA GPU
        )
        
        # Wrap pipeline in LangChain component
        llm = HuggingFacePipeline(pipeline=pipe)
        
        # Create QA chain
        return RetrievalQA.from_llm(
            llm=llm,
            retriever=db.as_retriever(search_kwargs={"k": 3}),  # top 3 chunks per query
            return_source_documents=True
        )
    except Exception as e:
        # gr.Error is meant for UI handlers; at startup a plain exception is clearer
        raise RuntimeError(f"Failed to initialize QA system: {e}") from e

# Initialize system
try:
    qa = create_qa_system()
except Exception as e:
    print(f"Startup failed: {str(e)}")
    raise

def ask_question(question, history):
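    """Gradio chat handler: answer the question and list the source files used."""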
    try:
        result = qa.invoke({"query": question})
        answer = result["result"]
        sources = list({doc.metadata.get("source", "unknown") for doc in result["source_documents"]})
        return f"{answer}\n\n📚 Sources: {', '.join(sources)}"
    except Exception as e:
        return f"Error: {str(e)[:150]}"

gr.ChatInterface(
    ask_question,
    title="Study Assistant",
    description="Place PDF/TXT files in the 'study_materials' folder and ask questions!",
    theme="soft"
).launch()