Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files- app.py +102 -0
- data/saved_pdf.pdf +0 -0
- db/default__vector_store.json +1 -0
- db/docstore.json +1 -0
- db/graph_store.json +1 -0
- db/image__vector_store.json +1 -0
- db/index_store.json +1 -0
- requirements.txt +5 -0
app.py
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from llama_index.core import StorageContext, load_index_from_storage, VectorStoreIndex, SimpleDirectoryReader, ChatPromptTemplate
|
3 |
+
from llama_index.llms.huggingface import HuggingFaceInferenceAPI
|
4 |
+
from dotenv import load_dotenv
|
5 |
+
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
6 |
+
from llama_index.core import Settings
|
7 |
+
import os
|
8 |
+
import base64
|
9 |
+
import altair as alt
|
10 |
+
|
11 |
+
# Load variables from a local .env file into the process environment
# (HF_TOKEN is read from the environment just below).
load_dotenv()

# Global LlamaIndex settings: the LLM used to answer queries.
Settings.llm = HuggingFaceInferenceAPI(
    token=os.getenv("HF_TOKEN"),
    model_name="google/gemma-1.1-7b-it",
    tokenizer_name="google/gemma-1.1-7b-it",
    context_window=3000,
    max_new_tokens=512,
    generate_kwargs={"temperature": 0.1},
)

# Global LlamaIndex settings: the embedding model.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Where the persisted index lives and where uploaded documents are stored.
PERSIST_DIR = "./db"
DATA_DIR = "data"

# Make sure both directories exist before the app reads from or writes to them.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(PERSIST_DIR, exist_ok=True)
|
34 |
+
|
35 |
+
def displayPDF(file):
    """Render the PDF stored at ``file`` inline in the Streamlit page.

    The file is read in binary mode, base64-encoded, and embedded in an
    ``<iframe>`` through a ``data:application/pdf`` URI. The markup is
    emitted with ``st.markdown`` and raw HTML enabled.

    Args:
        file: Path of the PDF file to show.
    """
    with open(file, "rb") as pdf_handle:
        raw_bytes = pdf_handle.read()

    encoded = base64.b64encode(raw_bytes).decode('utf-8')
    iframe_html = f'<iframe src="data:application/pdf;base64,{encoded}" width="100%" height="600" type="application/pdf"></iframe>'
    st.markdown(iframe_html, unsafe_allow_html=True)
|
40 |
+
|
41 |
+
def data_ingestion():
    """Index every document found in ``DATA_DIR`` and persist the result.

    The documents are loaded with ``SimpleDirectoryReader``, turned into a
    ``VectorStoreIndex``, and the index's own storage context is written to
    ``PERSIST_DIR`` so that queries can reload it later.
    """
    documents = SimpleDirectoryReader(DATA_DIR).load_data()
    # The index creates its own storage context; no separate one is needed.
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir=PERSIST_DIR)
|
46 |
+
|
47 |
+
def handle_query(query):
    """Answer ``query`` using the index persisted in ``PERSIST_DIR``.

    The index is reloaded from disk on each call, a single-message chat
    prompt template is built, and the query is run through a query engine
    created from the index.

    Args:
        query: The user's question.

    Returns:
        The ``response`` attribute (or ``'response'`` key) of the query
        result when present, otherwise a fixed apology string.
    """
    persisted = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    loaded_index = load_index_from_storage(persisted)

    # {context_str} and {query_str} are left as placeholders in the template.
    prompt_body = """created by vivek created for Neonflake Enterprises OPC Pvt Ltd
Context:
{context_str}
Question:
{query_str}
"""
    qa_template = ChatPromptTemplate.from_messages([("user", prompt_body)])

    engine = loaded_index.as_query_engine(text_qa_template=qa_template)
    result = engine.query(query)

    if hasattr(result, 'response'):
        return result.response
    if isinstance(result, dict) and 'response' in result:
        return result['response']
    return "Sorry, I couldn't find an answer."
|
72 |
+
|
73 |
+
|
74 |
+
# Streamlit app initialization
st.title("Chat with your PDF📄")
st.markdown("Built by [vivek](https://github.com/saravivek-cyber)")
st.markdown("chat here")

# Seed the conversation with an assistant greeting the first time the
# session state has no message history.
if 'messages' not in st.session_state:
    st.session_state.messages = [{'role': 'assistant', "content": 'Hello! Upload a PDF and ask me anything about its content.'}]

with st.sidebar:
    st.title("Menu:")
    uploaded_file = st.file_uploader("Upload your PDF Files and Click on the Submit & Process Button")
    if st.button("Submit & Process"):
        if uploaded_file is None:
            # No file has been chosen yet; without this guard the
            # getbuffer() call below would raise AttributeError on None.
            st.warning("Please upload a PDF before clicking Submit & Process.")
        else:
            with st.spinner("Processing..."):
                # Always written under the same name, so a new upload
                # replaces the previous file.
                filepath = os.path.join(DATA_DIR, "saved_pdf.pdf")
                with open(filepath, "wb") as f:
                    f.write(uploaded_file.getbuffer())
                data_ingestion()  # Rebuild the index every time a new file is uploaded
                st.success("Done")

user_prompt = st.chat_input("Ask me anything about the content of the PDF:")
if user_prompt:
    # Record the question, answer it, then record the answer so that both
    # are rendered by the loop below.
    st.session_state.messages.append({'role': 'user', "content": user_prompt})
    response = handle_query(user_prompt)
    st.session_state.messages.append({'role': 'assistant', "content": response})

# Render the stored conversation in order.
for message in st.session_state.messages:
    with st.chat_message(message['role']):
        st.write(message['content'])
|
data/saved_pdf.pdf
ADDED
Binary file (64.9 kB). View file
|
|
db/default__vector_store.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"embedding_dict": {"860863cb-ba90-4287-9ff3-de812a7cf04a": [-0.04935232177376747, 0.014276032336056232, -0.00677516171708703, -0.007499701343476772, 0.0857427641749382, 0.03596746549010277, -0.06747723370790482, 0.035292088985443115, 0.0009802288841456175, 0.011373339220881462, 0.024754010140895844, -0.03062259964644909, 0.011178512126207352, 0.01658494584262371, -0.025193003937602043, 0.007735987193882465, -0.026199763640761375, -0.004099538084119558, 0.03002157434821129, 0.018483860418200493, 0.008464190177619457, -0.03511177375912666, 0.021300099790096283, 0.045198313891887665, 0.02103796787559986, 0.04658903181552887, -0.03597500920295715, -0.07936180382966995, -0.048395734280347824, -0.23985832929611206, 0.05988171696662903, 0.00787472166121006, 0.06542612612247467, 0.016137035563588142, -0.029961371794342995, 0.016481216996908188, -0.020172743126749992, 0.05845439434051514, 0.029444292187690735, 0.0327472984790802, -0.060793109238147736, -0.021447496488690376, -0.010194200091063976, -0.03895203024148941, 0.002724932273849845, -0.04428384453058243, -0.04992429539561272, -0.020091135054826736, -0.008397594094276428, 0.006678513251245022, -0.04897148534655571, -0.060120902955532074, 0.00495715020224452, 0.031236156821250916, -0.024438021704554558, -0.001887921360321343, 0.08040959388017654, 0.0212861318141222, 0.019926222041249275, 0.041057415306568146, 0.0645257979631424, 0.029546035453677177, -0.13904771208763123, 0.01744740828871727, 0.06266944855451584, 0.041324276477098465, -0.0977046936750412, -0.042026087641716, -0.02397708222270012, 0.000938264187425375, -0.031055675819516182, 0.022282985970377922, 0.039871398359537125, 0.011208692565560341, 0.059094201773405075, 0.01761738955974579, 0.03747517243027687, 0.002703616861253977, 0.015252131968736649, -0.0339764729142189, 0.034126173704862595, 0.05502382293343544, 0.020618971437215805, -0.030696270987391472, -0.013595390133559704, -0.02856854349374771, 0.0403427854180336, -0.02784808911383152, 
-0.004025080241262913, -0.011623065918684006, -0.023647073656320572, -0.021383848041296005, -0.03101491369307041, 0.05079283565282822, -0.04112066701054573, -0.03585713356733322, -0.02295582741498947, 0.0062613822519779205, 0.02795836143195629, 0.45919111371040344, 0.015642572194337845, 0.010819172486662865, -0.018927421420812607, 0.05333065986633301, -0.013381321914494038, -0.06688794493675232, 0.004536801483482122, -0.02615642547607422, -0.024168578907847404, -0.004391009919345379, 0.02736389823257923, 0.02036883682012558, -0.040253669023513794, -0.03790125250816345, 0.007962658070027828, -0.0071958815678954124, 0.049702417105436325, -0.01769251562654972, -0.007585907820612192, -0.012356655672192574, 0.008346965536475182, -0.025942832231521606, 0.030699746683239937, -0.031913790851831436, 0.05911710113286972, -0.05253056809306145, 0.03923376277089119, 0.08774489164352417, 0.038656190037727356, -0.001138997613452375, 0.027105676010251045, 0.027531057596206665, -0.08285865932703018, 0.01835499331355095, -0.00720310490578413, -0.030862700194120407, -0.0021355219651013613, -0.016528071835637093, 0.007597050163894892, 0.02883450873196125, -0.011319754645228386, 0.03828631341457367, -0.007711352314800024, -0.09212921559810638, -0.12980586290359497, 0.153176948428154, 0.011540068313479424, 0.03229355439543724, -0.036570385098457336, -0.05308893322944641, 0.04680844396352768, 0.07722964882850647, -0.019974086433649063, 0.01124553382396698, 0.03856383636593819, 0.031176432967185974, -0.019309746101498604, -0.023881902918219566, -0.020723534747958183, 0.03380262106657028, -0.018595557659864426, -0.030544748529791832, 0.021867908537387848, 0.10692565888166428, 0.01082476507872343, -0.032912179827690125, -0.007422571070492268, 0.044559549540281296, -0.02630840614438057, 0.006297847256064415, -0.005424784496426582, -0.043432921171188354, 0.018201762810349464, 0.016728423535823822, -0.03545304760336876, 0.036799076944589615, -0.00962867308408022, -0.01837957464158535, 
-0.005215011071413755, 0.006207386497408152, -0.003111738944426179, -0.014922741800546646, -0.07831963896751404, 0.011641087010502815, 0.03601595386862755, 0.01461033709347248, -0.018093876540660858, -0.01139074470847845, 0.004400675185024738, 0.06439895927906036, 0.043229930102825165, -0.05553846061229706, -0.007091710343956947, 0.03263641893863678, -0.01781558059155941, 0.010551278479397297, -0.010003015398979187, -0.005920502822846174, -0.0057300967164337635, -0.06755722314119339, 0.03704983368515968, 0.0756269097328186, 0.03213987126946449, 0.0392606295645237, -0.02188468724489212, 0.005344125907868147, -0.007023293059319258, 0.006461248733103275, 0.05830821394920349, 0.02524745464324951, -0.06875360757112503, 0.0077574849128723145, -0.008623662404716015, 0.017420368269085884, -0.014371651224792004, 0.01098068617284298, 0.015060738660395145, 0.0575602725148201, -0.015285676345229149, 0.055246952921152115, -0.0046562557108700275, -0.0018068264471367002, -0.017236808314919472, -0.3138227164745331, -0.03679979592561722, 0.025411024689674377, 0.04189690202474594, -0.01532608363777399, -0.0593300499022007, 0.007799589075148106, 0.017681816592812538, 0.004527967423200607, 0.014501117169857025, 0.03831024095416069, 0.0643090233206749, -0.06463051587343216, -0.06307888776063919, -0.018941743299365044, -0.014238502830266953, 0.007583525497466326, -0.017396165058016777, -0.032528795301914215, 0.04459691792726517, 0.016582539305090904, 0.002379573881626129, -0.0007301989244297147, -0.01501720491796732, 0.07269556820392609, -0.03588758409023285, 0.1032787337899208, -0.06350622326135635, 0.006171433720737696, 0.015278211794793606, -0.02456526644527912, 0.03500528261065483, -0.03471928462386131, -0.03898708149790764, 0.023004775866866112, -0.028884489089250565, -0.057739198207855225, -0.0035506936255842447, -0.019762180745601654, -0.039818767458200455, -0.020245185121893883, -0.001681070076301694, 0.03693488612771034, -0.08100935816764832, -0.03178160637617111, 
-0.030177531763911247, -0.020812533795833588, -0.014491626992821693, -0.01819487474858761, -0.00693318247795105, -0.025987470522522926, 0.0203871913254261, 0.03322084620594978, 0.007113815750926733, 0.01175146084278822, 0.011802028864622116, -0.04891163483262062, 0.03362104669213295, -0.07420346140861511, -0.07266935706138611, 0.00424754386767745, -0.05110837519168854, 0.03763461858034134, -0.0029107211157679558, -0.0017669596709311008, -0.009217919781804085, -0.0223727747797966, -0.02276541478931904, -0.005709769204258919, -0.018832406029105186, -0.018558891490101814, 0.09701454639434814, -3.134470171062276e-05, 0.013778149150311947, 0.025061175227165222, 0.06504049152135849, -0.0010301598813384771, -0.0897194966673851, -0.02937312051653862, -0.010941596701741219, 0.0018644781084731221, 0.05590954050421715, 0.026075152680277824, 0.05139530077576637, -0.010917403735220432, 0.00870157778263092, 0.015995193272829056, 0.006445177365094423, 0.04837393760681152, 0.025667952373623848, -0.008727463893592358, -0.029190072789788246, -0.018183276057243347, -0.0621129646897316, 0.020885098725557327, 0.06874728947877884, -0.23344504833221436, -0.02258511632680893, -0.01419254019856453, 0.10320448130369186, -0.005822952836751938, 0.0024899616837501526, 0.023092474788427353, -0.0038407070096582174, -0.013031963258981705, 0.03569323942065239, -0.049997638911008835, 0.029806140810251236, 0.009199211373925209, -0.058769576251506805, -0.03448537364602089, 0.04246056452393532, 0.07039237767457962, -0.04602818936109543, 0.06294197589159012, 0.004651046358048916, -0.001652638427913189, -0.03949356824159622, 0.12305214256048203, -0.02445746771991253, -0.04061185196042061, -0.0042878249660134315, 0.011829865165054798, -0.019710924476385117, 0.028901495039463043, 0.033081069588661194, -0.010725338943302631, -0.02745557762682438, 0.10162729769945145, 0.05656793341040611, 0.016987605020403862, 0.025119241327047348, 0.004949004389345646, -0.061499010771512985, 0.0024393266066908836, 
0.01115493569523096, 0.026186874136328697, -0.03403957560658455, 0.015666430816054344, 0.018578195944428444, 0.054081711918115616, 0.009812631644308567, -0.008632104843854904, -0.08555403351783752, 0.00032727871439419687, 0.013853251934051514, -0.04474908858537674, -0.022167515009641647, -0.008224789053201675, 0.01202460192143917, 0.05611058324575424, 0.009806507267057896, 0.03089478611946106, 0.008607840165495872, -0.023285187780857086, -0.045362312346696854, 0.031830571591854095, -0.02046814002096653, 0.01958327554166317, -0.022897496819496155, -0.02404063194990158]}, "text_id_to_ref_doc_id": {"860863cb-ba90-4287-9ff3-de812a7cf04a": "ac226b84-1585-4759-add3-dc5d0af6ef65"}, "metadata_dict": {"860863cb-ba90-4287-9ff3-de812a7cf04a": {"page_label": "1", "file_name": "saved_pdf.pdf", "file_path": "E:\\llama-index RAG\\data\\saved_pdf.pdf", "file_type": "application/pdf", "file_size": 64903, "creation_date": "2024-04-14", "last_modified_date": "2024-04-17", "_node_type": "TextNode", "document_id": "ac226b84-1585-4759-add3-dc5d0af6ef65", "doc_id": "ac226b84-1585-4759-add3-dc5d0af6ef65", "ref_doc_id": "ac226b84-1585-4759-add3-dc5d0af6ef65"}}}
|
db/docstore.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"docstore/metadata": {"ac226b84-1585-4759-add3-dc5d0af6ef65": {"doc_hash": "2c629fa1f2e1e85f17b7d012739aea7cba30cdd55f935ed2225710942132eabf"}, "860863cb-ba90-4287-9ff3-de812a7cf04a": {"doc_hash": "2c629fa1f2e1e85f17b7d012739aea7cba30cdd55f935ed2225710942132eabf", "ref_doc_id": "ac226b84-1585-4759-add3-dc5d0af6ef65"}}, "docstore/data": {"860863cb-ba90-4287-9ff3-de812a7cf04a": {"__data__": {"id_": "860863cb-ba90-4287-9ff3-de812a7cf04a", "embedding": null, "metadata": {"page_label": "1", "file_name": "saved_pdf.pdf", "file_path": "E:\\llama-index RAG\\data\\saved_pdf.pdf", "file_type": "application/pdf", "file_size": 64903, "creation_date": "2024-04-14", "last_modified_date": "2024-04-17"}, "excluded_embed_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "excluded_llm_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "relationships": {"1": {"node_id": "ac226b84-1585-4759-add3-dc5d0af6ef65", "node_type": "4", "metadata": {"page_label": "1", "file_name": "saved_pdf.pdf", "file_path": "E:\\llama-index RAG\\data\\saved_pdf.pdf", "file_type": "application/pdf", "file_size": 64903, "creation_date": "2024-04-14", "last_modified_date": "2024-04-17"}, "hash": "2c629fa1f2e1e85f17b7d012739aea7cba30cdd55f935ed2225710942132eabf", "class_name": "RelatedNodeInfo"}}, "text": "3.DataProblem\nThisdocumentoutlinesthespecificinstructionsforpreparingtheprovideddatabaseofhumanvoice\nrecordingsfortrainingamachinelearningmodelcapableofdistinguishingbetweenauthenticand\nsyntheticvoices.\n1.DataExplorationandAnalysis:\n\uf0fc UtilizetoolssuchasMatplotlibandSeabornforin-depthdataanalysisandvisualization.\n\uf0fc Beginwithacomprehensiveexplorationofthedatabase,understandingcharacteristics,and\nassessingthedistributionofauthenticandsyntheticsamples.\n\uf0fc Identifyandaddressimbalancedsamplesinthedataset.\n2.ImbalanceHandling:\n\uf0fc 
Enhancemodelperformancebyemployingtechniquessuchasoversamplingorundersampling,\ne.g.,usingSMOTEorImblearn.\n3.DataCleaning:\n\uf0fc Addressvariationsinsamplewavlengthbyfindingthemeanoftotalsamplelengths.\n\uf0fc Utilizepaddingtechniquestostandardizeeachsampletothefixedmeanlength.\n\uf0fc Handlemisclassifiedsampleswithinthedataset.\n4.FeatureEngineering:\n\uf0fc ExtractrelevantacousticfeatureslikeMFCCs,spectrograms,andpitchfromaudiorecordings.\n\uf0fc Experimentwithdifferentfeaturesetstoidentifythemostdiscriminativeones.\n\uf0fc Normalizeandstandardizefeaturesforconsistentscaling,facilitatingmodeltraining.\n5.SpeakerEmbeddings:\n\uf0fc Considerincorporatingspeakerembeddingstocaptureindividualcharacteristics,enhancingthe\nmodel'sabilitytogeneralizeacrossdiversevoices.\n\uf0fc Implementsuitablemethodsforextractingspeakerembeddings,suchaspre-trainedmodelsor\ntrainingonthedataset.\n6.DataSplitting:\n\uf0fc Splitthedataintotraining,validation,andtestsets,ensuringastratifiedsplit.\n\uf0fc Evaluatemodelperformanceonthevalidationset,minimizinglossbeforefinaltestingonthe\ntestsamples.\n7.DataAugmentation:\n\uf0fc Applydataaugmentationtechniquestoincreasemodelrobustnessagainstvariationsin\nrecordingconditions.\n\uf0fc Techniquesmayincluderandompitchshifts,time-stretching,orintroducingbackgroundnoise.\n8.QualityControl:\n\uf0fc Conductarigorousqualitycontrolchecktoidentifyandaddressanomaliesoroutliersinthe\ndataset.\n\uf0fc Verifythatdatapreprocessingstepsdonotintroduceartifactsnegativelyaffectingmodel\nperformance.\nOncethedataispreparedfollowingtheseguidelines,thetransitionintothemodeldevelopment\nphasewillfocusonselectinganappropriatearchitecture,trainingthemodel,andfine-tuningitfor\noptimalperformance.", "start_char_idx": 0, "end_char_idx": 2150, "text_template": "{metadata_str}\n\n{content}", "metadata_template": "{key}: {value}", "metadata_seperator": "\n", "class_name": "TextNode"}, "__type__": "1"}}, "docstore/ref_doc_info": {"ac226b84-1585-4759-add3-dc5d0af6ef65": 
{"node_ids": ["860863cb-ba90-4287-9ff3-de812a7cf04a"], "metadata": {"page_label": "1", "file_name": "saved_pdf.pdf", "file_path": "E:\\llama-index RAG\\data\\saved_pdf.pdf", "file_type": "application/pdf", "file_size": 64903, "creation_date": "2024-04-14", "last_modified_date": "2024-04-17"}}}}
|
db/graph_store.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"graph_dict": {}}
|
db/image__vector_store.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"embedding_dict": {}, "text_id_to_ref_doc_id": {}, "metadata_dict": {}}
|
db/index_store.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"index_store/data": {"01282bcc-a355-4256-b245-78ace39871e9": {"__type__": "vector_store", "__data__": "{\"index_id\": \"01282bcc-a355-4256-b245-78ace39871e9\", \"summary\": null, \"nodes_dict\": {\"860863cb-ba90-4287-9ff3-de812a7cf04a\": \"860863cb-ba90-4287-9ff3-de812a7cf04a\"}, \"doc_id_dict\": {}, \"embeddings_dict\": {}}"}}}
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
python-dotenv
|
3 |
+
llama-index
|
4 |
+
llama-index-embeddings-huggingface
|
5 |
+
llama-index-llms-huggingface
|