aklai
committed on
Commit
·
52d7945
1
Parent(s):
becc5ee
Update space
Browse files
app.py
CHANGED
@@ -23,70 +23,6 @@ from bs4 import BeautifulSoup
|
|
23 |
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
|
24 |
"""
|
25 |
|
26 |
-
# List of files
|
27 |
-
pdf_files = [
|
28 |
-
#"RCP0032 Intake 10 Student Internship Summary Reports.pdf",
|
29 |
-
#"Research Computing Platform Student Internship Handbook.pdf",
|
30 |
-
#"Student Projects Outline - Summer 2425.pdf"
|
31 |
-
]
|
32 |
-
|
33 |
-
urls = [
|
34 |
-
'https://wehi-researchcomputing.github.io/email_acknowledgement',
|
35 |
-
'https://wehi-researchcomputing.github.io/email-week-two',
|
36 |
-
'https://wehi-researchcomputing.github.io/code-of-conduct',
|
37 |
-
'https://wehi-researchcomputing.github.io/faq#i-need-to-use-storage-and-compute-what-are-my-options',
|
38 |
-
'https://wehi-researchcomputing.github.io/student-schex',
|
39 |
-
'https://wehi-researchcomputing.github.io/student-cryoem',
|
40 |
-
'https://wehi-researchcomputing.github.io/email-week-three',
|
41 |
-
'https://wehi-researchcomputing.github.io/students#key-documents-to-review-and-faq',
|
42 |
-
'https://wehi-researchcomputing.github.io/intake_dates',
|
43 |
-
'https://wehi-researchcomputing.github.io/assets/rcp_logo.png',
|
44 |
-
'https://wehi-researchcomputing.github.io/project-wikis',
|
45 |
-
'https://wehi-researchcomputing.github.io/student-flux',
|
46 |
-
'https://wehi-researchcomputing.github.io/explanation_about_ohs',
|
47 |
-
'https://wehi-researchcomputing.github.io/top-5-mistakes',
|
48 |
-
'https://wehi-researchcomputing.github.io/student-quantum',
|
49 |
-
'https://wehi-researchcomputing.github.io/student-immunology-web-application',
|
50 |
-
'https://wehi-researchcomputing.github.io/how-to-apply',
|
51 |
-
'https://wehi-researchcomputing.github.io/email-week-one',
|
52 |
-
'https://wehi-researchcomputing.github.io/student-genomics-metadata.html',
|
53 |
-
'https://wehi-researchcomputing.github.io/students',
|
54 |
-
'https://wehi-researchcomputing.github.io/student-haemosphere',
|
55 |
-
'https://wehi-researchcomputing.github.io/email-week-four',
|
56 |
-
'https://wehi-researchcomputing.github.io/faq#you-ask-us-to-be-as-independent-as-possible-how-can-we-do-that',
|
57 |
-
'https://wehi-researchcomputing.github.io/student-imaging',
|
58 |
-
'https://wehi-researchcomputing.github.io/faq#how-should-i-ask-for-help-to-solve-a-problem',
|
59 |
-
'https://wehi-researchcomputing.github.io/faq#how-do-i-get-access-to-the-wehi-wide-student-intern-group-using-my-wehi-email-address',
|
60 |
-
'https://wehi-researchcomputing.github.io/student-data-commons',
|
61 |
-
'https://wehi-researchcomputing.github.io/faq#what-do-i-need-to-do-for-my-final-presentation-and-summary-report',
|
62 |
-
'https://wehi-researchcomputing.github.io/complex-projects',
|
63 |
-
'https://wehi-researchcomputing.github.io/faq#onboarding-onto-the-internship-program-workday',
|
64 |
-
'https://wehi-researchcomputing.github.io/RDM-0220-RCP-Student-Internship-Handbook.pdf',
|
65 |
-
'https://wehi-researchcomputing.github.io/email-final-week',
|
66 |
-
'https://wehi-researchcomputing.github.io/faq#i-know-that-the-internships-are-usually-100-offsite-but-what-if-i-want-to-go-into-the-office-sometimes',
|
67 |
-
'https://wehi-researchcomputing.github.io/student-bionix',
|
68 |
-
'https://wehi-researchcomputing.github.io/student-duplex-sequencing',
|
69 |
-
'https://wehi-researchcomputing.github.io/social_media_policy',
|
70 |
-
'https://wehi-researchcomputing.github.io/email-week-ten',
|
71 |
-
'https://wehi-researchcomputing.github.io/student-aive',
|
72 |
-
'https://wehi-researchcomputing.github.io/software_maturity_model',
|
73 |
-
'https://wehi-researchcomputing.github.io/student-organiser',
|
74 |
-
'https://wehi-researchcomputing.github.io/expectations_open_source_contributors',
|
75 |
-
'https://wehi-researchcomputing.github.io/student-genomics-qc',
|
76 |
-
'https://wehi-researchcomputing.github.io/student-immunology-modelling',
|
77 |
-
'https://wehi-researchcomputing.github.io/faq',
|
78 |
-
'https://wehi-researchcomputing.github.io/student-genomics-invoicing',
|
79 |
-
'https://wehi-researchcomputing.github.io/emaiL-one-week-before',
|
80 |
-
'https://wehi-researchcomputing.github.io/student-capacity-planning.html',
|
81 |
-
'https://wehi-researchcomputing.github.io/email-week-five',
|
82 |
-
'https://wehi-researchcomputing.github.io/emails-and-key-milestones',
|
83 |
-
'https://wehi-researchcomputing.github.io/student-clinical-dashboards',
|
84 |
-
'https://wehi-researchcomputing.github.io/',
|
85 |
-
'https://wehi-researchcomputing.github.io/student-loxcoder',
|
86 |
-
'https://wehi-researchcomputing.github.io/student-mixOmics.html',
|
87 |
-
"https://wehi-researchcomputing.github.io/faq#what-are-the-key-things-to-do-before-the-weekly-meetings"
|
88 |
-
]
|
89 |
-
|
90 |
# LLM Model#
|
91 |
llm = HuggingFacePipeline.from_model_id(
|
92 |
model_id="meta-llama/Llama-3.2-3B",
|
@@ -97,35 +33,8 @@ llm = HuggingFacePipeline.from_model_id(
|
|
97 |
# Initialize embedding model "all-MiniLM-L6-v2"
|
98 |
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
|
99 |
|
100 |
-
|
101 |
-
|
102 |
-
vector_store = Chroma(persist_directory="./chroma_db", embedding_function=embedding_model)
|
103 |
-
else:
|
104 |
-
# Initialize an empty list to store all documents
|
105 |
-
all_docs = []
|
106 |
-
|
107 |
-
# Load PDFs with metadata
|
108 |
-
for pdf_file in pdf_files:
|
109 |
-
pdf_loader = PyPDFLoader(pdf_file)
|
110 |
-
pdf_docs = pdf_loader.load()
|
111 |
-
for doc in pdf_docs:
|
112 |
-
doc.metadata["source"] = pdf_file # Add source metadata
|
113 |
-
all_docs.extend(pdf_docs)
|
114 |
-
|
115 |
-
# Load URLs with metadata
|
116 |
-
for url in urls:
|
117 |
-
url_loader = WebBaseLoader(url)
|
118 |
-
web_docs = url_loader.load()
|
119 |
-
for doc in web_docs:
|
120 |
-
doc.metadata["source"] = url # Add source metadata
|
121 |
-
all_docs.extend(web_docs)
|
122 |
-
|
123 |
-
# Split documents into chunks
|
124 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
|
125 |
-
chunks = text_splitter.split_documents(all_docs)
|
126 |
-
|
127 |
-
vector_store = Chroma.from_documents(chunks, embedding_model, persist_directory="./chroma_db")
|
128 |
-
|
129 |
|
130 |
# See full prompt at https://smith.langchain.com/hub/rlm/rag-prompt
|
131 |
# Basically a solid prompt for RAG
|
|
|
23 |
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
|
24 |
"""
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
# LLM Model#
|
27 |
llm = HuggingFacePipeline.from_model_id(
|
28 |
model_id="meta-llama/Llama-3.2-3B",
|
|
|
33 |
# Initialize embedding model "all-MiniLM-L6-v2"
|
34 |
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
|
35 |
|
36 |
+
# Load the existing ChromaDB database
|
37 |
+
vector_store = Chroma(persist_directory="./chroma_db", embedding_function=embedding_model)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
# See full prompt at https://smith.langchain.com/hub/rlm/rag-prompt
|
40 |
# Basically a solid prompt for RAG
|