aklai committed on
Commit
52d7945
·
1 Parent(s): becc5ee

Update space

Browse files
Files changed (1) hide show
  1. app.py +2 -93
app.py CHANGED
@@ -23,70 +23,6 @@ from bs4 import BeautifulSoup
23
  For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
24
  """
25
 
26
- # List of files
27
- pdf_files = [
28
- #"RCP0032 Intake 10 Student Internship Summary Reports.pdf",
29
- #"Research Computing Platform Student Internship Handbook.pdf",
30
- #"Student Projects Outline - Summer 2425.pdf"
31
- ]
32
-
33
- urls = [
34
- 'https://wehi-researchcomputing.github.io/email_acknowledgement',
35
- 'https://wehi-researchcomputing.github.io/email-week-two',
36
- 'https://wehi-researchcomputing.github.io/code-of-conduct',
37
- 'https://wehi-researchcomputing.github.io/faq#i-need-to-use-storage-and-compute-what-are-my-options',
38
- 'https://wehi-researchcomputing.github.io/student-schex',
39
- 'https://wehi-researchcomputing.github.io/student-cryoem',
40
- 'https://wehi-researchcomputing.github.io/email-week-three',
41
- 'https://wehi-researchcomputing.github.io/students#key-documents-to-review-and-faq',
42
- 'https://wehi-researchcomputing.github.io/intake_dates',
43
- 'https://wehi-researchcomputing.github.io/assets/rcp_logo.png',
44
- 'https://wehi-researchcomputing.github.io/project-wikis',
45
- 'https://wehi-researchcomputing.github.io/student-flux',
46
- 'https://wehi-researchcomputing.github.io/explanation_about_ohs',
47
- 'https://wehi-researchcomputing.github.io/top-5-mistakes',
48
- 'https://wehi-researchcomputing.github.io/student-quantum',
49
- 'https://wehi-researchcomputing.github.io/student-immunology-web-application',
50
- 'https://wehi-researchcomputing.github.io/how-to-apply',
51
- 'https://wehi-researchcomputing.github.io/email-week-one',
52
- 'https://wehi-researchcomputing.github.io/student-genomics-metadata.html',
53
- 'https://wehi-researchcomputing.github.io/students',
54
- 'https://wehi-researchcomputing.github.io/student-haemosphere',
55
- 'https://wehi-researchcomputing.github.io/email-week-four',
56
- 'https://wehi-researchcomputing.github.io/faq#you-ask-us-to-be-as-independent-as-possible-how-can-we-do-that',
57
- 'https://wehi-researchcomputing.github.io/student-imaging',
58
- 'https://wehi-researchcomputing.github.io/faq#how-should-i-ask-for-help-to-solve-a-problem',
59
- 'https://wehi-researchcomputing.github.io/faq#how-do-i-get-access-to-the-wehi-wide-student-intern-group-using-my-wehi-email-address',
60
- 'https://wehi-researchcomputing.github.io/student-data-commons',
61
- 'https://wehi-researchcomputing.github.io/faq#what-do-i-need-to-do-for-my-final-presentation-and-summary-report',
62
- 'https://wehi-researchcomputing.github.io/complex-projects',
63
- 'https://wehi-researchcomputing.github.io/faq#onboarding-onto-the-internship-program-workday',
64
- 'https://wehi-researchcomputing.github.io/RDM-0220-RCP-Student-Internship-Handbook.pdf',
65
- 'https://wehi-researchcomputing.github.io/email-final-week',
66
- 'https://wehi-researchcomputing.github.io/faq#i-know-that-the-internships-are-usually-100-offsite-but-what-if-i-want-to-go-into-the-office-sometimes',
67
- 'https://wehi-researchcomputing.github.io/student-bionix',
68
- 'https://wehi-researchcomputing.github.io/student-duplex-sequencing',
69
- 'https://wehi-researchcomputing.github.io/social_media_policy',
70
- 'https://wehi-researchcomputing.github.io/email-week-ten',
71
- 'https://wehi-researchcomputing.github.io/student-aive',
72
- 'https://wehi-researchcomputing.github.io/software_maturity_model',
73
- 'https://wehi-researchcomputing.github.io/student-organiser',
74
- 'https://wehi-researchcomputing.github.io/expectations_open_source_contributors',
75
- 'https://wehi-researchcomputing.github.io/student-genomics-qc',
76
- 'https://wehi-researchcomputing.github.io/student-immunology-modelling',
77
- 'https://wehi-researchcomputing.github.io/faq',
78
- 'https://wehi-researchcomputing.github.io/student-genomics-invoicing',
79
- 'https://wehi-researchcomputing.github.io/emaiL-one-week-before',
80
- 'https://wehi-researchcomputing.github.io/student-capacity-planning.html',
81
- 'https://wehi-researchcomputing.github.io/email-week-five',
82
- 'https://wehi-researchcomputing.github.io/emails-and-key-milestones',
83
- 'https://wehi-researchcomputing.github.io/student-clinical-dashboards',
84
- 'https://wehi-researchcomputing.github.io/',
85
- 'https://wehi-researchcomputing.github.io/student-loxcoder',
86
- 'https://wehi-researchcomputing.github.io/student-mixOmics.html',
87
- "https://wehi-researchcomputing.github.io/faq#what-are-the-key-things-to-do-before-the-weekly-meetings"
88
- ]
89
-
90
  # LLM Model#
91
  llm = HuggingFacePipeline.from_model_id(
92
  model_id="meta-llama/Llama-3.2-3B",
@@ -97,35 +33,8 @@ llm = HuggingFacePipeline.from_model_id(
97
  # Initialize embedding model "all-MiniLM-L6-v2"
98
  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
99
 
100
- if os.path.exists("./chroma_db") and os.listdir("./chroma_db"):
101
- # Load the existing ChromaDB database
102
- vector_store = Chroma(persist_directory="./chroma_db", embedding_function=embedding_model)
103
- else:
104
- # Initialize an empty list to store all documents
105
- all_docs = []
106
-
107
- # Load PDFs with metadata
108
- for pdf_file in pdf_files:
109
- pdf_loader = PyPDFLoader(pdf_file)
110
- pdf_docs = pdf_loader.load()
111
- for doc in pdf_docs:
112
- doc.metadata["source"] = pdf_file # Add source metadata
113
- all_docs.extend(pdf_docs)
114
-
115
- # Load URLs with metadata
116
- for url in urls:
117
- url_loader = WebBaseLoader(url)
118
- web_docs = url_loader.load()
119
- for doc in web_docs:
120
- doc.metadata["source"] = url # Add source metadata
121
- all_docs.extend(web_docs)
122
-
123
- # Split documents into chunks
124
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
125
- chunks = text_splitter.split_documents(all_docs)
126
-
127
- vector_store = Chroma.from_documents(chunks, embedding_model, persist_directory="./chroma_db")
128
-
129
 
130
  # See full prompt at https://smith.langchain.com/hub/rlm/rag-prompt
131
  # Basically a solid prompt for RAG
 
23
  For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
24
  """
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  # LLM Model#
27
  llm = HuggingFacePipeline.from_model_id(
28
  model_id="meta-llama/Llama-3.2-3B",
 
33
  # Initialize embedding model "all-MiniLM-L6-v2"
34
  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
35
 
36
+ # Load the existing ChromaDB database
37
+ vector_store = Chroma(persist_directory="./chroma_db", embedding_function=embedding_model)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  # See full prompt at https://smith.langchain.com/hub/rlm/rag-prompt
40
  # Basically a solid prompt for RAG