Spaces:
Sleeping
Sleeping
alvinhenrick
committed on
Commit
•
3381d80
1
Parent(s):
13341a0
Use nuvocare/WikiMedical_sent_biobert
Browse files
medirag/core/reader.py
CHANGED
@@ -7,15 +7,13 @@ from llama_index.core.readers.base import BaseReader
|
|
7 |
|
8 |
def normalize_text(text):
|
9 |
"""Normalize the text by lowercasing, removing extra spaces, and stripping unnecessary characters."""
|
10 |
-
text = text.lower()
|
11 |
-
text = re.sub(r'\s+', ' ', text)
|
12 |
-
text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
|
13 |
return text.strip()
|
14 |
|
15 |
|
16 |
def format_output_string(drug_name, sections_data):
|
17 |
-
|
18 |
-
output = [f"Drug Name: {drug_name}"]
|
19 |
|
20 |
for title, paragraphs in sections_data.items():
|
21 |
output.append(f"{title}:")
|
@@ -26,34 +24,48 @@ def format_output_string(drug_name, sections_data):
|
|
26 |
return "\n".join(output)
|
27 |
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
def parse_drug_information(soup, extra_info=None):
|
30 |
# Extract the setId
|
31 |
set_id = None
|
32 |
set_id_tag = soup.find("setId")
|
33 |
if set_id_tag:
|
34 |
set_id = set_id_tag.get("root", None)
|
35 |
-
|
36 |
if not set_id:
|
37 |
return None
|
38 |
|
39 |
-
# Ensure structured body exists
|
40 |
structured_body = soup.find("structuredBody")
|
41 |
-
if not structured_body:
|
42 |
-
return None
|
43 |
|
44 |
# Extract the drug name
|
45 |
-
|
46 |
-
|
47 |
-
if
|
48 |
-
inner_product = manufactured_product.find("manufacturedProduct")
|
49 |
-
if inner_product:
|
50 |
-
name_tag = inner_product.find("name")
|
51 |
-
if name_tag:
|
52 |
-
drug_name = name_tag.get_text(strip=True)
|
53 |
-
|
54 |
-
if not drug_name:
|
55 |
return None
|
56 |
|
|
|
|
|
57 |
# Iterate over components and extract sections
|
58 |
components = structured_body.find_all("component")
|
59 |
sections_data = {}
|
@@ -62,14 +74,14 @@ def parse_drug_information(soup, extra_info=None):
|
|
62 |
sections = component.find_all("section")
|
63 |
for section in sections:
|
64 |
title_tag = section.find("title")
|
65 |
-
|
66 |
-
|
|
|
67 |
continue # Skip if title is not found
|
68 |
|
69 |
paragraphs = section.find_all("paragraph")
|
70 |
paragraphs_text = []
|
71 |
seen_paragraphs = set() # Set to track unique paragraphs
|
72 |
-
|
73 |
for paragraph in paragraphs:
|
74 |
paragraph_text = normalize_text(paragraph.get_text(strip=True))
|
75 |
if paragraph_text and paragraph_text not in seen_paragraphs:
|
@@ -79,7 +91,10 @@ def parse_drug_information(soup, extra_info=None):
|
|
79 |
# Only include sections with non-empty, non-duplicate paragraphs
|
80 |
if paragraphs_text:
|
81 |
if title_text in sections_data:
|
82 |
-
sections_data[title_text]
|
|
|
|
|
|
|
83 |
else:
|
84 |
sections_data[title_text] = paragraphs_text
|
85 |
|
|
|
7 |
|
8 |
def normalize_text(text):
    """Lowercase *text* and collapse whitespace.

    Every run of whitespace (spaces, tabs, newlines, ...) becomes a single
    space, and leading/trailing whitespace is removed. Punctuation and all
    other characters are kept as-is.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased, whitespace-normalized string ("" for blank input).
    """
    # split() with no argument splits on whitespace runs and drops empty
    # leading/trailing pieces, so re-joining with one space both collapses
    # and strips in a single pass.
    return " ".join(text.lower().split())
|
13 |
|
14 |
|
15 |
def format_output_string(drug_name, sections_data):
|
16 |
+
output = [f"Drug and Generic Names: {drug_name}"]
|
|
|
17 |
|
18 |
for title, paragraphs in sections_data.items():
|
19 |
output.append(f"{title}:")
|
|
|
24 |
return "\n".join(output)
|
25 |
|
26 |
|
27 |
+
def extract_drug_and_generic_names(structured_body):
    """Collect the distinct product and generic names under *structured_body*.

    Every ``manufacturedProduct`` element is visited. For each one, the text
    of its first ``name`` element is recorded, followed by the text of the
    ``name`` inside ``asEntityWithGeneric`` / ``genericMedicine`` when that
    chain is present.

    Args:
        structured_body: The parsed ``structuredBody`` element (an object
            offering ``find_all``), or ``None`` when the document has none.

    Returns:
        A list of unique, non-empty names in first-seen order. The list is
        empty when ``structured_body`` is ``None`` or no names are found.
    """
    if structured_body is None:
        return []

    # A dict is used as an ordered set: keys de-duplicate while keeping
    # first-seen order, so the joined name string is stable between runs.
    names = {}

    def remember(name_tag):
        # Record the tag's text unless the tag is missing or blank.
        if name_tag is None:
            return
        text = name_tag.get_text(strip=True)
        if text:
            names.setdefault(text, None)

    for manufactured_product in structured_body.find_all("manufacturedProduct"):
        # Main (brand) drug name.
        remember(manufactured_product.find("name"))

        # Generic name, if available; either level of the chain may be absent.
        as_generic = manufactured_product.find("asEntityWithGeneric")
        if as_generic is None:
            continue
        generic_medicine = as_generic.find("genericMedicine")
        if generic_medicine is not None:
            remember(generic_medicine.find("name"))

    return list(names)
|
48 |
+
|
49 |
+
|
50 |
def parse_drug_information(soup, extra_info=None):
|
51 |
# Extract the setId
|
52 |
set_id = None
|
53 |
set_id_tag = soup.find("setId")
|
54 |
if set_id_tag:
|
55 |
set_id = set_id_tag.get("root", None)
|
|
|
56 |
if not set_id:
|
57 |
return None
|
58 |
|
|
|
59 |
structured_body = soup.find("structuredBody")
|
|
|
|
|
60 |
|
61 |
# Extract the drug name
|
62 |
+
drug_names = extract_drug_and_generic_names(structured_body)
|
63 |
+
|
64 |
+
if len(drug_names) == 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
return None
|
66 |
|
67 |
+
drug_name = " | ".join(drug_names)
|
68 |
+
|
69 |
# Iterate over components and extract sections
|
70 |
components = structured_body.find_all("component")
|
71 |
sections_data = {}
|
|
|
74 |
sections = component.find_all("section")
|
75 |
for section in sections:
|
76 |
title_tag = section.find("title")
|
77 |
+
if title_tag:
|
78 |
+
title_text = normalize_text(title_tag.get_text(strip=True))
|
79 |
+
else:
|
80 |
continue # Skip if title is not found
|
81 |
|
82 |
paragraphs = section.find_all("paragraph")
|
83 |
paragraphs_text = []
|
84 |
seen_paragraphs = set() # Set to track unique paragraphs
|
|
|
85 |
for paragraph in paragraphs:
|
86 |
paragraph_text = normalize_text(paragraph.get_text(strip=True))
|
87 |
if paragraph_text and paragraph_text not in seen_paragraphs:
|
|
|
91 |
# Only include sections with non-empty, non-duplicate paragraphs
|
92 |
if paragraphs_text:
|
93 |
if title_text in sections_data:
|
94 |
+
existing_paragraphs = set(sections_data[title_text])
|
95 |
+
# Add only unique paragraphs that aren't already in the title's list
|
96 |
+
unique_paragraphs = [p for p in paragraphs_text if p not in existing_paragraphs]
|
97 |
+
sections_data[title_text].extend(unique_paragraphs)
|
98 |
else:
|
99 |
sections_data[title_text] = paragraphs_text
|
100 |
|
medirag/index/local.py
CHANGED
@@ -6,7 +6,7 @@ from llama_index.vector_stores.faiss import FaissVectorStore
|
|
6 |
|
7 |
class DailyMedIndexer:
|
8 |
def __init__(self,
|
9 |
-
model_name="
|
10 |
dimension=768,
|
11 |
persist_dir="./storage"):
|
12 |
|
|
|
6 |
|
7 |
class DailyMedIndexer:
|
8 |
def __init__(self,
|
9 |
+
model_name="nuvocare/WikiMedical_sent_biobert",
|
10 |
dimension=768,
|
11 |
persist_dir="./storage"):
|
12 |
|
tests/data/daily_bio_bert_indexed/default__vector_store.json
CHANGED
Binary files a/tests/data/daily_bio_bert_indexed/default__vector_store.json and b/tests/data/daily_bio_bert_indexed/default__vector_store.json differ
|
|
tests/data/daily_bio_bert_indexed/docstore.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
tests/data/daily_bio_bert_indexed/index_store.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"index_store/data": {"
|
|
|
1 |
+
{"index_store/data": {"094f4c0c-af8f-41d3-ae01-97383ed77ef4": {"__type__": "vector_store", "__data__": "{\"index_id\": \"094f4c0c-af8f-41d3-ae01-97383ed77ef4\", \"summary\": null, \"nodes_dict\": {\"0\": \"8ffb2edf-e849-4a7c-81b3-931ce1bf7161\", \"1\": \"52d7a57a-af68-48b3-8072-fd0894a44b11\", \"2\": \"61f61d11-9bab-4dc7-8965-9477f0fb2d57\", \"3\": \"dddc5c49-31f8-4a95-8282-1d590f88d378\", \"4\": \"df24fad1-9589-456e-a3f3-a746900a3acf\", \"5\": \"0d05f4df-6821-4721-aaf7-ae201ce84dd5\", \"6\": \"44e17ff4-c0a5-4452-b41f-146502ca4c5e\", \"7\": \"3d09a622-bdf5-42e3-b21d-dfa0eeb640d3\", \"8\": \"0da142cf-2955-459c-8048-68d1309c0a01\", \"9\": \"1fb064bc-8192-4054-a771-e04341351011\", \"10\": \"4c4a88b1-b177-4ee0-8eb1-b64ab4c4e61a\", \"11\": \"d4c5164c-2c34-40df-86a5-1d6d0c5014da\", \"12\": \"9d81c8c9-ffe7-43d7-bda3-a0175e8906c6\", \"13\": \"adf3dabc-710b-49ed-b10d-8568ed4c355a\", \"14\": \"2f061ed3-eac6-43dd-b466-b672dba5804f\", \"15\": \"c6bed9da-6c8e-4ae1-88d7-56e43fd9562d\", \"16\": \"ceb4a59f-5e0c-42d5-a72f-926b451c22ef\", \"17\": \"0745aeb0-c9c6-4dd5-8d18-8244687da1d3\", \"18\": \"351d3fde-c6bf-4842-9f6c-c0422a4cd689\", \"19\": \"e8b262d6-3ca2-4604-bde2-ec7b0d8711f0\", \"20\": \"4d50e2a6-254b-4cdd-abd1-6616a25f9f54\", \"21\": \"d1fb364e-7700-4105-98a0-5c01dbc8e641\", \"22\": \"196695c3-413c-4149-904d-f0b95dfd7b93\", \"23\": \"fb6f5450-8053-4e0a-ab63-242efb899602\", \"24\": \"cbd6dfc4-dd9a-4e1e-9365-082b3a5fb671\", \"25\": \"87463055-de4e-4855-949c-5603fdcee55b\", \"26\": \"d70e3a02-a4d8-4953-9538-6cf255e28cfd\", \"27\": \"ae9c59cd-358f-4072-a04e-6c0f351c5948\", \"28\": \"a3d98744-d4cb-4e9b-b5e8-571acf05e6a4\", \"29\": \"f0b89ab6-91bf-458c-a814-fbc97a2b02d3\", \"30\": \"0068d26e-dfa4-4de8-925b-994b2b1df8b0\", \"31\": \"99b9be23-2f37-4be7-ab33-ed6b9643d04c\", \"32\": \"3172f340-3d1a-454e-b7ab-bb259bc75e55\", \"33\": \"f66d20e6-71e3-4b5e-a243-760e5c3b1d76\", \"34\": \"af4f5974-c337-40d4-b284-4208eb168901\", \"35\": \"b4e742cc-d6f1-4693-85f4-4e45034d513c\", \"36\": 
\"5f9acbea-d72e-4c75-bb7f-8237dcb1738e\", \"37\": \"b85d9b91-0b23-4f20-b68b-a77235bbc9ba\", \"38\": \"ec13de43-0d6f-484d-a93c-1395140a9800\", \"39\": \"b5812945-2060-4def-909a-ce16ac78f892\", \"40\": \"a6fb69a0-7efb-4106-a302-c4244ac23fed\", \"41\": \"046680cc-52d0-4f88-9397-4ce05589f84a\", \"42\": \"a9628076-db3a-47ca-82d4-c04d666ab5b9\", \"43\": \"3ada7248-12ac-4f3f-b25f-0c9913e6af87\", \"44\": \"9c5d63f5-1e75-459d-b702-520ccfb0eb52\", \"45\": \"84785bef-b538-499b-87e6-300c138f451c\", \"46\": \"6e26a161-5b38-4c26-b6be-1a05a1ba8f40\", \"47\": \"de22b10f-841f-465a-9d64-630b0e710ae3\", \"48\": \"589c9677-c43f-4360-8a49-f516bcbb98f4\", \"49\": \"c1021753-9faf-4b35-aa35-84eebbae78a4\", \"50\": \"fe25f849-3fdd-4f35-9e41-e70e33a7740d\", \"51\": \"428bea6d-9cc7-4871-8e64-49e4fdd55f12\", \"52\": \"0a204ca0-640e-4de1-85d9-b22df0ae6e02\", \"53\": \"5b31446f-fc3c-4856-a284-6918343b9f50\", \"54\": \"a390735d-01c6-43bf-8c29-15f2733fb8f0\", \"55\": \"08985457-5a61-40d1-8b23-cb413c3db5a0\", \"56\": \"57df8a99-81f8-4056-8337-a19f6b3e9ffb\", \"57\": \"593e6722-c38c-4e30-902b-9fc946c09f2e\", \"58\": \"170328dc-90aa-4bee-98e9-bf08cc3cda78\", \"59\": \"d24e7b93-876f-4382-b2a3-184a589a5bc3\", \"60\": \"4af9d6a2-4ba0-434c-b454-d54c6ea0d94e\", \"61\": \"47898460-e6b0-449e-8557-f35167b92471\", \"62\": \"6cb0919c-e2d9-4495-8667-daeae6f0c33a\", \"63\": \"caad5c1b-6a83-4b63-8520-1fba8db207c0\", \"64\": \"cabb583c-2975-43d9-9746-5810cc5550a0\", \"65\": \"cca87e22-451c-4fd3-a2b3-82f7611698ba\", \"66\": \"757d5f26-40bc-4cee-9352-ec7b1f5b36cf\", \"67\": \"080a86d2-20aa-4312-88d9-84f81a7b8d37\", \"68\": \"adbd65dd-e126-4644-a048-7ae5c345cb3b\", \"69\": \"d8624bbf-7568-4958-b6cb-aace63327ab3\", \"70\": \"73484ba1-897c-4da3-8e11-6ad871edfeb5\", \"71\": \"c7763a74-c056-4eb9-8de3-38760ebbbcdc\", \"72\": \"d9781c9a-ef9d-4025-808a-48bc80904c7e\", \"73\": \"cf8051ac-a9d3-4b06-ae0a-b7e91db30bb3\", \"74\": \"6183ba0b-0f73-4ff7-af86-a41508919163\", \"75\": \"bcc6bd07-27ca-42a2-baaa-743be396b388\", \"76\": 
\"db7f2c11-6484-4560-9447-fbdba6bfc8b3\", \"77\": \"15aeceea-5583-4bd4-bf72-1f99d44dccf0\", \"78\": \"c19b7a7d-670a-433b-b3f1-db3844f8c31c\", \"79\": \"7b160bff-7c59-425c-8c5f-5a274002c80d\", \"80\": \"54f1bcd8-5d92-42fe-845d-7e1b29efc592\", \"81\": \"89c75206-319a-46b9-a72d-4863ebfa7d12\", \"82\": \"1f8b0334-cd9d-4425-9464-ce54fda3d572\", \"83\": \"f5cd70d7-32bd-4bef-a33c-916215627249\", \"84\": \"f8c93504-1ea1-420a-9e93-d82b144041c0\", \"85\": \"a28ba8b2-68dc-4e32-81f1-b6621f77c8a5\", \"86\": \"096f0a61-8c14-4ac2-ae0b-9d9b2aea2f5b\", \"87\": \"f1c6899c-5cc4-4bb6-a2ae-cded11d47357\", \"88\": \"bffd3cc5-b443-4c42-997f-854405298e51\", \"89\": \"03cecd87-4c0d-4182-9587-68031e63f2e5\", \"90\": \"28ea880f-0927-46be-9058-e33b10ef8894\", \"91\": \"e93c2059-d81c-403a-a0ce-d6ba3c1a6068\", \"92\": \"508f9468-f651-430a-87ee-bfd52e9426ff\", \"93\": \"5e17bb81-42a8-48c6-bbdb-b1c0555492c2\", \"94\": \"5238baab-3188-4685-9de5-91a038cf8e7f\", \"95\": \"f4910bc5-32b7-42b8-a49c-fa8871d7db60\", \"96\": \"f0e78d36-7291-4d66-8eb8-37dde194115f\", \"97\": \"a8cb9e2a-f555-416e-b453-a4ddf4ae5284\", \"98\": \"908038fa-5722-4c3c-9682-8ca30e5318aa\", \"99\": \"9785723b-794d-409e-9ced-df231094a7e8\", \"100\": \"e9eff912-b835-4fb4-80ca-b2575280c9c5\", \"101\": \"713709b2-93e2-4be7-9bf1-da4b064a9502\", \"102\": \"e866a0fa-18de-4995-ae5e-46fb08a68554\", \"103\": \"76d78cec-d6ab-4b01-b61c-4cf0c5b69b55\", \"104\": \"afb6cb30-4fc9-4902-a984-72cdca46614e\", \"105\": \"cad12ba2-304f-4de8-9a96-86ae0b7c96b5\", \"106\": \"47c42be1-1e10-489b-a9fa-540854c5beee\", \"107\": \"6fbbdd66-bd10-4fa9-80ee-b555f8064baf\", \"108\": \"9743a4de-e5fd-4620-988b-c5ad9b923619\", \"109\": \"8c58450c-9118-4f68-8f85-ee6d79b9c54d\", \"110\": \"b09bc864-e9a9-4d58-a72d-8392c28fcf02\", \"111\": \"c8fe7b54-bb5a-485c-8318-3c32bb320028\", \"112\": \"c082e938-8dc7-4613-b5ca-0db01c4e5605\", \"113\": \"dfbfc8b2-0a89-4074-af25-6b7a43854ac0\", \"114\": \"ef29d314-87f9-439e-b288-91a5823f9703\", \"115\": 
\"172a795b-648e-4918-84a6-9103f49401f2\", \"116\": \"26c30ed8-918e-4d1f-ac82-07cdebb9a96c\", \"117\": \"fa272094-6d24-4e10-97cc-b60aec0f3a02\", \"118\": \"34e56f49-415e-4ff4-8660-f3da4606241f\", \"119\": \"5af58b35-d439-46be-96e2-a5ff5d41f890\", \"120\": \"3f914cff-3c98-4c23-ac16-54f6dcf8c51d\", \"121\": \"88346af2-b503-4707-ab2f-5de0ec8796e5\", \"122\": \"2f7e4648-de88-4fc2-9953-50903dac145d\", \"123\": \"93b8369d-8777-4b69-a871-667448448fb6\", \"124\": \"d8850c13-8b86-42ca-93c0-2c978ecfcdff\", \"125\": \"d8288ea4-1320-41b6-a200-2f53a6b3fd7f\", \"126\": \"4f147167-2c3f-4fa6-8de2-73c93ae70008\", \"127\": \"2890a0d4-8405-448e-8894-419a6770aeac\", \"128\": \"ca4934eb-29b7-42d2-8b34-6133f0c890df\"}, \"doc_id_dict\": {}, \"embeddings_dict\": {}}"}}}
|