Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Upload 2 files
Browse files
- utils/data_processing.py +23 -10
- utils/update_vector_database.py +1 -8
utils/data_processing.py
CHANGED
@@ -14,6 +14,25 @@ def format_docs(docs):
|
|
14 |
)
|
15 |
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
|
18 |
"""Load an Excel file, clean its contents, and generate a pd.Dataframe.
|
19 |
|
@@ -46,21 +65,15 @@ def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
|
|
46 |
# Change column names to title case
|
47 |
df.columns = df.columns.str.title()
|
48 |
|
49 |
-
# Function to replace curly apostrophes with straight ones
|
50 |
-
def replace_apostrophes(text):
|
51 |
-
if isinstance(text, str):
|
52 |
-
return text.replace("\u2019", "'")
|
53 |
-
return text
|
54 |
-
|
55 |
# Clean data
|
56 |
-
# Trim strings, standardize text (convert to title case), and replace apostrophes
|
57 |
for col in df.columns:
|
58 |
-
# If the column is text-based
|
59 |
if col.lower() != 'booking link' and df[col].dtype == 'object':
|
60 |
-
|
61 |
-
df[col] = df[col].str.strip().str.title().apply(replace_apostrophes)
|
62 |
|
63 |
# Handle missing values
|
64 |
df.fillna('Information Not Available', inplace=True)
|
65 |
|
|
|
|
|
|
|
66 |
return df
|
|
|
14 |
)
|
15 |
|
16 |
|
17 |
+
def clean_and_format_text(text):
|
18 |
+
if isinstance(text, str):
|
19 |
+
# Replace curly apostrophes with straight ones
|
20 |
+
text = text.replace("\u2019", "'")
|
21 |
+
words = text.split()
|
22 |
+
# Title case words, preserving acronyms
|
23 |
+
title_words = [word if word.isupper() and len(word) > 1 else word.capitalize()
|
24 |
+
for word in words]
|
25 |
+
return ' '.join(title_words)
|
26 |
+
else:
|
27 |
+
return text
|
28 |
+
|
29 |
+
|
30 |
+
def categorize_location(location):
|
31 |
+
if any(place in location.lower() for place in ['cordova bay', 'james bay']):
|
32 |
+
return 'Victoria'
|
33 |
+
return location
|
34 |
+
|
35 |
+
|
36 |
def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
|
37 |
"""Load an Excel file, clean its contents, and generate a pd.Dataframe.
|
38 |
|
|
|
65 |
# Change column names to title case
|
66 |
df.columns = df.columns.str.title()
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
# Clean data
|
|
|
69 |
for col in df.columns:
|
|
|
70 |
if col.lower() != 'booking link' and df[col].dtype == 'object':
|
71 |
+
df[col] = df[col].str.strip().apply(clean_and_format_text)
|
|
|
72 |
|
73 |
# Handle missing values
|
74 |
df.fillna('Information Not Available', inplace=True)
|
75 |
|
76 |
+
# Add city column
|
77 |
+
df['City'] = df['Location'].apply(categorize_location)
|
78 |
+
|
79 |
return df
|
utils/update_vector_database.py
CHANGED
@@ -19,16 +19,9 @@ class DataProcessor:
|
|
19 |
def __init__(self, data_dir: Path):
|
20 |
self.data_dir = data_dir
|
21 |
|
22 |
-
@staticmethod
|
23 |
-
def categorize_location(location):
|
24 |
-
if any(place in location.lower() for place in ['cordova bay', 'james bay']):
|
25 |
-
return 'Victoria'
|
26 |
-
return location
|
27 |
-
|
28 |
def load_practitioners_data(self):
|
29 |
try:
|
30 |
df = excel_to_dataframe(self.data_dir)
|
31 |
-
df['City'] = df['Location'].apply(self.categorize_location)
|
32 |
practitioners_data = []
|
33 |
for idx, row in df.iterrows():
|
34 |
# I am using dot as a separator for text embeddings
|
@@ -195,7 +188,7 @@ def main():
|
|
195 |
tall_tree_dataset = processor.load_tall_tree_data()
|
196 |
|
197 |
# Set OpenAI embeddings model
|
198 |
-
# TODO: Test new embeddings
|
199 |
embeddings_model = "text-embedding-ada-002"
|
200 |
openai_embeddings = OpenAIEmbeddings(model=embeddings_model)
|
201 |
|
|
|
19 |
def __init__(self, data_dir: Path):
|
20 |
self.data_dir = data_dir
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
def load_practitioners_data(self):
|
23 |
try:
|
24 |
df = excel_to_dataframe(self.data_dir)
|
|
|
25 |
practitioners_data = []
|
26 |
for idx, row in df.iterrows():
|
27 |
# I am using dot as a separator for text embeddings
|
|
|
188 |
tall_tree_dataset = processor.load_tall_tree_data()
|
189 |
|
190 |
# Set OpenAI embeddings model
|
191 |
+
# TODO: Test new OpenAI text embeddings models
|
192 |
embeddings_model = "text-embedding-ada-002"
|
193 |
openai_embeddings = OpenAIEmbeddings(model=embeddings_model)
|
194 |
|