yrobel-lima committed on
Commit
956157f
1 Parent(s): 2a351ef

Delete utils/data_processing.py

Browse files
Files changed (1) hide show
  1. utils/data_processing.py +0 -77
utils/data_processing.py DELETED
@@ -1,77 +0,0 @@
1
- import pandas as pd
2
-
3
-
4
def format_docs(docs):
    """Print the page content of each Langchain Document in ``docs``.

    Each document is rendered as a ``Document <n>:`` header (numbered from 1),
    a blank line, and the document's ``page_content``. Consecutive documents
    are separated by a line of 100 dashes. Nothing is returned; the text is
    written with ``print``.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.
    """
    divider = f"\n{'-' * 100}\n"
    sections = []
    for position, document in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + document.page_content)
    print(divider.join(sections))
14
-
15
-
16
def clean_and_format_text(text):
    """Title-case a string word by word while keeping acronyms untouched.

    Curly apostrophes (U+2019) are replaced with straight ones, the text is
    split on whitespace, and the words are re-joined with single spaces (so
    leading, trailing and repeated whitespace is collapsed). Words that are
    entirely upper case and longer than one character are kept as-is; every
    other word is capitalized.

    Args:
        text: Value to clean. Anything that is not a ``str`` is returned
            unchanged.

    Returns:
        The cleaned string, or the original value when it is not a string.
    """
    if not isinstance(text, str):
        return text

    def _format_word(token):
        # Multi-character all-caps tokens are treated as acronyms.
        if len(token) > 1 and token.isupper():
            return token
        return token.capitalize()

    straightened = text.replace("\u2019", "'")
    return " ".join(map(_format_word, straightened.split()))
29
-
30
-
31
def categorize_location(location):
    """Map a location string to its city label.

    Locations whose text contains "cordova bay" or "james bay"
    (case-insensitive) are reported as "Victoria"; any other value is
    returned unchanged.

    Args:
        location: Location text. Non-string values (for example ``None`` or
            a NaN from an empty spreadsheet cell) are returned unchanged.

    Returns:
        "Victoria" for the recognised neighbourhoods, otherwise ``location``.
    """
    # Non-strings have no text to match; pass them through instead of
    # failing on ``.lower()``.
    if not isinstance(location, str):
        return location

    # Lower-case once rather than once per candidate place name.
    lowered = location.lower()
    if any(place in lowered for place in ("cordova bay", "james bay")):
        return "Victoria"
    return location
35
-
36
-
37
def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
    """Load the single Excel workbook in a directory into a cleaned DataFrame.

    The directory must contain exactly one ``.xlsx`` file. After loading,
    column names are title-cased, text columns (except "Booking Link") are
    cleaned with ``clean_and_format_text``, missing values are replaced with
    "Information Not Available", and a "City" column is derived from the
    "Location" column with ``categorize_location``.

    Args:
        data_directory (str): Path to the directory that holds the Excel
            file. A ``pathlib.Path`` is accepted as well.

    Raises:
        FileNotFoundError: If no ``.xlsx`` file is found in the directory.
        ValueError: If more than one ``.xlsx`` file is found in the directory.
        KeyError: If the workbook has no "Location" column after the column
            names are title-cased.

    Returns:
        pd.DataFrame: The cleaned data with the additional "City" column.
    """
    # Imported locally so the function does not depend on a module-level import.
    from pathlib import Path

    # Coerce to Path: the argument is documented as ``str``, which has no
    # ``iterdir`` method.
    directory = Path(data_directory)

    # Find the workbook (exactly one .xlsx file is expected).
    excel_files = [file for file in directory.iterdir() if file.suffix == ".xlsx"]

    if not excel_files:
        raise FileNotFoundError("No Excel files found in the specified directory.")
    if len(excel_files) > 1:
        raise ValueError("More than one Excel file found in the specified directory.")

    path = excel_files[0]

    # Load Excel file
    df = pd.read_excel(path, engine="openpyxl")

    # Change column names to title case
    df.columns = df.columns.str.title()

    # Clean data. The booking link column is skipped so it is not re-cased.
    for col in df.columns:
        if col.lower() != "booking link" and df[col].dtype == "object":
            # No ``.str.strip()`` first: on a mixed-type object column it turns
            # every non-string cell into NaN, which the fill below would then
            # overwrite. clean_and_format_text already trims surrounding
            # whitespace (split/join) and returns non-strings unchanged.
            df[col] = df[col].apply(clean_and_format_text)

    # Handle missing values
    df.fillna("Information Not Available", inplace=True)

    # Add city column
    df["City"] = df["Location"].apply(categorize_location)

    return df