Spaces:

tall-tree
/

ai-virtual-assistant

Running on CPU Upgrade

App Files Files

yrobel-lima committed on Jul 30, 2024

Commit

956157f

verified ·

1 Parent(s): 2a351ef

Delete utils/data_processing.py

Browse files

Files changed (1) hide show

utils/data_processing.py +0 -77

utils/data_processing.py DELETED Viewed

@@ -1,77 +0,0 @@
-import pandas as pd
def format_docs(docs):
    """Print the text of each document to stdout, separated by dashed rules.

    Every entry is headed by a 1-based ``Document N:`` label, followed by a
    blank line and the document's ``page_content``. Entries are separated by
    a line of 100 dashes. Nothing is returned; the output is only printed.

    Args:
        docs: Iterable of objects exposing a string ``page_content``
            attribute (e.g. Langchain Documents).
    """
    separator = f"\n{'-' * 100}\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(separator.join(sections))
def clean_and_format_text(text):
    """Normalise a text value; anything that is not a string passes through.

    For strings, curly apostrophes (U+2019) are replaced with straight ones,
    the text is split on whitespace, and each word is capitalised (first
    letter upper, the rest lower). Words that are entirely upper case and
    longer than one character are treated as acronyms and kept as-is. The
    words are re-joined with single spaces, so leading, trailing and repeated
    whitespace is collapsed.

    Args:
        text: Value to clean. Non-string values are returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    straightened = text.replace("\u2019", "'")

    formatted = []
    for token in straightened.split():
        # Keep acronyms such as "NASA"; single letters are not acronyms.
        if len(token) > 1 and token.isupper():
            formatted.append(token)
        else:
            formatted.append(token.capitalize())
    return " ".join(formatted)
def categorize_location(location):
    """Map a location to its city, folding known neighbourhoods into Victoria.

    Matching is a case-insensitive substring test against the neighbourhood
    names "cordova bay" and "james bay".

    Args:
        location: Location text. Non-string values (for example NaN or None
            coming from an empty cell) are tolerated.

    Returns:
        ``"Victoria"`` when the location mentions one of the known
        neighbourhoods; otherwise ``location`` unchanged. Non-string values
        are always returned unchanged.
    """
    # Non-strings have no .lower(); pass them through instead of raising,
    # mirroring how clean_and_format_text treats non-string values.
    if not isinstance(location, str):
        return location

    lowered = location.lower()
    if any(place in lowered for place in ("cordova bay", "james bay")):
        return "Victoria"
    return location
def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
    """Load the single Excel workbook in a directory into a cleaned DataFrame.

    The directory must contain exactly one ``.xlsx`` file. After loading:

    * column names are converted to title case;
    * every object-dtype column except "Booking Link" is passed through
      ``clean_and_format_text`` (non-string cells are kept as they are);
    * missing values are replaced with "Information Not Available";
    * a "City" column is derived from "Location" via ``categorize_location``.

    Args:
        data_directory: Path of the directory holding the Excel file. A
            ``str`` or a ``pathlib.Path`` is accepted.

    Raises:
        FileNotFoundError: If no Excel files are found in the specified
            directory.
        ValueError: If more than one Excel file is found in the directory.

    Returns:
        pd.DataFrame: The cleaned data with the added "City" column.
    """
    from pathlib import Path  # local import keeps this block self-contained

    # The annotation promises a str, but .iterdir() only exists on Path
    # objects; coerce so both str and Path callers work.
    directory = Path(data_directory)

    # Excel creates "~$<name>.xlsx" owner files while a workbook is open;
    # they are not readable workbooks, so they must not count as candidates.
    excel_files = [
        file
        for file in directory.iterdir()
        if file.suffix.lower() == ".xlsx" and not file.name.startswith("~$")
    ]
    if not excel_files:
        raise FileNotFoundError("No Excel files found in the specified directory.")
    if len(excel_files) > 1:
        raise ValueError("More than one Excel file found in the specified directory.")
    path = excel_files[0]

    # Load Excel file
    df = pd.read_excel(path, engine="openpyxl")

    # Change column names to title case
    df.columns = df.columns.str.title()

    # Clean text columns. clean_and_format_text already trims surrounding
    # whitespace (split/join) and passes non-strings through, so it is
    # applied directly: going through the .str accessor first would turn
    # every non-string cell into NaN and silently drop its value.
    for col in df.columns:
        if col.lower() != "booking link" and df[col].dtype == "object":
            df[col] = df[col].apply(clean_and_format_text)

    # Handle missing values
    df.fillna("Information Not Available", inplace=True)

    # Add city column
    df["City"] = df["Location"].apply(categorize_location)
    return df