Spaces:
Sleeping
Sleeping
Praneeth Yerrapragada
committed on
Commit
•
5112690
1
Parent(s):
ea9333f
chore: remove open_ai_apikey in plain text
Browse files
app/categorization/categorizer.py
CHANGED
@@ -46,12 +46,13 @@ def fuzzy_match_list_categorizer(
|
|
46 |
"""
|
47 |
|
48 |
# Fuzzy-match this description against the reference descriptions
|
49 |
-
match_results = process.extractOne(
|
|
|
50 |
|
51 |
# If a match is found, return the category of the matched description
|
52 |
if match_results:
|
53 |
return description_category_pairs.at[match_results[2], 'category']
|
54 |
-
|
55 |
return None
|
56 |
|
57 |
|
@@ -69,10 +70,10 @@ async def llm_list_categorizer(tx_list: pd.DataFrame) -> pd.DataFrame:
|
|
69 |
"""
|
70 |
|
71 |
# Initialize language model and prompt
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
prompt = PromptTemplate.from_template(template=CATEGORY_TEMPLATE)
|
76 |
chain = LLMChain(llm=llm, prompt=prompt)
|
77 |
|
78 |
# Iterate over the DataFrame in batches of TX_PER_LLM_RUN transactions
|
@@ -87,7 +88,8 @@ async def llm_list_categorizer(tx_list: pd.DataFrame) -> pd.DataFrame:
|
|
87 |
valid_results = [result['output'] for result in results if result['valid']]
|
88 |
|
89 |
# Flatten the list of valid results to obtain a single list of description-category pairs
|
90 |
-
valid_outputs = [
|
|
|
91 |
|
92 |
# Return a DataFrame with the valid outputs
|
93 |
return pd.DataFrame(valid_outputs, columns=['name/description', 'category'])
|
@@ -120,7 +122,7 @@ async def llm_sublist_categorizer(
|
|
120 |
try:
|
121 |
# Create a pattern to match a list of Description-Category pairs (List[Tuple[str, str]])
|
122 |
pattern = r"\['([^']+)', '([^']+)'\]"
|
123 |
-
|
124 |
# Use it to extract all the correctly formatted pairs from the raw result
|
125 |
matches = re.findall(pattern, raw_result.replace("\\'", "'"))
|
126 |
|
@@ -131,13 +133,15 @@ async def llm_sublist_categorizer(
|
|
131 |
parsed_pair = ast.literal_eval(str(list(match)))
|
132 |
valid_outputs.append(parsed_pair)
|
133 |
except Exception as e:
|
134 |
-
logger.log(logging.ERROR,
|
|
|
135 |
result['valid'] = False
|
136 |
|
137 |
result['output'] = valid_outputs
|
138 |
|
139 |
except Exception as e:
|
140 |
-
logging.log(
|
|
|
141 |
result['valid'] = False
|
142 |
-
|
143 |
-
return result
|
|
|
46 |
"""
|
47 |
|
48 |
# Fuzzy-match this description against the reference descriptions
|
49 |
+
match_results = process.extractOne(
|
50 |
+
description, descriptions, score_cutoff=threshold)
|
51 |
|
52 |
# If a match is found, return the category of the matched description
|
53 |
if match_results:
|
54 |
return description_category_pairs.at[match_results[2], 'category']
|
55 |
+
|
56 |
return None
|
57 |
|
58 |
|
|
|
70 |
"""
|
71 |
|
72 |
# Initialize language model and prompt
|
73 |
+
openai_api_key = os.environ['OPENAI_API_KEY']
|
74 |
+
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0125",
|
75 |
+
api_key=openai_api_key)
|
76 |
+
prompt = PromptTemplate.from_template(template=CATEGORY_TEMPLATE)
|
77 |
chain = LLMChain(llm=llm, prompt=prompt)
|
78 |
|
79 |
# Iterate over the DataFrame in batches of TX_PER_LLM_RUN transactions
|
|
|
88 |
valid_results = [result['output'] for result in results if result['valid']]
|
89 |
|
90 |
# Flatten the list of valid results to obtain a single list of description-category pairs
|
91 |
+
valid_outputs = [
|
92 |
+
output for valid_result in valid_results for output in valid_result]
|
93 |
|
94 |
# Return a DataFrame with the valid outputs
|
95 |
return pd.DataFrame(valid_outputs, columns=['name/description', 'category'])
|
|
|
122 |
try:
|
123 |
# Create a pattern to match a list of Description-Category pairs (List[Tuple[str, str]])
|
124 |
pattern = r"\['([^']+)', '([^']+)'\]"
|
125 |
+
|
126 |
# Use it to extract all the correctly formatted pairs from the raw result
|
127 |
matches = re.findall(pattern, raw_result.replace("\\'", "'"))
|
128 |
|
|
|
133 |
parsed_pair = ast.literal_eval(str(list(match)))
|
134 |
valid_outputs.append(parsed_pair)
|
135 |
except Exception as e:
|
136 |
+
logger.log(logging.ERROR,
|
137 |
+
f"Parsing Error: {e}\nMatch: {match}\n")
|
138 |
result['valid'] = False
|
139 |
|
140 |
result['output'] = valid_outputs
|
141 |
|
142 |
except Exception as e:
|
143 |
+
logging.log(
|
144 |
+
logging.ERROR, f"| File: {file_name} | Unexpected Error: {e}\nRaw Result: {raw_result}")
|
145 |
result['valid'] = False
|
146 |
+
|
147 |
+
return result
|