beweinreich committed on
Commit 9323d71
1 Parent(s): 0e61327

switch to redis queue

Files changed (12)
  1. .gitignore +1 -0
  2. README.md +12 -4
  3. algo.py +9 -3
  4. audits/1720098227.csv +1 -0
  5. audits/1720098318.csv +1 -0
  6. db/db_utils.py +10 -3
  7. kill_redis.py +34 -27
  8. redis_queue.py +6 -0
  9. requirements.txt +1 -0
  10. run.py +20 -69
  11. tasks.py +83 -9
  12. tasks.py.orig +0 -44
.gitignore CHANGED
@@ -3,6 +3,7 @@
  *.pyc
  .env
  raw/*
+ raw copy/*
  results/*
  logs/*
  specificity-model/*
README.md CHANGED
@@ -27,12 +27,20 @@ Additionally, it handles various word forms and multi-term descriptions to maint
  ## Running
 
  ```
- # Start celery worker
- celery -A tasks worker --loglevel=info
+ # Start redis queue worker
+ rq worker -c redis_queue
  python run.py
 
- # clear tasks queue
- celery -A tasks purge
+ # view rq-dashboard
+ pip install rq-dashboard
+ rq-dashboard --redis-url REDIS_URL
+ ```
+
+ ## Kill redis connections
+
+ ```
+ heroku redis:cli -a brightly-ai-db
+ CLIENT KILL TYPE normal
  ```
 
  ```
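Both the worker and run.py read REDIS_URL from the environment, so it can help to confirm the URL is reachable before starting `rq worker`. A minimal sketch (not part of the commit), using redis-py's `ping()`:

```
# Sketch: confirm REDIS_URL is reachable before starting the worker.
import os
from dotenv import load_dotenv
from redis import Redis

load_dotenv()
Redis.from_url(os.environ['REDIS_URL']).ping()  # raises if Redis is unreachable
print("Redis reachable")
```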
algo.py CHANGED
@@ -14,13 +14,19 @@ from db.db_utils import store_mapping_to_db, cached_get_mapping_from_db, get_dic
  from ask_gpt import query_gpt
  from multi_food_item_detector import extract_items, has_delimiters
  from mapping_template import empty_template, heterogeneous_template, multi_item_template, nonfood_template, usda_template
- from tasks import insert_result
+ # from tasks import insert_result
  from specificity_classifier import classify_text_to_specificity
 
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
  similarity_threshold = 0.78
 
 
+ def insert_result(db_conn, run_key, mappings):
+     db_cursor = db_conn.cursor()
+     for mapping in mappings:
+         store_result_to_db(db_cursor, db_conn, run_key, mapping)
+
+
  class Algo:
      def __init__(self, db_conn, run_key=None):
          self.db_conn = db_conn
@@ -342,11 +348,11 @@ class Algo:
      results.append(mapping)
 
      if len(result_batch) >= 100:
-         insert_result(self.run_key, result_batch)
+         insert_result(self.db_conn, self.run_key, result_batch)
          result_batch = []
 
      if len(result_batch) > 0:
-         insert_result(self.run_key, result_batch)
+         insert_result(self.db_conn, self.run_key, result_batch)
          result_batch = []
 
 
audits/1720098227.csv ADDED
@@ -0,0 +1 @@
+ input_word,original_dictionary_word,new_dictionary_word
audits/1720098318.csv ADDED
@@ -0,0 +1 @@
+ input_word,original_dictionary_word,new_dictionary_word
db/db_utils.py CHANGED
@@ -9,9 +9,16 @@ load_dotenv()
 
  def get_connection():
      DATABASE_URL = os.environ['DATABASE_URL']
-     conn = psycopg2.connect(DATABASE_URL, sslmode='require')
-     initialize_db(conn)
-     return conn
+     print(f"Connecting to database...")
+     try:
+         conn = psycopg2.connect(DATABASE_URL, sslmode='require')
+         initialize_db(conn)  # Ensure this function is defined and correctly initializes the database if needed
+         print("Database connection established")
+         return conn
+     except Exception as e:
+         print(f"Failed to connect to database: {e}")
+         raise
+
 
  def initialize_db(conn):
      cursor = conn.cursor()
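The wrapped get_connection now logs and re-raises on failure. A minimal usage sketch (not part of the commit), assuming DATABASE_URL is set in .env:

```
# Sketch: exercise get_connection locally as a connectivity check.
from db.db_utils import get_connection

conn = get_connection()      # prints progress, raises if the connection fails
cur = conn.cursor()
cur.execute('SELECT 1')      # trivial round trip
print(cur.fetchone())        # -> (1,)
conn.close()
```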
kill_redis.py CHANGED
@@ -1,37 +1,44 @@
- import os
- import subprocess
- from dotenv import load_dotenv
- from redis import Redis
-
- load_dotenv()
-
- REDIS_URL = os.environ['REDIS_URL']
- redis_client = Redis.from_url(REDIS_URL)
-
- def kill_all_redis_clients():
-     try:
-         # Fetch Redis URL from environment variable
-         if not REDIS_URL:
-             raise ValueError("REDIS_URL is not set in the environment variables.")
-
-         # Extract host and port from the Redis URL
-         redis_host = REDIS_URL.split("//")[-1].split("@")[1].split(":")[0]
-         redis_port = REDIS_URL.split("//")[-1].split("@")[1].split(":")[1]
-
-         # Execute Redis CLI commands to list and kill clients
-         client_list_command = f'redis-cli -h {redis_host} -p {redis_port} CLIENT LIST'
-         client_list_output = subprocess.check_output(client_list_command, shell=True).decode('utf-8')
-
-         client_ids = [line.split(' ')[0].split('=')[1] for line in client_list_output.strip().split('\n') if 'id=' in line]
-
-         for client_id in client_ids:
-             kill_command = f'redis-cli -h {redis_host} -p {redis_port} CLIENT KILL ID {client_id}'
-             subprocess.check_call(kill_command, shell=True)
-
-         print("Successfully killed all Redis clients.")
-
-     except Exception as e:
-         print(f"An error occurred: {e}")
-
- if __name__ == "__main__":
-     kill_all_redis_clients()
+ # heroku redis:cli -a brightly-ai-db
+ # CLIENT KILL TYPE normal
+
+
+ # import os
+ # import subprocess
+ # from dotenv import load_dotenv
+ # from redis import Redis
+
+ # load_dotenv()
+
+ # REDIS_URL = os.environ['REDIS_URL']
+ # redis_client = Redis.from_url(REDIS_URL)
+
+ # def kill_all_redis_clients():
+ #     try:
+ #         # Fetch Redis URL from environment variable
+ #         if not REDIS_URL:
+ #             raise ValueError("REDIS_URL is not set in the environment variables.")
+
+ #         # Extract host and port from the Redis URL
+ #         redis_host = REDIS_URL.split("//")[-1].split("@")[1].split(":")[0]
+ #         redis_port = REDIS_URL.split("//")[-1].split("@")[1].split(":")[1]
+
+ #         # Execute Redis CLI commands to list and kill clients
+ #         client_list_command = f'redis-cli -h {redis_host} -p {redis_port} CLIENT LIST'
+ #         client_list_output = subprocess.check_output(client_list_command, shell=True).decode('utf-8')
+
+ #         client_ids = [line.split(' ')[0].split('=')[1] for line in client_list_output.strip().split('\n') if 'id=' in line]
+
+ #         for client_id in client_ids:
+ #             kill_command = f'redis-cli -h {redis_host} -p {redis_port} CLIENT KILL ID {client_id}'
+ #             subprocess.check_call(kill_command, shell=True)
+
+ #         print("Successfully killed all Redis clients.")
+
+ #     except Exception as e:
+ #         print(f"An error occurred: {e}")
+
+ # if __name__ == "__main__":
+ #     kill_all_redis_clients()
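The script now just records the Heroku CLI commands and comments out the old redis-cli subprocess approach. For reference, a sketch (not part of the commit) of the same cleanup done through redis-py instead of shelling out; it assumes the pinned redis==5.0.7 client exposes client_kill_filter:

```
# Sketch: kill all "normal" Redis client connections via redis-py,
# mirroring `CLIENT KILL TYPE normal`. REDIS_URL comes from .env as elsewhere.
import os
from dotenv import load_dotenv
from redis import Redis

load_dotenv()
r = Redis.from_url(os.environ['REDIS_URL'])

killed = r.client_kill_filter(_type='normal', skipme=True)  # don't kill ourselves
print(f"Killed {killed} client connection(s)")
```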
redis_queue.py ADDED
@@ -0,0 +1,6 @@
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ REDIS_URL = os.environ['REDIS_URL']
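This new module is the settings file that the README's `rq worker -c redis_queue` command imports; RQ reads REDIS_URL from it. A sketch (not part of the commit) of the programmatic equivalent, using RQ's Worker class directly:

```
# Sketch: start a worker in-process instead of via `rq worker -c redis_queue`.
import os
from dotenv import load_dotenv
from redis import Redis
from rq import Queue, Worker

load_dotenv()
redis_conn = Redis.from_url(os.environ['REDIS_URL'])

# Listen on the same 'default' queue that run.py enqueues to.
worker = Worker([Queue('default', connection=redis_conn)], connection=redis_conn)
worker.work()
```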
requirements.txt CHANGED
@@ -13,6 +13,7 @@ psycopg2-binary==2.9.9
  python-dotenv==1.0.1
  python-Levenshtein==0.25.1
  redis==5.0.7
+ rq==1.16.2
  requests==2.32.3
  sentence_transformers==3.0.1
  spacy==3.7.5
run.py CHANGED
@@ -1,14 +1,28 @@
+ # run.py
  import os
  import time
  import cProfile
  import pstats
  import pandas as pd
+ from dotenv import load_dotenv
  from algo import Algo
  from db.db_utils import get_connection
+ from tasks import process_file
+ from redis import Redis
+ from rq import Queue
+
+ load_dotenv()
+
+ REDIS_URL = os.environ['REDIS_URL']
+ WORKER_TIMEOUT = 7200  # 2 hours
+
+ redis_conn = Redis.from_url(REDIS_URL)
+ q = Queue('default', connection=redis_conn)
+
 
  if __name__ == "__main__":
-     db_conn = get_connection()
-     db_cursor = db_conn.cursor()
+     # db_conn = get_connection()
+     # db_cursor = db_conn.cursor()
      # raw_file_name = 'food-forward-2022-raw-data.csv'
      # raw_file_name = 'MFB-2023-raw-data.csv'
 
@@ -20,74 +34,11 @@ if __name__ == "__main__":
      # for raw_file_name in ['sharing-excess-2020-raw-data.csv', 'sharing-excess-2021-raw-data.csv', 'sharing-excess-2022-raw-data.csv', 'sharing-excess-2023-raw-data.csv']:
      # for raw_file_name in ['spoonfuls-2023-Raw-Data.csv']:
      for raw_file_name in raw_files:
-         if not raw_file_name.endswith('.csv'):
-             continue
-
-         # chop off the extension for the results run key
-         # result_file_name = raw_file_name.split('.')[0]
-         # run_key = f"{result_file_name}-{int(time.time())}"
-         run_key = raw_file_name.split('.')[0]
-         print(f"Processing {raw_file_name}")
-
-
-         # Check if the file is in the run_meta table
-         db_cursor.execute('SELECT run_key FROM run_meta WHERE run_key = %s', (run_key,))
-         run_meta_row = db_cursor.fetchone()
-         if not run_meta_row:
-             # prompt the user for the organization_id and year
-             # the user can select from a list of organizations
-             db_cursor.execute('SELECT id, name FROM organizations')
-             organizations = db_cursor.fetchall()
-             for i, org in enumerate(organizations):
-                 print(f"{i+1}. {org[1]}")
-             org_choice = int(input("Select an organization: "))
-
-             organization_id = organizations[org_choice-1][0]
-
-             year = int(input("Enter the year: "))
-
-             db_cursor.execute('INSERT INTO run_meta (run_key, organization_id, year) VALUES (%s, %s, %s)', (run_key, organization_id, year))
-             db_conn.commit()
-
-         # find the number of rows that were already processed associated with this run key
-         db_cursor.execute('SELECT run_row FROM results WHERE run_key = %s ORDER BY run_row DESC', (run_key,))
-
-         # get the last row that was processed
-         last_row = db_cursor.fetchone()
-
-         input_file_path = f'./raw/{raw_file_name}'
-         df_input = pd.read_csv(input_file_path)
-
-         # Convert column headers to lowercase
-         df_input.columns = df_input.columns.str.lower()
-
-         descriptions = df_input['description'].astype(str).tolist()
-         descriptions2 = df_input.get('description2', pd.Series([None] * len(df_input))).astype(str).tolist()
-         donors = df_input['donor'].astype(str).tolist()
-         dates = df_input['date'].astype(str).tolist()
-         weights = df_input['weight'].astype(str).tolist()
-
-         input_data = [(desc, desc2, i + 2, donor, date, weight) for i, (desc, desc2, donor, date, weight) in enumerate(zip(descriptions, descriptions2, donors, dates, weights))]
-
-         # run_row is the the last row from the CSV file that was processed, so let's offset from there
-         num_rows = len(input_data)
-         last_row_num = 0
-         if last_row:
-             last_row_num = last_row[0] - 1
-
-         # num_rows is the total number of rows in the CSV file but we add one row for the header
-         print(f"CSV has {num_rows+1} rows")
-         csv_complete = last_row_num >= num_rows
-         print("CSV is complete" if csv_complete else "CSV is not complete")
-         if not csv_complete:
-             print(f"Starting at row #{last_row_num + 1}")
-
-         input_data = input_data[last_row_num:]
-
-         algo = Algo(db_conn, run_key)
-         algo.match_words(input_data)
+         job = q.enqueue(process_file, raw_file_name, job_timeout=WORKER_TIMEOUT)
+         print(f"Task enqueued with job ID: {job.id}")
 
 
+         # process_file.delay(raw_file_name)
 
      # algo.match_words([['bananas']])
 
-     db_conn.close()
+     # db_conn.close()
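run.py now only enqueues jobs and prints their IDs; the worker does the processing. A sketch (not part of the commit) of checking on an enqueued job from another process, assuming the same REDIS_URL; the job ID placeholder is hypothetical:

```
# Sketch: inspect a job enqueued by run.py using RQ's Job.fetch API.
import os
from dotenv import load_dotenv
from redis import Redis
from rq.job import Job

load_dotenv()
redis_conn = Redis.from_url(os.environ['REDIS_URL'])

job_id = "replace-with-the-id-printed-by-run.py"   # hypothetical placeholder
job = Job.fetch(job_id, connection=redis_conn)
print(job.get_status())                            # queued / started / finished / failed
print(job.enqueued_at, job.started_at, job.ended_at)
```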
tasks.py CHANGED
@@ -1,14 +1,17 @@
  # tasks.py
  import os
  import logging
- from redis import Redis
+ import time
+ import pandas as pd
+ from algo import Algo
  from dotenv import load_dotenv
- from celery import Celery
+ from redis import Redis
+ from rq import Queue
+ # from celery import Celery
  from db.db_utils import get_connection, store_result_to_db
 
  load_dotenv()
 
- # REDIS_URL = os.environ['REDIS_URL']
  # app = Celery('tasks', broker=REDIS_URL, backend=REDIS_URL)
 
  # app.conf.update(
@@ -22,10 +25,81 @@ load_dotenv()
  # )
 
  # @app.task
- db_conn = get_connection()
- db_cursor = db_conn.cursor()
- def insert_result(run_key, mappings):
-     for mapping in mappings:
-         store_result_to_db(db_cursor, db_conn, run_key, mapping)
+ # def insert_result(db_conn, run_key, mappings):
+ #     db_cursor = db_conn.cursor()
+ #     for mapping in mappings:
+ #         store_result_to_db(db_cursor, db_conn, run_key, mapping)
+
+ # @app.task
+ def process_file(raw_file_name):
+     print(f"Processing {raw_file_name}")
+     if not raw_file_name.endswith('.csv'):
+         return
+
+     # chop off the extension for the results run key
+     # result_file_name = raw_file_name.split('.')[0]
+     # run_key = f"{result_file_name}-{int(time.time())}"
+     run_key = raw_file_name.split('.')[0]
+     print(f"Processing {raw_file_name}")
+
+     db_conn = get_connection()
+     db_cursor = db_conn.cursor()
+
+     print("obtained db connection")
+
+     # Check if the file is in the run_meta table
+     db_cursor.execute('SELECT run_key FROM run_meta WHERE run_key = %s', (run_key,))
+     run_meta_row = db_cursor.fetchone()
+     if not run_meta_row:
+         # prompt the user for the organization_id and year
+         # the user can select from a list of organizations
+         db_cursor.execute('SELECT id, name FROM organizations')
+         organizations = db_cursor.fetchall()
+         for i, org in enumerate(organizations):
+             print(f"{i+1}. {org[1]}")
+         org_choice = int(input("Select an organization: "))
+
+         organization_id = organizations[org_choice-1][0]
+
+         year = int(input("Enter the year: "))
+
+         db_cursor.execute('INSERT INTO run_meta (run_key, organization_id, year) VALUES (%s, %s, %s)', (run_key, organization_id, year))
+         db_conn.commit()
+
+     # find the number of rows that were already processed associated with this run key
+     db_cursor.execute('SELECT run_row FROM results WHERE run_key = %s ORDER BY run_row DESC', (run_key,))
+
+     # get the last row that was processed
+     last_row = db_cursor.fetchone()
+
+     input_file_path = f'./raw/{raw_file_name}'
+     df_input = pd.read_csv(input_file_path)
+
+     # Convert column headers to lowercase
+     df_input.columns = df_input.columns.str.lower()
+
+     descriptions = df_input['description'].astype(str).tolist()
+     descriptions2 = df_input.get('description2', pd.Series([None] * len(df_input))).astype(str).tolist()
+     donors = df_input['donor'].astype(str).tolist()
+     dates = df_input['date'].astype(str).tolist()
+     weights = df_input['weight'].astype(str).tolist()
+
+     input_data = [(desc, desc2, i + 2, donor, date, weight) for i, (desc, desc2, donor, date, weight) in enumerate(zip(descriptions, descriptions2, donors, dates, weights))]
+
+     # run_row is the the last row from the CSV file that was processed, so let's offset from there
+     num_rows = len(input_data)
+     last_row_num = 0
+     if last_row:
+         last_row_num = last_row[0] - 1
+
+     # num_rows is the total number of rows in the CSV file but we add one row for the header
+     print(f"CSV has {num_rows+1} rows")
+     csv_complete = last_row_num >= num_rows
+     print("CSV is complete" if csv_complete else "CSV is not complete")
+     if not csv_complete:
+         print(f"Starting at row #{last_row_num + 1}")
+
+     input_data = input_data[last_row_num:]
 
- # db_conn.close()
+     algo = Algo(db_conn, run_key)
+     algo.match_words(input_data)
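The per-file logic that used to live in run.py now runs inside the worker as process_file. A sketch (not part of the commit) of running the task synchronously for local debugging, bypassing the queue entirely; the file name is illustrative and the interactive input() prompts only work when this is run in a terminal:

```
# Sketch: call the RQ task directly, without a worker, for debugging.
from tasks import process_file

process_file('spoonfuls-2023-Raw-Data.csv')  # reads ./raw/spoonfuls-2023-Raw-Data.csv
```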
tasks.py.orig DELETED
@@ -1,44 +0,0 @@
- # tasks.py
- import os
- import logging
- from redis import Redis
- from dotenv import load_dotenv
- from celery import Celery
- from db.db_utils import get_connection, store_result_to_db
-
- load_dotenv()
-
- # REDIS_URL = os.environ['REDIS_URL']
- # app = Celery('tasks', broker=REDIS_URL, backend=REDIS_URL)
-
- # app.conf.update(
-     # result_expires=3600,
-     # task_serializer='json',
-     # result_serializer='json',
-     # accept_content=['json'],
-     # timezone='UTC',
-     # enable_utc=True,
-     # broker_connection_retry_on_startup=True
- # )
- <<<<<<< HEAD
-
- # @app.task
- db_conn = get_connection()
- db_cursor = db_conn.cursor()
-
- =======
-
- # @app.task
- db_conn = get_connection()
- db_cursor = db_conn.cursor()
- >>>>>>> e938c9da41dfd18544b4ea8aa7107f56cf05b2f2
- def insert_result(run_key, mappings):
-     for mapping in mappings:
-         store_result_to_db(db_cursor, db_conn, run_key, mapping)
-
- <<<<<<< HEAD
-
- # db_conn.close()
- =======
- # db_conn.close()
- >>>>>>> e938c9da41dfd18544b4ea8aa7107f56cf05b2f2