mathtext-wormhole-staging

Build error

App Files Files Community

cetinca committed on Jan 24, 2023

Commit

b31816e

•

1 Parent(s): ba14999

Draft: Dev package

Browse files

Files changed (11) hide show

.gitlab-ci.yml +4 -4
app.py +5 -4
data/master_test_text2int.csv +0 -90
{modules → mathtext_fastapi}/__init__.py +0 -0
{data → mathtext_fastapi/data}/text2int_results.csv +3 -1
{modules → mathtext_fastapi}/nlu.py +9 -14
modules/sentiment.py +0 -8
modules/text2int.py +0 -192
pyproject.toml +43 -0
requirements.txt +3 -12
tests/test_text2int.py +11 -6

.gitlab-ci.yml CHANGED Viewed

@@ -1,14 +1,14 @@
 # Official Python language image.
-test_py39:
-  image: python:3.9
   before_script:
     - python -v
     - pip install -r requirements.txt
   script:
     - pytest --verbose
-test_py38:
-  image: python:3.8
   before_script:
     - python -v
     - pip install -r requirements.txt

 # Official Python language image.
+test_py38:
+  image: python:3.8
   before_script:
     - python -v
     - pip install -r requirements.txt
   script:
     - pytest --verbose
+test_py39:
+  image: python:3.9
   before_script:
     - python -v
     - pip install -r requirements.txt

app.py CHANGED Viewed

@@ -6,11 +6,11 @@ from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse
 from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
-from pydantic import BaseModel
-from modules.nlu import prepare_message_data_for_logging
 from mathtext.sentiment import sentiment
 from mathtext.text2int import text2int
 app = FastAPI()
@@ -67,7 +67,7 @@ async def evaluate_user_message_with_nlu_api(request: Request):
     int_api_resp = text2int(message_text)
-    if int_api_resp == '32202':
         sentiment_api_resp = sentiment(message_text)
         # [{'label': 'POSITIVE', 'score': 0.991188645362854}]
         sent_data_dict = {'type': 'sentiment', 'data': sentiment_api_resp[0]['label']}
@@ -76,4 +76,5 @@ async def evaluate_user_message_with_nlu_api(request: Request):
     prepare_message_data_for_logging(message_data)
     int_data_dict = {'type': 'integer', 'data': int_api_resp}
     return JSONResponse(content=int_data_dict)

 from fastapi.responses import JSONResponse
 from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
 from mathtext.sentiment import sentiment
 from mathtext.text2int import text2int
+from pydantic import BaseModel
+from mathtext_fastapi.nlu import prepare_message_data_for_logging
 app = FastAPI()
     int_api_resp = text2int(message_text)
+    if int_api_resp == 32202:
         sentiment_api_resp = sentiment(message_text)
         # [{'label': 'POSITIVE', 'score': 0.991188645362854}]
         sent_data_dict = {'type': 'sentiment', 'data': sentiment_api_resp[0]['label']}
     prepare_message_data_for_logging(message_data)
     int_data_dict = {'type': 'integer', 'data': int_api_resp}
     return JSONResponse(content=int_data_dict)

data/master_test_text2int.csv DELETED Viewed

@@ -1,90 +0,0 @@
-input,output
-fourteen,14
-forteen,14
-one thousand four hundred ninety two,1492
-one thousand ninety two,1092
-Fourteen Hundred Ninety-Two,1492
-Fourteen Hundred,1400
-Ninety nine,99
-fifteen thousand five hundred-sixty,15560
-three hundred fifty,350
-one nine eight five,1985
-nineteen eighty-five,1985
-oh one,1
-six oh 1,601
-sex,6
-six,6
-eight oh,80
-eighty,80
-ate,8
-double eight,88
-eight three seven five three O nine,8375309
-eight three seven five three oh nine,8375309
-eight three seven five three zero nine,8375309
-eight three seven five three oh ni-ee-ine,8375309
-two eight,28
-seven oh eleven,7011
-seven elevens,77
-seven eleven,711
-ninety nine oh five,9905
-seven 0 seven 0 seven 0 seven,7070707
-123 hundred,123000
-5 o 5,505
-15 o 5,1505
-15-o 5,1505
-15 o-5,1505
-911-thousand,911000
-twenty-two twenty-two,2222
-twenty-two twenty-twos,484
-four eighty four,484
-four eighties,320
-four eighties and nine nineties,1130
-ninety nine hundred and seventy seven,9977
-seven thousands,7000
-2 hundreds,200
-99 thousands and one,99001
-"forty-five thousand, seven hundred and nine",45709
-eighty eight hundred eighty,8880
-a hundred hundred,10000
-a hundred thousand,100000
-a hundred million,100000000
-nineteen ninety nine,1999
-forteen twenty seven,1427
-seventeen-thousand and seventy two,17072
-two hundred and nine,209
-two thousand ten,2010
-two thousand and ten,2010
-twelve million,12000000
-8 billion,8000000000
-twenty ten,2010
-thirty-two hundred,3200
-nine,9
-forty two,42
-1 2 three,123
-fourtean,14
-one tousand four hundred ninty two,1492
-Furteen Hundrd Ninety-Too,1492
-forrteen,14
-sevnteen-thosand and seventy two,17072
-ninety nine hundred ad seventy seven,9977
-seven thusands,7000
-2 hunreds,200
-99 tousands and one,99001
-eighty ate hundred eighty,8880
-fourteen Hundred,1400
-8 Bilion,8000000000
-one million three thousand one,1003001
-four million nine thousand seven,4009007
-two million five hundred thousand,2500000
-two tousand ten,2010
-two thousand teen,2010
-tvelve milion,12000000
-tventy ten,2010
-tirty-twoo hunred,3200
-sevn thoosands,7000
-five,5
-ten,10
-one two three and ten,12310
-ONE MILLion three hunded and fiv,1000305
-"50,500 and six",50506
-one_million_and_five,1000005

{modules → mathtext_fastapi}/__init__.py RENAMED Viewed

File without changes

{data → mathtext_fastapi/data}/text2int_results.csv RENAMED Viewed

@@ -1,4 +1,6 @@
 input,output,text2int,score
 fourteen,14,14,True
 forteen,14,14,True
 one thousand four hundred ninety two,1492,1492,True
@@ -21,7 +23,7 @@ double eight,88,32202,False
 eight three seven five three O nine,8375309,8375329,False
 eight three seven five three oh nine,8375309,8375309,True
 eight three seven five three zero nine,8375309,8375309,True
-eight three seven five three oh ni-ee-ine,8375309,837530619,False
 two eight,28,16,False
 seven oh eleven,7011,77,False
 seven elevens,77,77,True

 input,output,text2int,score
+notanumber,32202,32202,True
+this is not a number,32202,32202,True
 fourteen,14,14,True
 forteen,14,14,True
 one thousand four hundred ninety two,1492,1492,True
 eight three seven five three O nine,8375309,8375329,False
 eight three seven five three oh nine,8375309,8375309,True
 eight three seven five three zero nine,8375309,8375309,True
+eight three seven five three oh ni-ee-ine,8375309,837530611,False
 two eight,28,16,False
 seven oh eleven,7011,77,False
 seven elevens,77,77,True

{modules → mathtext_fastapi}/nlu.py RENAMED Viewed

@@ -1,18 +1,13 @@
-import environ
-import json
 import os
-import requests
 from datetime import datetime
 from supabase import create_client
-BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-env = environ.Env()
-env_path = os.path.join(BASE_DIR, '.env')
-environ.Env.read_env('.env')
-SUPA = create_client(env('SUPABASE_URL'), env('SUPABASE_KEY'))
 def log_message_data_through_supabase_api(table_name, log_data):
     return SUPA.table(table_name).insert(log_data).execute()
@@ -28,19 +23,19 @@ def prepare_message_data_for_logging(message_data):
         # Autogenerated fields: id, created_at, modified_at
     }
     project_data_log = log_message_data_through_supabase_api('project', project_data)
     contact_data = {
-        'project': project_data_log.data[0]['id'], # FK
         'original_contact_id': message_data['message']['_vnd']['v1']['chat']['contact_uuid'],
         'urn': "",
         'language_code': "en",
         'contact_inserted_at': format_datetime_in_isoformat(datetime.now())
-        # Autogenerated fields: id, created_at, modified_at
     }
     contact_data_log = log_message_data_through_supabase_api('contact', contact_data)
     message_data = {
-        'contact': contact_data_log.data[0]['id'], # FK
         'original_message_id': message_data['message']['id'],
         'text': message_data['message']['text']['body'],
         'direction': message_data['message']['_vnd']['v1']['direction'],
@@ -49,6 +44,6 @@ def prepare_message_data_for_logging(message_data):
         'message_inserted_at': message_data['message']['_vnd']['v1']['chat']['inserted_at'],
         'message_modified_at': message_data['message']['_vnd']['v1']['chat']['updated_at'],
         'message_sent_at': format_datetime_in_isoformat(datetime.now())
-        # Autogenerated fields: created_at, modified_at
     }
     message_data_log = log_message_data_through_supabase_api('message', message_data)

 import os
 from datetime import datetime
+from dotenv import load_dotenv
 from supabase import create_client
+load_dotenv()
+SUPA = create_client(os.environ.get('SUPABASE_URL'), os.environ.get('SUPABASE_KEY'))
 def log_message_data_through_supabase_api(table_name, log_data):
     return SUPA.table(table_name).insert(log_data).execute()
         # Autogenerated fields: id, created_at, modified_at
     }
     project_data_log = log_message_data_through_supabase_api('project', project_data)
     contact_data = {
+        'project': project_data_log.data[0]['id'],  # FK
         'original_contact_id': message_data['message']['_vnd']['v1']['chat']['contact_uuid'],
         'urn': "",
         'language_code': "en",
         'contact_inserted_at': format_datetime_in_isoformat(datetime.now())
+        # Autogenerated fields: id, created_at, modified_at
     }
     contact_data_log = log_message_data_through_supabase_api('contact', contact_data)
     message_data = {
+        'contact': contact_data_log.data[0]['id'],  # FK
         'original_message_id': message_data['message']['id'],
         'text': message_data['message']['text']['body'],
         'direction': message_data['message']['_vnd']['v1']['direction'],
         'message_inserted_at': message_data['message']['_vnd']['v1']['chat']['inserted_at'],
         'message_modified_at': message_data['message']['_vnd']['v1']['chat']['updated_at'],
         'message_sent_at': format_datetime_in_isoformat(datetime.now())
+        # Autogenerated fields: created_at, modified_at
     }
     message_data_log = log_message_data_through_supabase_api('message', message_data)

modules/sentiment.py DELETED Viewed

@@ -1,8 +0,0 @@
-from transformers import pipeline
-sentiment_obj = pipeline(task="sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
-def sentiment(text):
-    # Returns sentiment value
-    return sentiment_obj(text)

modules/text2int.py DELETED Viewed

@@ -1,192 +0,0 @@
-import spacy  # noqa
-# import os
-# os.environ['KMP_DUPLICATE_LIB_OK']='True'
-# import spacy
-# Change this according to what words should be corrected to
-SPELL_CORRECT_MIN_CHAR_DIFF = 2
-TOKENS2INT_ERROR_INT = 32202
-ONES = [
-    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
-    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
-    "sixteen", "seventeen", "eighteen", "nineteen",
-]
-CHAR_MAPPING = {
-    "-": " ",
-    "_": " ",
-    "and": " ",
-}
-# CHAR_MAPPING.update((str(i), word) for i, word in enumerate([" " + s + " " for s in ONES]))
-TOKEN_MAPPING = {
-    "and": " ",
-    "oh": "0",
-}
-def find_char_diff(a, b):
-    # Finds the character difference between two str objects by counting the occurences of every character. Not edit distance.
-    char_counts_a = {}
-    char_counts_b = {}
-    for char in a:
-        if char in char_counts_a.keys():
-            char_counts_a[char] += 1
-        else:
-            char_counts_a[char] = 1
-    for char in b:
-        if char in char_counts_b.keys():
-            char_counts_b[char] += 1
-        else:
-            char_counts_b[char] = 1
-    char_diff = 0
-    for i in char_counts_a:
-        if i in char_counts_b.keys():
-            char_diff += abs(char_counts_a[i] - char_counts_b[i])
-        else:
-            char_diff += char_counts_a[i]
-    return char_diff
-def tokenize(text):
-    text = text.lower()
-    # print(text)
-    text = replace_tokens(''.join(i for i in replace_chars(text)).split())
-    # print(text)
-    text = [i for i in text if i != ' ']
-    # print(text)
-    output = []
-    for word in text:
-        # print(word)
-        output.append(convert_word_to_int(word))
-    output = [i for i in output if i != ' ']
-    # print(output)
-    return output
-def detokenize(tokens):
-    return ' '.join(tokens)
-def replace_tokens(tokens, token_mapping=TOKEN_MAPPING):
-    return [token_mapping.get(tok, tok) for tok in tokens]
-def replace_chars(text, char_mapping=CHAR_MAPPING):
-    return [char_mapping.get(c, c) for c in text]
-def convert_word_to_int(in_word, numwords={}):
-    # Converts a single word/str into a single int
-    tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
-    scales = ["hundred", "thousand", "million", "billion", "trillion"]
-    if not numwords:
-        for idx, word in enumerate(ONES):
-            numwords[word] = idx
-        for idx, word in enumerate(tens):
-            numwords[word] = idx * 10
-        for idx, word in enumerate(scales):
-            numwords[word] = 10 ** (idx * 3 or 2)
-    if in_word in numwords:
-        # print(in_word)
-        # print(numwords[in_word])
-        return numwords[in_word]
-    try:
-        int(in_word)
-        return int(in_word)
-    except ValueError:
-        pass
-    # Spell correction using find_char_diff
-    char_diffs = [find_char_diff(in_word, i) for i in ONES + tens + scales]
-    min_char_diff = min(char_diffs)
-    if min_char_diff <= SPELL_CORRECT_MIN_CHAR_DIFF:
-        return char_diffs.index(min_char_diff)
-def tokens2int(tokens):
-    # Takes a list of tokens and returns a int representation of them
-    types = []
-    for i in tokens:
-        if i <= 9:
-            types.append(1)
-        elif i <= 90:
-            types.append(2)
-        else:
-            types.append(3)
-    # print(tokens)
-    if len(tokens) <= 3:
-        current = 0
-        for i, number in enumerate(tokens):
-            if i != 0 and types[i] < types[i - 1] and current != tokens[i - 1] and types[i - 1] != 3:
-                current += tokens[i] + tokens[i - 1]
-            elif current <= tokens[i] and current != 0:
-                current *= tokens[i]
-            elif 3 not in types and 1 not in types:
-                current = int(''.join(str(i) for i in tokens))
-                break
-            elif '111' in ''.join(str(i) for i in types) and 2 not in types and 3 not in types:
-                current = int(''.join(str(i) for i in tokens))
-                break
-            else:
-                current += number
-    elif 3 not in types and 2 not in types:
-        current = int(''.join(str(i) for i in tokens))
-    else:
-        """
-        double_list = []
-        current_double = []
-        double_type_list = []
-        for i in tokens:
-            if len(current_double) < 2:
-                current_double.append(i)
-            else:
-                double_list.append(current_double)
-                current_double = []
-        current_double = []
-        for i in types:
-            if len(current_double) < 2:
-                current_double.append(i)
-            else:
-                double_type_list.append(current_double)
-                current_double = []
-        print(double_type_list)
-        print(double_list)
-        current = 0
-        for i, type_double in enumerate(double_type_list):
-            if len(type_double) == 1:
-                current += double_list[i][0]
-            elif type_double[0] == type_double[1]:
-                current += int(str(double_list[i][0]) + str(double_list[i][1]))
-            elif type_double[0] > type_double[1]:
-                current += sum(double_list[i])
-            elif type_double[0] < type_double[1]:
-                current += double_list[i][0] * double_list[i][1]
-        #print(current)
-        """
-        count = 0
-        current = 0
-        for i, token in enumerate(tokens):
-            count += 1
-            if count == 2:
-                if types[i - 1] == types[i]:
-                    current += int(str(token) + str(tokens[i - 1]))
-                elif types[i - 1] > types[i]:
-                    current += tokens[i - 1] + token
-                else:
-                    current += tokens[i - 1] * token
-                count = 0
-            elif i == len(tokens) - 1:
-                current += token
-    return current
-def text2int(text):
-    # Wraps all of the functions up into one
-    return tokens2int(tokenize(text))

pyproject.toml ADDED Viewed

	@@ -0,0 +1,43 @@

+[tool.poetry]
+name = "MathText_FastAPI"
+version = "0.0.1"
+authors = [
+  "Sebastian Larsen <sebastianlarson22@gmail.com>",
+  "Çetin ÇAKIR <cetincakirtr@gmail.com>",
+  "Hobson Lane <gitlab@totalgood.com>",
+  ]
+description = "Natural Language Understanding (text processing) for math symbols, digits, and words with a Gradio user interface and REST API."
+readme = "README.md"
+# requires-python = ">=3.8"
+license = "AGPL-3.0-or-later"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
+    "Operating System :: OS Independent",
+]
+[tool.poetry.dependencies]
+mathtext = {git = "https://gitlab.com/tangibleai/community/mathtext", rev = "main"}
+fastapi = "0.74.*"
+pydantic = "*"
+python = "^3.8,<3.10"
+requests = "2.27.*"
+sentencepiece = "0.1.*"
+supabase = "*"
+uvicorn = "0.17.*"
+[tool.poetry.group.dev.dependencies]
+pytest = "^7.2"
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+# [build-system]
+# requires = ["hatchling"]
+# build-backend = "hatchling.build"
+# repository = "https://gitlab.com/tangibleai/community/mathtext-fastapi"

requirements.txt CHANGED Viewed

@@ -1,16 +1,7 @@
 fastapi==0.74.*
 requests==2.27.*
 sentencepiece==0.1.*
-torch==1.12.*
-transformers==4.24.*
-uvicorn[standard]==0.17.*
-pydantic
-mathtext @ git+https://gitlab.com/tangibleai/community/mathtext@main
-spacy==3.4.*
-pandas==1.5.*
-matplotlib==3.6.*
-pytest==7.2.*
-httpx==0.23.*
-django-environ
 supabase

+mathtext @ git+https://gitlab.com/tangibleai/community/mathtext@main
 fastapi==0.74.*
+pydantic==1.10.*
 requests==2.27.*
 sentencepiece==0.1.*
 supabase
+uvicorn==0.17.*

tests/test_text2int.py CHANGED Viewed

@@ -1,11 +1,16 @@
 import unittest
 import pandas as pd
 from fastapi.testclient import TestClient
 from app import app
-TEST_DATA_FILE = "data/master_test_text2int.csv"
 client = TestClient(app)
@@ -15,6 +20,7 @@ class TestStringMethods(unittest.TestCase):
     def setUp(self):
         """Creates a fastapi test client"""
         self.client = TestClient(app)
     def get_response_text2int(self, text):
         """Makes a post request to the endpoint"""
@@ -35,15 +41,14 @@ class TestStringMethods(unittest.TestCase):
     def test_acc_score_text2int(self):
         """Calculates accuracy score for endpoint"""
-        df = pd.read_csv(TEST_DATA_FILE)
-        df["text2int"] = df["input"].apply(func=self.get_response_text2int)
-        df["score"] = df[["output", "text2int"]].apply(
             lambda row: row[0] == row[1],
             axis=1
         )
-        df.to_csv("data/text2int_results.csv", index=False)
-        acc_score = df["score"].mean().__round__(2)
         self.assertGreaterEqual(acc_score, 0.5, f"Accuracy score: '{acc_score}'. Value is too low!")

 import unittest
+from pathlib import Path
 import pandas as pd
 from fastapi.testclient import TestClient
 from app import app
+# The raw file URL has to be used for GitLab.
+URL = "https://gitlab.com/tangibleai/community/mathtext/-/raw/main/mathtext/data/master_test_text2int.csv"
+DATA_DIR = Path(__file__).parent.parent / "mathtext_fastapi" / "data"
+print(DATA_DIR)
 client = TestClient(app)
     def setUp(self):
         """Creates a fastapi test client"""
         self.client = TestClient(app)
+        self.df = pd.read_csv(URL)
     def get_response_text2int(self, text):
         """Makes a post request to the endpoint"""
     def test_acc_score_text2int(self):
         """Calculates accuracy score for endpoint"""
+        self.df["text2int"] = self.df["input"].apply(func=self.get_response_text2int)
+        self.df["score"] = self.df[["output", "text2int"]].apply(
             lambda row: row[0] == row[1],
             axis=1
         )
+        self.df.to_csv(f"{DATA_DIR}/text2int_results.csv", index=False)
+        acc_score = self.df["score"].mean().__round__(2)
         self.assertGreaterEqual(acc_score, 0.5, f"Accuracy score: '{acc_score}'. Value is too low!")