taners mertcobanov committed on
Commit
3ec8945
·
0 Parent(s):

Duplicate from deprem-ml/deprem-ocr

Browse files

Co-authored-by: Mert Cobanov <mertcobanov@users.noreply.huggingface.co>

Files changed (7) hide show
  1. .gitignore +162 -0
  2. README.md +13 -0
  3. app.py +174 -0
  4. db_utils.py +41 -0
  5. openai_api.py +31 -0
  6. requirements.txt +5 -0
  7. utils.py +53 -0
.gitignore ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+
162
+ .DS_Store
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Deprem OCR
3
+ emoji: 👀
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 3.17.0
8
+ app_file: app.py
9
+ pinned: true
10
+ duplicated_from: deprem-ml/deprem-ocr
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import ImageFilter, Image
2
+ from easyocr import Reader
3
+ import gradio as gr
4
+ import numpy as np
5
+ import openai
6
+ import ast
7
+ import os
8
+
9
+ from openai_api import OpenAI_API
10
+ import utils
11
+
12
# The OpenAI key is taken from the API_KEY environment variable (None when unset).
openai.api_key = os.getenv("API_KEY")
# Single EasyOCR reader for the "tr" language list, built once at import time
# and shared by every OCR request.
reader = Reader(["tr"])
14
+
15
+
16
def get_text(input_img):
    """Run OCR on an image array and return the recognised text as one string.

    The array is wrapped in a PIL image, passed through PIL's ``DETAIL``
    filter, converted back to an array and handed to the module-level
    EasyOCR ``reader``. The recognised paragraphs are joined with spaces.
    """
    sharpened = Image.fromarray(input_img).filter(ImageFilter.DETAIL)
    paragraphs = reader.readtext(np.asarray(sharpened), detail=0, paragraph=True)
    return " ".join(paragraphs)
21
+
22
+
23
+ # Submit button
24
def get_parsed_address(input_img):
    """OCR the uploaded image and return the address parsed from its text.

    Chains ``get_text`` (OCR) into ``openai_response`` (address extraction).
    """
    return openai_response(get_text(input_img))
28
+
29
+
30
def save_deta_db(input):
    """Parse the Python literal held in ``input`` and store it via ``utils.write_db``.

    ``input`` is a string; it is decoded with ``ast.literal_eval`` before
    being written. Nothing is returned.
    """
    utils.write_db(ast.literal_eval(input))
34
+
35
+
36
def update_component():
    """Return a Gradio update that shows the confirmation message."""
    confirmation = "Gönderildi, teşekkürler."
    return gr.update(value=confirmation, visible=True)
38
+
39
+
40
def clear_textbox(value):
    """Return a Gradio update that empties a textbox; ``value`` is ignored."""
    emptied = gr.update(value="")
    return emptied
42
+
43
+
44
+ # Open API on change
45
def text_dict(input):
    """Split the parsed-address literal into the eight individual field strings.

    ``input`` is a string holding a dict literal. It is decoded with
    ``ast.literal_eval`` and each field is converted with ``str``. The tuple
    order is: city, distinct, neighbourhood, street, address, tel,
    name_surname, no. A missing key raises ``KeyError``.
    """
    parsed = ast.literal_eval(input)
    field_order = (
        "city",
        "distinct",
        "neighbourhood",
        "street",
        "address",
        "tel",
        "name_surname",
        "no",
    )
    return tuple(str(parsed[field]) for field in field_order)
57
+
58
+
59
def openai_response(ocr_input):
    """Extract structured address fields from free text with the completion model.

    A few-shot prompt is built around ``ocr_input`` and sent through
    ``OpenAI_API.single_request``. The first completion's text is expected to
    be a Python dict literal.

    Args:
        ocr_input: Raw text (typed by the user or produced by OCR).

    Returns:
        dict with the keys city, distinct, neighbourhood, street, no, tel,
        name_surname, address and input. Keys the model did not return are
        filled with an empty string; ``input`` holds ``ocr_input``.

    Raises:
        ValueError, SyntaxError: if the completion is not a valid Python
            literal (raised by ``ast.literal_eval``).
    """
    prompt = f"""Tabular Data Extraction You are a highly intelligent and accurate tabular data extractor from
plain text input and especially from emergency text that carries address information, your inputs can be text
of arbitrary size, but the output should be in [{{'tabular': {{'entity_type': 'entity'}} }}] JSON format Force it
to only extract keys that are shared as an example in the examples section, if a key value is not found in the
text input, then it should be ignored. Have only city, distinct, neighbourhood,
street, no, tel, name_surname, address Examples:

Input: Deprem sırasında evimizde yer alan adresimiz: İstanbul, Beşiktaş, Yıldız Mahallesi, Cumhuriyet Caddesi No: 35, cep telefonu numaram 5551231256, adim Ahmet Yilmaz
Output: {{'city': 'İstanbul', 'distinct': 'Beşiktaş', 'neighbourhood': 'Yıldız Mahallesi', 'street': 'Cumhuriyet Caddesi', 'no': '35', 'tel': '5551231256', 'name_surname': 'Ahmet Yılmaz', 'address': 'İstanbul, Beşiktaş, Yıldız Mahallesi, Cumhuriyet Caddesi No: 35'}}

Input: 5.29 PMO $ 0 87 DEVREMİZ ÖZGÜR ORÇAN ARKADAŞIMIZA ULAŞAMIYORUZ BEYOĞLU MAH FEVZİ ÇAKMAK CAD. NO.58-TÜRKOĞLUI KAHRAMANMARAŞ 5524357578 AdReSe YaKIN OLANLAR VEYA ULASANLAR LÜTFEN BiLGILENDIRSIN .
Output: {{'city': 'Kahramanmaraş', 'distinct': 'Türkoğlu', 'neighbourhood': 'Beyoğlu Mahallesi', 'street': 'Çakmak Caddesi', 'no': '58', 'tel': '5524357578', 'name_surname': 'Özgür Orçan', 'address': 'Beyoğlu Mahallesi, Çakmak Caddesi, No:58 Türkoğlu/Kahramanmaraş'}}

Input: Ahmet @ozknhmt Ekim 2021 tarihinde katıldı - 2 Takipçi Takip ettiğin kimse takip etmiyor AKEVLER MAH. 432SK RÜYA APT ANT(BEDİİ SABUNCU KARŞISI) ANTAKYA HATAY MERVE BELANLI ses veriyor ancak hiçbiryardım ekibi olmadığı için kurtaramryoruz içeri girip, lütfen acil yardım_ İsim: Merve Belanlı tel 542 757 5484 Ö0 12.07
Output: {{'city': 'Hatay', 'distinct': 'Antakya', 'neighbourhood': 'Akevler Mahallesi', 'street': '432 Sokak', 'no': '', 'tel': '5427575484', 'name_surname': 'Merve Belanlı', 'address': 'Akevler Mahallesi, 432 Sokak, Rüya Apt. Antakya/Hatay'}}

Input: 14:04 Sümerler Cemil Şükrü Çolokoğlu ilköğretim okulu karşısı 3 9öçük altında yardım bekyouk Lütfen herkes paylogsın
Output: {{'city': '', 'distinct': '', 'neighbourhood': 'Sümerler Mahallesi', 'street': 'Cemil Şükrü Çolokoğlu İlköğretim Okulu Karşısı', 'no': '', 'tel': '', 'name_surname': '', 'address': 'Sümerler Mahallesi, Cemil Şükrü Çolokoğlu İlköğretim Okulu Karşısı'}}

Input: {ocr_input}
Output:
"""

    openai_client = OpenAI_API()
    response = openai_client.single_request(prompt)
    resp = response["choices"][0]["text"]
    print(resp)

    # SECURITY: this used to be eval(). The completion is driven by
    # user-supplied text, so evaluating it as code lets a crafted input run
    # arbitrary Python. literal_eval accepts only literals (dict, str, ...).
    # The strip() avoids an IndentationError on leading whitespace, which
    # older Python versions do not remove inside literal_eval.
    literal = resp.replace("'{", "{").replace("}'", "}").strip()
    resp = ast.literal_eval(literal)
    resp["input"] = ocr_input

    # Guarantee that every field the UI reads is present.
    expected_keys = (
        "city",
        "distinct",
        "neighbourhood",
        "street",
        "no",
        "tel",
        "name_surname",
        "address",
        "input",
    )
    for key in expected_keys:
        resp.setdefault(key, "")
    return resp
104
+
105
+
106
+ # User Interface
107
# Build the Gradio interface. Components and their event listeners are
# declared inside the Blocks context, in the order they appear on the page.
with gr.Blocks() as demo:
    # Page title followed by a short user-facing description (Turkish).
    gr.Markdown(
        """
# Enkaz Bildirme Uygulaması
"""
    )
    gr.Markdown(
        "Bu uygulamada ekran görüntüsü sürükleyip bırakarak AFAD'a enkaz bildirimi yapabilirsiniz. Mesajı metin olarak da girebilirsiniz, tam adresi ayrıştırıp döndürür. API olarak kullanmak isterseniz sayfanın en altında use via api'ya tıklayın."
    )
    # Two input columns: screenshot upload and free-text entry, each with its
    # own trigger button.
    with gr.Row():
        with gr.Column():
            img_area = gr.Image(label="Ekran Görüntüsü yükleyin 👇")
            img_area_button = gr.Button(value="Görüntüyü İşle", label="Submit")

        with gr.Column():
            text_area = gr.Textbox(label="Metin yükleyin 👇 ", lines=8)
            text_area_button = gr.Button(value="Metni Yükle", label="Submit")

    # Receives the parsed result from either button; its change event fills
    # the individual field textboxes below.
    open_api_text = gr.Textbox(label="Tam Adres")

    # Editable textboxes, one per parsed address field.
    with gr.Column():
        with gr.Row():
            city = gr.Textbox(label="İl", interactive=True, show_progress=False)
            distinct = gr.Textbox(label="İlçe", interactive=True, show_progress=False)
        with gr.Row():
            neighbourhood = gr.Textbox(
                label="Mahalle", interactive=True, show_progress=False
            )
            street = gr.Textbox(
                label="Sokak/Cadde/Bulvar", interactive=True, show_progress=False
            )
        with gr.Row():
            tel = gr.Textbox(label="Telefon", interactive=True, show_progress=False)
        with gr.Row():
            name_surname = gr.Textbox(
                label="İsim Soyisim", interactive=True, show_progress=False
            )
            address = gr.Textbox(label="Adres", interactive=True, show_progress=False)
        with gr.Row():
            no = gr.Textbox(label="Kapı No", interactive=True, show_progress=False)

    # Image path: OCR + address extraction, also exposed under the API name
    # "upload-image".
    img_area_button.click(
        get_parsed_address,
        inputs=img_area,
        outputs=open_api_text,
        api_name="upload-image",
    )

    # Text path: address extraction only, exposed under the API name
    # "upload-text".
    text_area_button.click(
        openai_response, text_area, open_api_text, api_name="upload-text"
    )

    # Whenever the parsed text changes, split it into the field textboxes.
    # The output order must match the tuple order returned by text_dict.
    open_api_text.change(
        text_dict,
        open_api_text,
        [city, distinct, neighbourhood, street, address, tel, name_surname, no],
    )

    # The submit button has several listeners: store the parsed text, reveal
    # the confirmation box, and clear every field textbox.
    submit_button = gr.Button(value="Veriyi Birimlere Yolla")
    submit_button.click(save_deta_db, open_api_text)
    # Created hidden; update_component makes it visible with a message.
    done_text = gr.Textbox(label="Done", value="Not Done", visible=False)
    submit_button.click(update_component, outputs=done_text)
    for txt in [city, distinct, neighbourhood, street, address, tel, name_surname, no]:
        submit_button.click(fn=clear_textbox, inputs=txt, outputs=txt)


# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
db_utils.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from deta import Deta # Import Deta
2
+ from pprint import pprint
3
+ import os
4
+
5
# The Deta project key is read from the DETA_KEY environment variable.
deta_key = os.getenv("DETA_KEY")
deta = Deta(deta_key)
# Module-level handle to the "deprem-ocr" Base; get_users_by_city and get_all
# query through it.
db = deta.Base("deprem-ocr")
8
+
9
+
10
def get_users_by_city(city_name, limit=10):
    """Fetch up to ``limit`` records whose ``city`` equals the capitalised name.

    ``city_name`` is normalised with ``str.capitalize()`` before it is used
    as the query value. Returns the ``items`` of the fetch response.
    """
    query = {"city": city_name.capitalize()}
    return db.fetch(query, limit=limit).items
14
+
15
+
16
def get_all():
    """Return every item in the Base, following pagination to the end.

    Keeps fetching with the previous response's ``last`` marker until it is
    falsy, extending the first page's item list in place.
    """
    page = db.fetch()
    collected = page.items

    # A truthy ``last`` means there is another page to fetch.
    while page.last:
        page = db.fetch(last=page.last)
        collected.extend(page.items)
    return collected
25
+
26
+
27
def write_db(data_dict):
    """Insert ``data_dict`` into the "deprem-ocr" Base and print a confirmation.

    A fresh Deta client is created on each call from the DETA_KEY
    environment variable.
    """
    project = Deta(os.getenv("DETA_KEY"))
    project.Base("deprem-ocr").insert(data_dict)
    print("Pushed to db")
36
+
37
+
38
def get_latest_row(last):
    """Return the final ``last`` items of the list produced by ``get_all()``.

    Args:
        last: Number of trailing items wanted.

    Returns:
        A list with at most ``last`` items. A non-positive ``last`` yields an
        empty list.
    """
    # ``items[-0:]`` is the same as ``items[0:]`` and would return everything,
    # and a negative count would slice from the front, so handle both here.
    if last <= 0:
        return []
    return get_all()[-last:]
openai_api.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import os
3
+
4
+
5
class OpenAI_API:
    """Small wrapper that sends one completion request through the ``openai`` module."""

    def __init__(self):
        # Initialised empty; single_request reads the key from the environment.
        self.openai_api_key = ""

    def single_request(self, address_text):
        """Send ``address_text`` as the prompt and return the raw completion response.

        The module-level ``openai`` settings (Azure API type, base URL, API
        version and key from the API_KEY environment variable) are set on
        every call before the request is made.
        """
        openai.api_key = os.getenv("API_KEY")
        openai.api_version = "2022-12-01"
        openai.api_base = "https://damlaopenai.openai.azure.com/"
        openai.api_type = "azure"

        completion_options = {
            "engine": "Davinci-003",
            "prompt": address_text,
            "temperature": 0.0,
            "max_tokens": 500,
            "top_p": 1,
            # Generation stops at the first newline.
            "stop": ["\n"],
            "frequency_penalty": 0,
            "presence_penalty": 0,
        }
        return openai.Completion.create(**completion_options)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ openai
2
+ Pillow
3
+ easyocr
4
+ gradio
5
+ deta
utils.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import csv
3
+ import json
4
+ from deta import Deta
5
+ import os
6
+ import requests
7
+
8
+
9
def preprocess_img(inp_image):
    """Convert the image to grayscale and return its bitwise inverse.

    The conversion uses ``cv2.COLOR_BGR2GRAY``.
    NOTE(review): this assumes BGR channel order; confirm against callers.
    """
    return cv2.bitwise_not(cv2.cvtColor(inp_image, cv2.COLOR_BGR2GRAY))
13
+
14
+
15
def save_csv(mahalle, il, sokak, apartman):
    """Append one address row to ``adress_book.csv`` and return the row.

    Args:
        mahalle: Neighbourhood.
        il: Province.
        sokak: Street.
        apartman: Building.

    Returns:
        The written row as the list ``[mahalle, il, sokak, apartman]``.
    """
    adres_full = [mahalle, il, sokak, apartman]

    # newline="" lets the csv module control line endings. Without it,
    # platforms that translate "\n" (Windows) write "\r\r\n" and show a blank
    # row after every record.
    with open("adress_book.csv", "a", encoding="utf-8", newline="") as f:
        csv.writer(f).writerow(adres_full)
    return adres_full
22
+
23
+
24
def get_json(mahalle, il, sokak, apartman):
    """Serialise the four address parts to a pretty-printed JSON string.

    Keys appear in the order mahalle, il, sokak, apartman. Non-ASCII
    characters are kept as-is and the output is indented by four spaces.
    """
    return json.dumps(
        dict(mahalle=mahalle, il=il, sokak=sokak, apartman=apartman),
        indent=4,
        ensure_ascii=False,
    )
28
+
29
+
30
def write_db(data_dict):
    """Insert ``data_dict`` into the "deprem-ocr" Deta Base.

    The client is created on each call with the key from the DETA_KEY
    environment variable.
    """
    client = Deta(os.getenv("DETA_KEY"))
    base = client.Base("deprem-ocr")
    base.insert(data_dict)
38
+
39
+
40
def ner_response(ocr_input):
    """POST ``ocr_input`` to the deprem-ner inference endpoint and return the decoded JSON.

    The request body is ``{"inputs": ocr_input}``. The Authorization header
    carries the bearer value hard-coded below.
    """
    endpoint = "https://api-inference.huggingface.co/models/deprem-ml/deprem-ner"
    auth_headers = {"Authorization": "Bearer xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}

    reply = requests.post(endpoint, headers=auth_headers, json={"inputs": ocr_input})
    return reply.json()