Spaces:
Sleeping
Sleeping
Add new modules and update file paths
Browse files- .gitignore +1 -2
- src/app/api/module/audio_text.py +49 -0
- src/app/api/module/config.py +10 -0
- src/app/api/module/image.ipynb +220 -0
- src/app/api/module/image_enhance.py +111 -0
- src/app/api/module/llm_vision.py +58 -0
- src/app/api/module/ocr.py +31 -0
- src/app/api/module/product_description.py +26 -0
- src/app/api/module/prompts/base.py +35 -0
- src/app/api/module/utils.py +44 -0
- src/app/api/module/vectorsearch.py +48 -0
.gitignore
CHANGED
@@ -158,6 +158,5 @@ cython_debug/
|
|
158 |
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
159 |
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
160 |
#.idea/
|
161 |
-
src/module/data/*
|
162 |
data/*
|
163 |
-
app
|
|
|
158 |
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
159 |
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
160 |
#.idea/
|
161 |
+
src/app/api/module/data/*
|
162 |
data/*
|
|
src/app/api/module/audio_text.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# from whisper_jax import FlaxWhisperPipline
|
2 |
+
# import jax.numpy as jnp
|
3 |
+
import whisper
|
4 |
+
print(whisper.__file__)
|
5 |
+
from openai import OpenAI
|
6 |
+
from module.config import OPENAI_API_KEY
|
7 |
+
import os
|
8 |
+
|
9 |
+
# Export the key BEFORE constructing the client: OpenAI() reads
# OPENAI_API_KEY from the environment at creation time, so setting the
# variable afterwards (as the original did) left the client unauthenticated.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
client = OpenAI(api_key=OPENAI_API_KEY)
|
11 |
+
|
12 |
+
|
13 |
+
# def whisper_pipeline_tpu(audio):
|
14 |
+
# pipeline = FlaxWhisperPipline("openai/whisper-large-v3", dtype=jnp.bfloat16, batch_size=16)
|
15 |
+
# text = pipeline(audio)
|
16 |
+
# return text
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
def whisper_pipeline(audio_path):
    """Transcribe a local audio file with the open-source Whisper "medium" model.

    Note: pad_or_trim fits the waveform to a single 30-second context
    window, so only the first 30 seconds are transcribed.
    """
    model = whisper.load_model("medium")
    # Load the waveform and fit it to Whisper's fixed 30-second window.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio_path))
    # Log-Mel spectrogram, moved to the same device as the model.
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)
    # Identify the dominant spoken language before decoding.
    _, probs = model.detect_language(mel)
    detected = max(probs, key=probs.get)
    print(f"Detected language: {detected}")
    # Decode with default options and return the transcript.
    result = whisper.decode(model, mel, whisper.DecodingOptions())
    print(result.text)
    return result.text
|
36 |
+
|
37 |
+
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
+
def whisper_openai(audio_path):
    """Transcribe an audio file via OpenAI's hosted whisper-1 model.

    Returns the Transcription object produced by the API.
    """
    # Context manager ensures the handle is closed even if the request
    # raises (the original left the file open indefinitely).
    with open(audio_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file
        )
    return transcript
|
48 |
+
|
49 |
+
if __name__ == "__main__":
    # whisper_pipeline requires an audio path; the original bare call
    # raised TypeError at import time. Guarded as a manual smoke test.
    whisper_pipeline("data/sample_audio.wav")
|
src/app/api/module/config.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from decouple import config
import os

# OpenAI API key, read from the environment/.env via python-decouple
# (empty-string default so import never fails on a missing key).
OPENAI_API_KEY = config('OPENAI_API_KEY', default="")
# Azure Cognitive Services key. NOTE(review): no default is given, so
# importing this module raises if AZURE is unset — confirm intended.
key = config("AZURE")
# NOTE: name is misspelled ("emmbedding"); kept as-is for existing callers.
emmbedding_model = "text-embedding-3-large"

# Directory for generated artifacts; resolved relative to the process CWD,
# so it depends on where the app is launched from — TODO confirm intended.
file_Directory= os.path.join(os.getcwd(), "data")

# Azure Form Recognizer endpoint used by ocr.azure_ocr.
endpoint = "https://bintix-ocr.cognitiveservices.azure.com/"
|
src/app/api/module/image.ipynb
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"ename": "",
|
10 |
+
"evalue": "",
|
11 |
+
"output_type": "error",
|
12 |
+
"traceback": [
|
13 |
+
"\u001b[1;31mRunning cells with 'catlognew' requires the ipykernel package.\n",
|
14 |
+
"\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n",
|
15 |
+
"\u001b[1;31mCommand: 'conda install -n catlognew ipykernel --update-deps --force-reinstall'"
|
16 |
+
]
|
17 |
+
}
|
18 |
+
],
|
19 |
+
"source": [
|
20 |
+
"import cv2\n",
|
21 |
+
"import os\n",
|
22 |
+
"import numpy as np \n",
|
23 |
+
"from llm_vision import OpenAIVision\n",
|
24 |
+
"from ocr import azure_ocr\n",
|
25 |
+
"from prompts.base import base_prompt\n",
|
26 |
+
"from utils import extract_json_from_text\n",
|
27 |
+
"from vectorsearch import search , get_detail_df"
|
28 |
+
]
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"cell_type": "code",
|
32 |
+
"execution_count": null,
|
33 |
+
"metadata": {},
|
34 |
+
"outputs": [
|
35 |
+
{
|
36 |
+
"ename": "",
|
37 |
+
"evalue": "",
|
38 |
+
"output_type": "error",
|
39 |
+
"traceback": [
|
40 |
+
"\u001b[1;31mFailed to start the Kernel. \n",
|
41 |
+
"\u001b[1;31mUnable to start Kernel 'catlognew (Python)' due to a connection timeout. \n",
|
42 |
+
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
|
43 |
+
]
|
44 |
+
}
|
45 |
+
],
|
46 |
+
"source": [
|
47 |
+
"image_path = r\"data/remove_flash.jpg\""
|
48 |
+
]
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"cell_type": "code",
|
52 |
+
"execution_count": 3,
|
53 |
+
"metadata": {},
|
54 |
+
"outputs": [],
|
55 |
+
"source": [
|
56 |
+
"details = azure_ocr(image_path)"
|
57 |
+
]
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"cell_type": "code",
|
61 |
+
"execution_count": 4,
|
62 |
+
"metadata": {},
|
63 |
+
"outputs": [
|
64 |
+
{
|
65 |
+
"data": {
|
66 |
+
"text/plain": [
|
67 |
+
"'BRU\\nNOW 90/- ONLY'"
|
68 |
+
]
|
69 |
+
},
|
70 |
+
"execution_count": 4,
|
71 |
+
"metadata": {},
|
72 |
+
"output_type": "execute_result"
|
73 |
+
}
|
74 |
+
],
|
75 |
+
"source": [
|
76 |
+
"details"
|
77 |
+
]
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"cell_type": "code",
|
81 |
+
"execution_count": 5,
|
82 |
+
"metadata": {},
|
83 |
+
"outputs": [],
|
84 |
+
"source": [
|
85 |
+
"prompt = base_prompt.format(text = details)"
|
86 |
+
]
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"cell_type": "code",
|
90 |
+
"execution_count": 6,
|
91 |
+
"metadata": {},
|
92 |
+
"outputs": [],
|
93 |
+
"source": [
|
94 |
+
"obj = OpenAIVision()\n",
|
95 |
+
"json = obj.get_image_description(image_path,prompt)"
|
96 |
+
]
|
97 |
+
},
|
98 |
+
{
|
99 |
+
"cell_type": "code",
|
100 |
+
"execution_count": 7,
|
101 |
+
"metadata": {},
|
102 |
+
"outputs": [
|
103 |
+
{
|
104 |
+
"data": {
|
105 |
+
"text/plain": [
|
106 |
+
"'```json\\n{\\n \"brand\": \"BRU\",\\n \"mrp\": \"90/-\",\\n \"unit\": \"null\",\\n \"Quantity\": 1,\\n \"parent_category\": \"BEVERAGES\",\\n \"ingredients\": \"null\",\\n \"calorie_count\": \"null\",\\n \"marketed_by\": \"null\",\\n \"manufactured_by\": \"null\",\\n \"manufactured_in_country\": \"null\",\\n \"type_of_packaging\": \"null\",\\n \"promotion_on_the_pack\": \"NEW 90/- ONLY\",\\n \"type_of_product\": \"Instant Coffee\",\\n \"pack_of_or_no_of_units\": \"null\"\\n}\\n```'"
|
107 |
+
]
|
108 |
+
},
|
109 |
+
"execution_count": 7,
|
110 |
+
"metadata": {},
|
111 |
+
"output_type": "execute_result"
|
112 |
+
}
|
113 |
+
],
|
114 |
+
"source": [
|
115 |
+
"json['choices'][0]['message']['content']"
|
116 |
+
]
|
117 |
+
},
|
118 |
+
{
|
119 |
+
"cell_type": "code",
|
120 |
+
"execution_count": 8,
|
121 |
+
"metadata": {},
|
122 |
+
"outputs": [],
|
123 |
+
"source": [
|
124 |
+
"response = extract_json_from_text(json['choices'][0]['message']['content'])"
|
125 |
+
]
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"cell_type": "code",
|
129 |
+
"execution_count": 9,
|
130 |
+
"metadata": {},
|
131 |
+
"outputs": [
|
132 |
+
{
|
133 |
+
"name": "stdout",
|
134 |
+
"output_type": "stream",
|
135 |
+
"text": [
|
136 |
+
"{'brand': 'BRU', 'mrp': '90/-', 'unit': 'null', 'Quantity': 1, 'parent_category': 'BEVERAGES', 'ingredients': 'null', 'calorie_count': 'null', 'marketed_by': 'null', 'manufactured_by': 'null', 'manufactured_in_country': 'null', 'type_of_packaging': 'null', 'promotion_on_the_pack': 'NEW 90/- ONLY', 'type_of_product': 'Instant Coffee', 'pack_of_or_no_of_units': 'null'}\n"
|
137 |
+
]
|
138 |
+
}
|
139 |
+
],
|
140 |
+
"source": [
|
141 |
+
"print(response)"
|
142 |
+
]
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"cell_type": "code",
|
146 |
+
"execution_count": 10,
|
147 |
+
"metadata": {},
|
148 |
+
"outputs": [
|
149 |
+
{
|
150 |
+
"ename": "ImportError",
|
151 |
+
"evalue": "Could not import chromadb python package. Please install it with `pip install chromadb`.",
|
152 |
+
"output_type": "error",
|
153 |
+
"traceback": [
|
154 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
155 |
+
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
|
156 |
+
"File \u001b[0;32m~/miniconda3/envs/catlog/lib/python3.10/site-packages/langchain_community/vectorstores/chroma.py:81\u001b[0m, in \u001b[0;36mChroma.__init__\u001b[0;34m(self, collection_name, embedding_function, persist_directory, client_settings, collection_metadata, client, relevance_score_fn)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mchromadb\u001b[39;00m\n\u001b[1;32m 82\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mchromadb\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconfig\u001b[39;00m\n",
|
157 |
+
"File \u001b[0;32m~/miniconda3/envs/catlog/lib/python3.10/site-packages/chromadb/__init__.py:5\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mchromadb\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mclient\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AdminClient \u001b[38;5;28;01mas\u001b[39;00m AdminClientCreator\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mchromadb\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mauth\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtoken\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TokenTransportHeader\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mchromadb\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconfig\u001b[39;00m\n",
|
158 |
+
"File \u001b[0;32m~/miniconda3/envs/catlog/lib/python3.10/site-packages/chromadb/auth/token/__init__.py:26\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mchromadb\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconfig\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m System\n\u001b[0;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mchromadb\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtelemetry\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mopentelemetry\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 27\u001b[0m OpenTelemetryGranularity,\n\u001b[1;32m 28\u001b[0m trace_method,\n\u001b[1;32m 29\u001b[0m )\n\u001b[1;32m 30\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mchromadb\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m get_class\n",
|
159 |
+
"File \u001b[0;32m~/miniconda3/envs/catlog/lib/python3.10/site-packages/chromadb/telemetry/opentelemetry/__init__.py:5\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Any, Callable, Dict, Optional, Sequence, Union\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mopentelemetry\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m trace\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mopentelemetry\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msdk\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mresources\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SERVICE_NAME, Resource\n",
|
160 |
+
"File \u001b[0;32m~/miniconda3/envs/catlog/lib/python3.10/site-packages/opentelemetry/trace/__init__.py:87\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdeprecated\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m deprecated\n\u001b[0;32m---> 87\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mopentelemetry\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m context \u001b[38;5;28;01mas\u001b[39;00m context_api\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mopentelemetry\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mattributes\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BoundedAttributes \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n",
|
161 |
+
"File \u001b[0;32m~/miniconda3/envs/catlog/lib/python3.10/site-packages/opentelemetry/context/__init__.py:25\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mopentelemetry\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01menvironment_variables\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m OTEL_PYTHON_CONTEXT\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mopentelemetry\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutil\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_importlib_metadata\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m entry_points\n\u001b[1;32m 27\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mgetLogger(\u001b[38;5;18m__name__\u001b[39m)\n",
|
162 |
+
"File \u001b[0;32m~/miniconda3/envs/catlog/lib/python3.10/site-packages/opentelemetry/util/_importlib_metadata.py:17\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright The OpenTelemetry Authors\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m# FIXME: Use importlib.metadata when support for 3.11 is dropped if the rest of\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# the supported versions at that time have the same API.\u001b[39;00m\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mimportlib_metadata\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ( \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 18\u001b[0m EntryPoint,\n\u001b[1;32m 19\u001b[0m EntryPoints,\n\u001b[1;32m 20\u001b[0m entry_points,\n\u001b[1;32m 21\u001b[0m version,\n\u001b[1;32m 22\u001b[0m )\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# The importlib-metadata library has introduced breaking changes before to its\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m# API, this module is kept just to act as a layer between the\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;66;03m# importlib-metadata library and our project if in any case it is necessary to\u001b[39;00m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# do so.\u001b[39;00m\n",
|
163 |
+
"\u001b[0;31mImportError\u001b[0m: cannot import name 'EntryPoint' from 'importlib_metadata' (unknown location)",
|
164 |
+
"\nDuring handling of the above exception, another exception occurred:\n",
|
165 |
+
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
|
166 |
+
"Cell \u001b[0;32mIn[10], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m name \u001b[38;5;241m=\u001b[39m response[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbrand\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m response[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtype_of_product\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 2\u001b[0m name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBRU\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 3\u001b[0m get_prod_name_db \u001b[38;5;241m=\u001b[39m \u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n",
|
167 |
+
"File \u001b[0;32m~/Catalog-Digitization-/src/app/api/module/vectorsearch.py:30\u001b[0m, in \u001b[0;36msearch\u001b[0;34m(query)\u001b[0m\n\u001b[1;32m 28\u001b[0m embeddings \u001b[38;5;241m=\u001b[39m OpenAIEmbeddings()\n\u001b[1;32m 29\u001b[0m db_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(file_Directory,\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvectorstore\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 30\u001b[0m db \u001b[38;5;241m=\u001b[39m \u001b[43mChroma\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpersist_directory\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mdb_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43membedding_function\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43membeddings\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 31\u001b[0m embedding_vector \u001b[38;5;241m=\u001b[39m OpenAIEmbeddings()\u001b[38;5;241m.\u001b[39membed_query(query)\n\u001b[1;32m 32\u001b[0m docs \u001b[38;5;241m=\u001b[39m db\u001b[38;5;241m.\u001b[39msimilarity_search_by_vector(embedding_vector)\n",
|
168 |
+
"File \u001b[0;32m~/miniconda3/envs/catlog/lib/python3.10/site-packages/langchain_community/vectorstores/chroma.py:84\u001b[0m, in \u001b[0;36mChroma.__init__\u001b[0;34m(self, collection_name, embedding_function, persist_directory, client_settings, collection_metadata, client, relevance_score_fn)\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mchromadb\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconfig\u001b[39;00m\n\u001b[1;32m 83\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[0;32m---> 84\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\n\u001b[1;32m 85\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCould not import chromadb python package. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 86\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease install it with `pip install chromadb`.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 87\u001b[0m )\n\u001b[1;32m 89\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m client \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 90\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_client_settings \u001b[38;5;241m=\u001b[39m client_settings\n",
|
169 |
+
"\u001b[0;31mImportError\u001b[0m: Could not import chromadb python package. Please install it with `pip install chromadb`."
|
170 |
+
]
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"ename": "",
|
174 |
+
"evalue": "",
|
175 |
+
"output_type": "error",
|
176 |
+
"traceback": [
|
177 |
+
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
|
178 |
+
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
|
179 |
+
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
|
180 |
+
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
|
181 |
+
]
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"source": [
|
185 |
+
"\n",
|
186 |
+
"name = response['brand'] + \" \" + response['type_of_product']\n",
|
187 |
+
"name = \"BRU\"\n",
|
188 |
+
"get_prod_name_db = search(name)"
|
189 |
+
]
|
190 |
+
},
|
191 |
+
{
|
192 |
+
"cell_type": "code",
|
193 |
+
"execution_count": null,
|
194 |
+
"metadata": {},
|
195 |
+
"outputs": [],
|
196 |
+
"source": []
|
197 |
+
}
|
198 |
+
],
|
199 |
+
"metadata": {
|
200 |
+
"kernelspec": {
|
201 |
+
"display_name": "catlog",
|
202 |
+
"language": "python",
|
203 |
+
"name": "python3"
|
204 |
+
},
|
205 |
+
"language_info": {
|
206 |
+
"codemirror_mode": {
|
207 |
+
"name": "ipython",
|
208 |
+
"version": 3
|
209 |
+
},
|
210 |
+
"file_extension": ".py",
|
211 |
+
"mimetype": "text/x-python",
|
212 |
+
"name": "python",
|
213 |
+
"nbconvert_exporter": "python",
|
214 |
+
"pygments_lexer": "ipython3",
|
215 |
+
"version": "3.10.0"
|
216 |
+
}
|
217 |
+
},
|
218 |
+
"nbformat": 4,
|
219 |
+
"nbformat_minor": 2
|
220 |
+
}
|
src/app/api/module/image_enhance.py
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import cv2
|
2 |
+
import os
|
3 |
+
from config import file_Directory
|
4 |
+
import numpy as np
|
5 |
+
from PIL import Image
|
6 |
+
|
7 |
+
class Image_Enhance():
    """Collection of OpenCV-based image-enhancement passes.

    Each pass reads an image from disk, writes its result under
    `file_Directory`, and returns the path of the written file.
    """

    def __init__(self, image_path) -> None:
        # Path of the source image used by brightness_Adjust().
        self.image_path = image_path

    def brightness_Adjust(self):
        """Linear brightness/contrast transform; returns the output path."""
        image = cv2.imread(self.image_path)
        # NOTE(review): a negative contrast gain inverts pixel values before
        # convertScaleAbs takes the absolute value; kept as-is to preserve
        # the existing output — confirm this is intentional.
        alpha = -1.1
        # Brightness offset added after scaling.
        beta = 70
        image2 = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)
        imagepth = os.path.join(file_Directory, 'Brightness & contrast.jpg')
        cv2.imwrite(imagepth, image2)
        return imagepth

    def remove_flash(self, imagepth):
        """Mask out flashlight glare; returns the output path."""
        image = cv2.imread(imagepth)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Adaptive thresholding segments the printed text.
        thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 11, 4)
        # Gaussian blur reduces noise before glare detection.
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        # Binary mask for very bright (glare) regions.
        _, mask = cv2.threshold(blurred, 240, 255, cv2.THRESH_BINARY_INV)
        # Combine the text and glare masks.
        mask = cv2.bitwise_or(mask, thresh)
        # Morphological closing removes small residual glare areas.
        kernel = np.ones((5, 5), np.uint8)
        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)
        result = cv2.bitwise_and(image, image, mask=mask)
        out_path = os.path.join(file_Directory, 'remove_flash.jpg')
        cv2.imwrite(out_path, result)
        # Previously returned None; the path is now available to callers.
        return out_path

    def sharpen(self, imagepth):
        """Kernel-based sharpening; returns the output path."""
        image = cv2.imread(imagepth)
        # Standard 3x3 sharpening kernel.
        kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
        sharpened_image = cv2.filter2D(image, -1, kernel)
        imagepath = os.path.join(file_Directory, 'sharpened_image.jpg')
        cv2.imwrite(imagepath, sharpened_image)
        return imagepath

    def lapacian_sharpen(self, imagepth):
        """Laplacian edge response of the image; returns the output path."""
        image = cv2.imread(imagepth)
        sharpened_image2 = cv2.Laplacian(image, cv2.CV_64F)
        imagepath = os.path.join(file_Directory, 'Laplacian_sharpened_image.jpg')
        cv2.imwrite(imagepath, sharpened_image2)
        # Previously returned None; the path is now available to callers.
        return imagepath

    def removing_noise(self, imagepth):
        """Median-filter denoising; returns the output path."""
        image = cv2.imread(imagepth)
        # Aperture size 1 is effectively a no-op median filter — kept
        # as-is to preserve existing behavior.
        filtered_image = cv2.medianBlur(image, 1)
        imagepath = os.path.join(file_Directory, 'Median Blur.jpg')
        cv2.imwrite(imagepath, filtered_image)
        return imagepath

    def enhance_color(self, imagepth):
        """Rescale hue/saturation/value; returns the output path.

        Works in float and clips to [0, 255]: multiplying the uint8
        channels directly (as the original did) wrapped around past 255
        and corrupted saturated pixels.
        """
        image = cv2.imread(imagepth)
        image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
        hsv = image.astype(np.float32)
        hsv[:, :, 0] *= 0.7   # hue
        hsv[:, :, 1] *= 1.5   # saturation
        hsv[:, :, 2] *= 0.5   # value
        image = np.clip(hsv, 0, 255).astype(np.uint8)
        image2 = cv2.cvtColor(image, cv2.COLOR_HSV2BGR)
        imagepath = os.path.join(file_Directory, 'enhanced coloured.jpg')
        cv2.imwrite(imagepath, image2)
        return imagepath
|
103 |
+
|
104 |
+
|
105 |
+
if __name__ == "__main__":
    # Demo pipeline; guarded so importing this module no longer runs the
    # whole enhancement chain (and its disk writes) as a side effect.
    obj = Image_Enhance(r"data/Catalog Digitization/ONDC Test Data _ Images/Product Images/Bru_Instant_Coffee_Powder.png")
    pth = obj.brightness_Adjust()
    sharpen = obj.sharpen(pth)
    lapacian_sharpen = obj.lapacian_sharpen(sharpen)
    noise = obj.removing_noise(sharpen)
    obj.enhance_color(noise)
    obj.remove_flash(sharpen)
|
src/app/api/module/llm_vision.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
import requests
|
3 |
+
from config import OPENAI_API_KEY
|
4 |
+
import os
|
5 |
+
|
6 |
+
|
7 |
+
"""
|
8 |
+
openai_vision = OpenAIVision(api_key)
|
9 |
+
image_path = "path_to_your_image.jpg"
|
10 |
+
prompt = ""
|
11 |
+
response = openai_vision.get_image_description(prompt,image_path)
|
12 |
+
"""
|
13 |
+
|
14 |
+
class OpenAIVision:
    """Thin wrapper around the OpenAI chat-completions endpoint for
    image+text prompts (gpt-4-vision-preview).

    Usage:
        vision = OpenAIVision()
        reply = vision.get_image_description("image.jpg", "describe this")
    """

    def __init__(self):
        # API key comes from module config; the endpoint is fixed.
        self.api_key = OPENAI_API_KEY
        self.base_url = "https://api.openai.com/v1/chat/completions"

    def __encode_image(self, image_path):
        """Return the file's raw bytes as a base64 ASCII string."""
        with open(image_path, "rb") as image_file:
            raw = image_file.read()
        return base64.b64encode(raw).decode('utf-8')

    def get_image_description(self, image_path, prompt):
        """POST the prompt plus the inline base64 image; return the parsed JSON reply."""
        encoded = self.__encode_image(image_path)

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }

        # Build the multimodal user message: text part first, image second.
        text_part = {
            "type": "text",
            "text": prompt,
        }
        image_part = {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{encoded}"
            }
        }
        payload = {
            "model": "gpt-4-vision-preview",
            "temperature": 0.0,
            "messages": [
                {
                    "role": "user",
                    "content": [text_part, image_part]
                }
            ],
            "max_tokens": 1000,
        }

        response = requests.post(self.base_url, headers=headers, json=payload)
        return response.json()
|
src/app/api/module/ocr.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from azure.ai.formrecognizer import DocumentAnalysisClient
|
2 |
+
from azure.core.credentials import AzureKeyCredential
|
3 |
+
from config import key, endpoint
|
4 |
+
import easyocr
|
5 |
+
|
6 |
+
def azure_ocr(image_path):
    """Run Azure Form Recognizer's prebuilt-read model over an image.

    Returns the recognized text content, or "" if anything fails
    (errors are printed, not raised).
    """
    try:
        analysis_client = DocumentAnalysisClient(
            endpoint=endpoint, credential=AzureKeyCredential(key)
        )
        # Upload the image and poll the long-running read operation.
        with open(image_path, "rb") as image_file:
            poller = analysis_client.begin_analyze_document(
                "prebuilt-read", document=image_file
            )
            analysis = poller.result()
            return analysis.content
    except Exception as e:
        print('Error occurred:', e)
        return ""
|
23 |
+
|
24 |
+
def easy_ocr(image_path):
    """OCR an image with EasyOCR across six Indian-market languages.

    Returns EasyOCR's readtext result list, or [] if anything fails
    (errors are printed, not raised).
    """
    try:
        langs = ['en','hi','bn','mr','ta','te']
        return easyocr.Reader(langs).readtext(image_path)
    except Exception as e:
        print('Error occurred:', e)
        return []
|
src/app/api/module/product_description.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import cv2
|
2 |
+
import os
|
3 |
+
import matplotlib.pyplot as plt
|
4 |
+
import numpy as np
|
5 |
+
from llm_vision import OpenAIVision
|
6 |
+
from ocr import azure_ocr
|
7 |
+
from prompts.base import base_prompt
|
8 |
+
from utils import extract_json_from_text
|
9 |
+
from vectorsearch import search , get_detail_df
|
10 |
+
|
11 |
+
|
12 |
+
|
13 |
+
def get_product_description(image_path):
    """OCR the product image, ask the vision LLM to structure the text,
    and return the extracted attribute dict."""
    ocr_text = azure_ocr(image_path)
    prompt = base_prompt.format(text = ocr_text)
    vision = OpenAIVision()
    # Full chat-completions response payload from the vision model
    # (renamed from `json`, which shadowed the stdlib module name).
    raw = vision.get_image_description(image_path, prompt)
    return extract_json_from_text(raw['choices'][0]['message']['content'])
|
21 |
+
|
22 |
+
def add_in_db(response):
    """Look up the extracted product in the vector store.

    NOTE(review): this function is unfinished — the resolved name is
    computed but nothing is persisted yet (see trailing TODO comment),
    and the vector-store result overwrites `name` without being returned.
    """
    # Build a search key from the LLM-extracted fields.
    name = response['brand'] + " " + response['type_of_product']
    get_prod_name_db = search(name)
    # Resolve the vector-store hits to a canonical name/details row.
    name = get_detail_df(get_prod_name_db)
    ### Add things into database
|
src/app/api/module/prompts/base.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from textwrap import dedent

# Prompt template for the vision LLM. `{text}` is replaced with raw OCR
# output via base_prompt.format(text=...). The model is instructed to reply
# with a single JSON object using the keys listed below; downstream code
# parses it with utils.extract_json_from_text.
base_prompt = dedent("""
### Instruction:

product description starts here

{text}

product description ends here

this is the categorys list ['BEVERAGES', 'SNACKS & BRANDED FOODS', 'NOT FOUND', 'EGGS, MEAT & FISH', 'FOODGRAINS, OIL & MASALA', 'PERSONAL CARE', 'CLEANING & HOUSEHOLD', 'FRUITS & VEGETABLES', 'BAKERY, CAKES & DAIRY', 'MAKEUP', 'BABY CARE', 'PET FOOD & ACCESSORIES', 'NON FMCG', 'ALCOHOL & TOBACCO', 'WELLNESS', 'EVERYDAY MEDICINE-NEW', 'EXCERCISE & FITNESS', 'ALCOHOLIC BEVERAGES'].

Get the text from the product image and the above product description to give me the following details in JSON format:
( return "null" where you don't have a answer)

"brand": "sample_brand",
"mrp": "The price might start with MRP or Rs.",
"unit": "per pack",
"Quantity": 1, ##num of products visible
"parent_category": "from the above given list",
"ingredients": ["ingredient1", "ingredient2", "ingredient3"],
"calorie_count": "Would be in numbers",
"marketed_by": "sample_marketer",
"manufactured_by": "sample_manufacturer",
"manufactured_in_country": "Country XYZ",
"type_of_packaging": "Box",
"promotion_on_the_pack": "if any",
"type_of_product": "give this your understanding",
"pack_of_or_no_of_units": "No. of Units"


Analyse data from the above product description to give me the following details in JSON format:
Only return the output in the required json format.
""")
|
src/app/api/module/utils.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json

# Category-tree loading is currently disabled; ``get_childs`` and
# ``get_inner_child`` below still reference ``cat_data`` and will raise
# NameError until this is re-enabled. TODO(review): confirm the intended
# data source for the category tree.
# with open('Category-tree.json') as f:
# cat_data = json.load(f)

# Top-level parent-category labels used for classification.
# NOTE(review): this list differs from the one embedded in
# prompts/base.py (no 'EVERYDAY MEDICINE-NEW' / 'EXCERCISE & FITNESS'
# here, 'TOBACCO' vs 'ALCOHOL & TOBACCO') — verify which is canonical.
candidate_labels = ['BEVERAGES', 'SNACKS & BRANDED FOODS', 'NOT FOUND', 'EGGS, MEAT & FISH',
                    'FOODGRAINS, OIL & MASALA', 'PERSONAL CARE', 'CLEANING & HOUSEHOLD',
                    'FRUITS & VEGETABLES', 'BAKERY, CAKES & DAIRY', 'MAKEUP', 'BABY CARE',
                    'PET FOOD & ACCESSORIES', 'NON FMCG', 'TOBACCO', 'WELLNESS', 'ALCOHOLIC BEVERAGES']
|
10 |
+
|
11 |
+
def get_childs(parent, categories=None):
    """Return the names of the direct children of the category *parent*.

    Parameters
    ----------
    parent : str
        Name of the parent category to look up.
    categories : list[dict] | None
        Category tree: a list of ``{'name': ..., 'children': [...]}`` dicts.
        Defaults to the module-level ``cat_data``.

    Returns
    -------
    list[str]
        Child category names; empty if *parent* is not found.

    Notes
    -----
    The original implementation read only the global ``cat_data``, whose
    loading is commented out at module top, so every call raised NameError.
    Passing *categories* explicitly avoids that; the default preserves the
    old global-based behavior for existing callers.
    """
    if categories is None:
        categories = cat_data  # still NameError while the JSON load above stays commented out
    return [
        child['name']
        for category in categories
        if category['name'] == parent
        for child in category['children']
    ]
|
18 |
+
|
19 |
+
def get_inner_child(to_find_parent, to_find_child, categories=None):
    """Return the names of the grandchildren under *to_find_parent* → *to_find_child*.

    Parameters
    ----------
    to_find_parent : str
        Name of the top-level category.
    to_find_child : str
        Name of the second-level category inside the parent.
    categories : list[dict] | None
        Category tree (``{'name': ..., 'children': [...]}`` dicts).
        Defaults to the module-level ``cat_data``.

    Returns
    -------
    list[str]
        Inner-child names; empty if the path is not found.

    Notes
    -----
    As with ``get_childs``, the original read only the global ``cat_data``
    whose load is commented out (NameError on every call); the optional
    parameter fixes that while keeping the old signature working.
    """
    if categories is None:
        categories = cat_data  # still NameError while the JSON load above stays commented out
    names = []
    for parent in categories:
        if parent['name'] != to_find_parent:
            continue
        for child in parent['children']:
            if child['name'] == to_find_child:
                names.extend(inner['name'] for inner in child['children'])
    return names
|
28 |
+
|
29 |
+
|
30 |
+
|
31 |
+
def extract_json_from_text(text):
    """Extract and parse the outermost ``{...}`` JSON object embedded in *text*.

    Parameters
    ----------
    text : Any
        Value (typically an LLM response) coerced with ``str()`` before scanning.

    Returns
    -------
    dict | list | Exception
        The parsed JSON value on success. On failure the exception object is
        returned (preserving the original contract — callers should check
        ``isinstance(result, Exception)``), after printing a red error message.
    """
    text = str(text)
    try:
        start_index = text.find('{')
        end_index = text.rfind('}') + 1
        # Original sliced text[-1:0] (empty) when no braces existed and let
        # json.loads("") fail obscurely; fail fast with a clear message.
        if start_index == -1 or end_index == 0:
            raise ValueError("no JSON object found in text")
        return json.loads(text[start_index:end_index].strip())
    except Exception as e:
        # \033[31m ... \033[0m renders red; the original reset lacked the
        # ESC byte ("[0m"), leaving the terminal color stuck on red.
        print(f"\033[31m Exception occurred while loading JSON: {str(e)} \033[0m")
        return e
|
src/app/api/module/vectorsearch.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
from config import OPENAI_API_KEY, file_Directory
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma
import pandas as pd

# Side effect at import time: expose the key to the OpenAI/LangChain clients.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY


# One-time export of the product-name column to data/data.csv (the input of
# create_vector below). Kept commented out; note the absolute, machine-local
# path — TODO(review): replace with a path relative to ``file_Directory``.
# df = pd.read_excel(r"/home/vrush/Catalog-Digitization-/src/module/data/Catalog Digitization/ONDC Test Data _ Images/ONDCSampleData.xlsx")
# df_new = pd.DataFrame(columns=["id", "name"])
# df_new = df['name']
# df_new.to_csv(r"data/data.csv", index=False)
|
16 |
+
|
17 |
+
def create_vector():
    """Build and persist a Chroma vector store from ``data/data.csv``.

    Loads the CSV of product names, splits it into <=1000-character chunks,
    embeds them with OpenAI embeddings, and persists the store under
    ``<file_Directory>/vectorstore``.
    """
    loader = CSVLoader(file_path="data/data.csv")
    docs = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    documents = text_splitter.split_documents(docs)
    db_path = os.path.join(file_Directory, "vectorstore")
    embeddings = OpenAIEmbeddings()
    os.makedirs(db_path, exist_ok=True)
    # BUG FIX: the original persisted the unsplit ``docs``, leaving the
    # splitter's output dead; persist the split ``documents`` instead.
    Chroma.from_documents(documents, embeddings, persist_directory=db_path)
|
26 |
+
|
27 |
+
def search(query):
    """Return the page content of the closest vector-store match for *query*.

    Parameters
    ----------
    query : str
        Free-text query embedded with OpenAI embeddings.

    Returns
    -------
    str | None
        Page content of the best match, or ``None`` when the store is empty
        (the original raised IndexError in that case).
    """
    embeddings = OpenAIEmbeddings()
    db_path = os.path.join(file_Directory, "vectorstore")
    db = Chroma(persist_directory=db_path, embedding_function=embeddings)
    # Reuse the embeddings instance above — the original constructed a
    # second OpenAIEmbeddings() just to embed the query.
    embedding_vector = embeddings.embed_query(query)
    docs = db.similarity_search_by_vector(embedding_vector)
    if not docs:
        return None
    print(docs[0].page_content)
    return docs[0].page_content
|
35 |
+
|
36 |
+
|
37 |
+
def get_detail_df(name, df=None):
    """Return the first catalogue row whose ``name`` column equals *name*.

    Parameters
    ----------
    name : str
        Product name to look up.
    df : pandas.DataFrame | None
        Optional pre-loaded catalogue; defaults to reading the spreadsheet
        from its hard-coded path (original behavior — NOTE(review): the
        absolute, machine-local path should move into config).

    Returns
    -------
    pandas.Series | None
        The matching row, or ``None`` when no row matches.
    """
    if df is None:
        df = pd.read_excel(r"/home/vrush/Catalog-Digitization-/src/module/data/Catalog Digitization/ONDC Test Data _ Images/ONDCSampleData.xlsx")
    # BUG FIX: iterrows() yields (index, row) pairs, so the original
    # ``item['name']`` indexed a tuple with a string (TypeError); and its
    # in-loop ``else: return None`` gave up after the first non-matching row.
    for _, row in df.iterrows():
        if row['name'] == name:
            return row
    return None
|
44 |
+
|
45 |
+
# Manual smoke test: build the store, query it, then look the hit up in
# the source spreadsheet.
if __name__ == "__main__":
    create_vector()
    # NOTE(review): search() returns a document's page_content (a CSV-loader
    # string such as "name: <value>"), not a bare product name — confirm it
    # matches the spreadsheet's 'name' column before trusting this lookup.
    name = search("Choco Creme Wafers")
    print(get_detail_df(name))
|