dsmultimedika committed
Commit 9002555
1 Parent(s): b3ea108

Build Application

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
.gitignore ADDED
@@ -0,0 +1,400 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore

# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates

# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs

# Mono auto generated files
mono_crash.*

# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Ww][Ii][Nn]32/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/

# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/

# Visual Studio 2017 auto generated files
Generated\ Files/

# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*

# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml

# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c

# Benchmark Results
BenchmarkDotNet.Artifacts/

# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/

# ASP.NET Scaffolding
ScaffoldingReadMe.txt

# StyleCop
StyleCopReport.xml

# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.log
*.tlog
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc

# Chutzpah Test files
_Chutzpah*

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb

# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap

# Visual Studio Trace Files
*.e2e

# TFS 2012 Local Workspace
$tf/

# Guidance Automation Toolkit
*.gpState

# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user

# TeamCity is a build add-in
_TeamCity*

# DotCover is a Code Coverage Tool
*.dotCover

# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json

# Coverlet is a free, cross platform Code Coverage Tool
coverage*.json
coverage*.xml
coverage*.info

# Visual Studio code coverage results
*.coverage
*.coveragexml

# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*

# MightyMoose
*.mm.*
AutoTest.Net/

# Web workbench (sass)
.sass-cache/

# Installshield output folder
[Ee]xpress/

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish/

# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj

# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/

# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets

# Microsoft Azure Build Output
csx/
*.build.csdef

# Microsoft Azure Emulator
ecf/
rcf/

# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload

# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/

# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs

# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk

# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/

# RIA/Silverlight projects
Generated_Code/

# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak

# SQL Server files
*.mdf
*.ldf
*.ndf

# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl

# Microsoft Fakes
FakesAssemblies/

# GhostDoc plugin setting file
*.GhostDoc.xml

# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/

# Visual Studio 6 build log
*.plg

# Visual Studio 6 workspace options file
*.opt

# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw

# Visual Studio 6 auto-generated project file (contains which files were open etc.)
*.vbp

# Visual Studio 6 workspace and project file (working project files containing files to include in project)
*.dsw
*.dsp

# Visual Studio 6 technical files
*.ncb
*.aps

# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions

# Paket dependency manager
.paket/paket.exe
paket-files/

# FAKE - F# Make
.fake/

# CodeRush personal settings
.cr/personal

# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc

# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config

# Tabs Studio
*.tss

# Telerik's JustMock configuration file
*.jmconfig

# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs

# OpenCover UI analysis results
OpenCover/

# Azure Stream Analytics local run output
ASALocalRun/

# MSBuild Binary and Structured Log
*.binlog

# NVidia Nsight GPU debugger configuration file
*.nvuser

# MFractors (Xamarin productivity tool) working folder
.mfractor/

# Local History for Visual Studio
.localhistory/

# Visual Studio History (VSHistory) files
.vshistory/

# BeatPulse healthcheck temp database
healthchecksdb

# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/

# Ionide (cross platform F# VS Code tools) working folder
.ionide/

# Fody - auto-generated XML schema
FodyWeavers.xsd

# VS Code files for those working on multiple tools
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace

# Local History for Visual Studio Code
.history/

# Windows Installer files from build outputs
*.cab
*.msi
*.msix
*.msm
*.msp

# JetBrains Rider
*.sln.iml

.env
Dockerfile ADDED
@@ -0,0 +1,14 @@
# Use the official Python image from Docker Hub
FROM python:3.11

RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
api/__init__.py ADDED
File without changes
api/auth.py ADDED
File without changes
api/events.py ADDED
@@ -0,0 +1,26 @@
from fastapi import FastAPI
from api.router.topic import db_conn
from llama_index.core import set_global_handler
import os
from dotenv import load_dotenv


load_dotenv()


async def startup() -> None:
    await db_conn.connect()
    os.environ["LANGFUSE_SECRET_KEY"] = os.getenv("LANGFUSE_SECRET_KEY")
    os.environ["LANGFUSE_PUBLIC_KEY"] = os.getenv("LANGFUSE_PUBLIC_KEY")
    os.environ["LANGFUSE_HOST"] = os.getenv("LANGFUSE_HOST")
    set_global_handler("langfuse")


async def shutdown() -> None:
    await db_conn.disconnect()


def register_events(app: FastAPI) -> FastAPI:
    app.add_event_handler("startup", startup)
    app.add_event_handler("shutdown", shutdown)
    return app
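For reference, the same startup/shutdown wiring could also be expressed with FastAPI's lifespan context manager. A minimal sketch, not part of this commit, reusing the startup and shutdown coroutines above:

from contextlib import asynccontextmanager
from fastapi import FastAPI
from api.events import startup, shutdown  # handlers defined in this commit

@asynccontextmanager
async def lifespan(app: FastAPI):
    await startup()   # connect the DB and configure Langfuse
    yield
    await shutdown()  # close the DB connection

app = FastAPI(lifespan=lifespan)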
api/function.py ADDED
@@ -0,0 +1,239 @@
from script.build_vector import IndexManager
from script.document_uploader import Uploader
from db.save_data import InsertDatabase
from db.get_data import GetDatabase
from db.delete_data import DeleteDatabase
from db.update_data import UpdateDatabase
from typing import Any
from fastapi import UploadFile
from fastapi import HTTPException
from core.chat.engine import Engine
from core.parser import clean_text, update_response, renumber_sources, seperate_to_list
from llama_index.core.composability import QASummaryQueryEngineBuilder
from service.dto import BotResponseStreaming, TestStreaming
from service.aws_loader import Loader

import logging
import re


# Configure logging
logging.basicConfig(level=logging.INFO)


# async def data_ingestion(
#     db_conn, reference, file: UploadFile, content_table: UploadFile
# ) -> Any:

async def data_ingestion(db_conn, reference, file: UploadFile) -> Any:

    insert_database = InsertDatabase(db_conn)

    file_name = f"{reference['title']}.pdf"
    aws_loader = Loader()

    file_obj = file
    aws_loader.upload_to_s3(file_obj, file_name)

    print("Uploaded Success")

    try:
        # Insert data into the database
        await insert_database.insert_data(reference)

        # uploader = Uploader(reference, file, content_table)
        uploader = Uploader(reference, file)
        print("uploader : ", uploader)

        nodes_with_metadata = await uploader.process_documents()

        # Build indexes using IndexManager
        index = IndexManager()
        response = index.build_indexes(nodes_with_metadata)

        return response

    except Exception as e:
        # Log the error and raise HTTPException for FastAPI
        logging.error(f"An error occurred in data ingestion: {e}")
        raise HTTPException(
            status_code=500,
            detail="An internal server error occurred in data ingestion.",
        )


async def get_data(db_conn, title="", fetch_all_data=True):
    get_database = GetDatabase(db_conn)
    print(get_database)
    try:
        if fetch_all_data:
            results = await get_database.get_all_data()
            print(results)
            logging.info("Database fetched all data")
            return results
        else:
            results = await get_database.get_data(title)
            logging.info("Database fetched one data")
            return results

    except Exception as e:
        # Log the error and raise HTTPException for FastAPI
        logging.error(f"An error occurred in get data: {e}")
        raise HTTPException(
            status_code=500, detail="An internal server error occurred in get data."
        )


async def update_data(id: int, reference, db_conn):
    update_database = UpdateDatabase(db_conn)
    try:
        reference = reference.model_dump()
        print(reference)
        reference.update({"id": id})
        print(reference)
        await update_database.update_record(reference)
        response = {"status": "Update Success"}
        return response
    except Exception as e:
        # Log the error and raise HTTPException for FastAPI
        logging.error(f"An error occurred in update data: {e}")
        raise HTTPException(
            status_code=500, detail="An internal server error occurred in update data."
        )


async def delete_data(id: int, db_conn):
    delete_database = DeleteDatabase(db_conn)
    try:
        params = {"id": id}
        await delete_database.delete_record(params)
        response = {"status": "Delete Success"}
        return response
    except Exception as e:
        # Log the error and raise HTTPException for FastAPI
        logging.error(f"An error occurred in delete data: {e}")
        raise HTTPException(
            status_code=500, detail="An internal server error occurred in delete data."
        )


def generate_completion_non_streaming(user_request, chat_engine):
    try:
        engine = Engine()
        index_manager = IndexManager()

        # Load existing indexes
        index = index_manager.load_existing_indexes()

        # Retrieve the chat engine with the loaded index
        chat_engine = engine.get_chat_engine(index)

        # Generate completion response
        response = chat_engine.chat(user_request)

        sources = response.sources

        number_reference = list(set(re.findall(r"\[(\d+)\]", str(response))))
        number_reference_sorted = sorted(number_reference, key=int)

        contents = []
        raw_contents = []
        metadata_collection = []
        scores = []

        if number_reference_sorted:
            for number in number_reference_sorted:
                # Convert number to an integer so it can be used as an index
                number = int(number)

                # Make sure sources is not empty and has the required elements
                if sources and len(sources) > 0:
                    node = dict(sources[0])["raw_output"].source_nodes

                    # Make sure number is a valid index
                    if 0 <= number - 1 < len(node):
                        print(node[number - 1].node.get_text())
                        raw_content = seperate_to_list(
                            node[number - 1].node.get_text()
                        )
                        raw_contents.append(raw_content)

                        content = clean_text(node[number - 1].node.get_text())
                        contents.append(content)

                        metadata = dict(node[number - 1].node.metadata)
                        metadata_collection.append(metadata)

                        score = node[number - 1].score
                        scores.append(score)
                    else:
                        print(f"Invalid reference number: {number}")
                else:
                    print("No sources available")
        else:
            print("There are no references")

        response = update_response(str(response))
        contents = renumber_sources(contents)

        # Check the lengths of content and metadata
        num_content = len(contents)
        num_metadata = len(metadata_collection)

        # Add content to metadata
        for i in range(min(num_content, num_metadata)):
            metadata_collection[i]["content"] = re.sub(r"source \d+\:", "", contents[i])

        return str(response), raw_contents, contents, metadata_collection, scores
    except Exception as e:
        # Log the error and raise HTTPException for FastAPI
        logging.error(f"An error occurred in generate text: {e}")
        raise HTTPException(
            status_code=500,
            detail="An internal server error occurred in generate text.",
        )


async def generate_streaming_completion(user_request, chat_engine):
    try:
        engine = Engine()
        index_manager = IndexManager()

        # Load existing indexes
        index = index_manager.load_existing_indexes()

        # Retrieve the chat engine with the loaded index
        chat_engine = engine.get_chat_engine(index)
        # Generate completion response
        response = chat_engine.stream_chat(user_request)

        completed_response = ""

        for gen in response.response_gen:
            completed_response += gen  # Concatenate the new string
            yield BotResponseStreaming(
                content=gen, completed_content=completed_response
            )

        nodes = response.source_nodes
        for node in nodes:
            reference = str(clean_text(node.node.get_text()))
            metadata = dict(node.node.metadata)
            score = float(node.score)
            yield BotResponseStreaming(
                completed_content=completed_response,
                reference=reference,
                metadata=metadata,
                score=score,
            )
    except Exception as e:
        # Streaming responses cannot raise HTTPException mid-stream, so surface the error as an event
        yield {"error": str(e)}
api/router/__init__.py ADDED
File without changes
api/router/bot.py ADDED
@@ -0,0 +1,47 @@
from fastapi import APIRouter
from service.dto import UserPromptRequest, BotResponse

from api.function import (
    generate_streaming_completion,
    generate_completion_non_streaming,
)
from sse_starlette.sse import EventSourceResponse

router = APIRouter(tags=["Bot"])


@router.post("/bot")
async def bot_generator_general(user_prompt_request: UserPromptRequest):

    if user_prompt_request.streaming:
        return EventSourceResponse(
            generate_streaming_completion(
                user_prompt_request.prompt, user_prompt_request.streaming
            )
        )
    else:
        response, raw_references, references, metadata, scores = (
            generate_completion_non_streaming(
                user_prompt_request.prompt, user_prompt_request.streaming
            )
        )

        return BotResponse(
            content=response,
            raw_references=raw_references,
            references=references,
            metadata=metadata,
            scores=scores,
        )


@router.post("/bot/{category_id}/{title}")
async def bot_generator_spesific(
    category_id: int, title: str, user_prompt_request: UserPromptRequest
):
    pass


@router.get("/bot/{category_id}/{title}")
async def get_favourite_data(category_id: int, title: str, human_template):
    pass
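A minimal sketch of exercising the non-streaming path of POST /bot from a client. This is not part of the commit; it assumes the API runs locally on port 7860 (as in the Dockerfile) and that UserPromptRequest carries only the prompt and streaming fields used above.

import requests

payload = {"prompt": "What is hypertension?", "streaming": False}
resp = requests.post("http://localhost:7860/bot", json=payload)
resp.raise_for_status()
body = resp.json()
print(body["content"])     # bot answer with [n] citation markers preserved
print(body["references"])  # renumbered source excerpts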
api/router/health.py ADDED
@@ -0,0 +1,10 @@
from fastapi import Request
from fastapi.responses import JSONResponse
from fastapi.routing import APIRouter

router = APIRouter(tags=["Health"])


@router.get("/_health")
async def health(request: Request):
    return JSONResponse(dict(status="OK"), status_code=200)
api/router/role.py ADDED
@@ -0,0 +1,20 @@
from fastapi import APIRouter


router = APIRouter(tags=["Roles"])


@router.get("/roles")
async def get_data_roles():
    pass


@router.post("/roles")
async def add_data_roles():
    pass


@router.put("/roles/{id}")
async def update_data_roles():
    pass


@router.delete("/roles/{id}")
async def remove_data_roles():
    pass
api/router/topic.py ADDED
@@ -0,0 +1,50 @@
from fastapi import Form, APIRouter, File, UploadFile, HTTPException, Request
from db.repository import get_db_conn
from config import MYSQL_CONFIG
from api.function import data_ingestion, get_data, delete_data, update_data
from service.dto import MetadataRequest

router = APIRouter(tags=["Topics"])

db_conn = get_db_conn(MYSQL_CONFIG)


@router.post("/topic")
async def upload_file(
    title: str = Form(...),
    author: str = Form(...),
    category: str = Form(...),
    year: int = Form(...),
    publisher: str = Form(...),
    file: UploadFile = File(...),
    # content_table: UploadFile = File(...)
):

    reference = {
        "title": title,
        "author": author,
        "category": category,
        "year": year,
        "publisher": publisher,
    }

    # response = await data_ingestion(db_conn, reference, file, content_table)
    response = await data_ingestion(db_conn, reference, file)
    return {"filename": file.filename, "response": response}


@router.get("/topic")
async def get_metadata():
    results = await get_data(db_conn)
    return results


@router.put("/topic/{id}")
async def update_metadata(id: int, reference: MetadataRequest):
    response = await update_data(id, reference, db_conn)
    return response


@router.delete("/topic/{id}")
async def delete_metadata(id: int):
    response = await delete_data(id, db_conn)
    return response
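A minimal sketch of uploading a document through POST /topic. Not part of the commit; it assumes the service is reachable on localhost:7860 and that a local PDF named book.pdf exists. The form field names and the file part match the Form(...) and File(...) parameters of upload_file above; the metadata values are hypothetical.

import requests

form_fields = {
    "title": "Internal Medicine Handbook",  # hypothetical metadata
    "author": "Jane Doe",
    "category": "Medicine",
    "year": "2023",
    "publisher": "Example Press",
}
with open("book.pdf", "rb") as pdf:
    resp = requests.post(
        "http://localhost:7860/topic",
        data=form_fields,
        files={"file": ("book.pdf", pdf, "application/pdf")},
    )
print(resp.json())  # {"filename": "book.pdf", "response": ...}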
api/router/trial.py ADDED
@@ -0,0 +1,20 @@
from fastapi import APIRouter


router = APIRouter(tags=["Trial"])


@router.get("/trial")
async def get_trial_data():
    pass


@router.post("/trial")
async def add_trial_data():
    pass


@router.put("/trial/{id}")
async def update_trial_data():
    pass


@router.delete("/trial/{id}")
async def remove_trial_data():
    pass
api/router/user.py ADDED
@@ -0,0 +1,20 @@
from fastapi import APIRouter


router = APIRouter(tags=["User"])


@router.post("/login")
async def login_user():
    pass


@router.post("/register")
async def register_user():
    pass


@router.post("/forgot_password")
async def forget_password():
    pass


@router.post("/change_password")
async def change_password():
    pass
api/util/util.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,41 @@
from fastapi.applications import FastAPI
from api.router import health, topic, user, bot, trial, role
from fastapi.middleware.cors import CORSMiddleware
from api.events import register_events
from utils.utils import pipe


def create_instance() -> FastAPI:
    return FastAPI()


def add_middleware(app: FastAPI) -> FastAPI:
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    return app


def init_database(app: FastAPI) -> FastAPI:
    return app


def register_routers(app: FastAPI) -> FastAPI:
    app.include_router(user.router)
    app.include_router(topic.router)
    app.include_router(bot.router)
    app.include_router(trial.router)
    app.include_router(role.router)
    app.include_router(health.router)

    return app


def init_app() -> FastAPI:
    app: FastAPI = pipe(
        create_instance(), add_middleware, init_database, register_events, register_routers
    )
    return app


app = init_app()
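utils/utils.py is not among the 50 files shown in this view, so the pipe helper imported above is not visible here. A minimal sketch of what such a left-to-right composition helper typically looks like — an assumption, not the committed implementation:

from functools import reduce

def pipe(value, *functions):
    """Thread `value` through `functions` left to right: pipe(x, f, g) == g(f(x))."""
    return reduce(lambda acc, fn: fn(acc), functions, value)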
config.py ADDED
@@ -0,0 +1,37 @@
from pydantic_settings import BaseSettings
import os


class MysqlConfig(BaseSettings):
    DB_HOST: str = ""
    DB_PORT: str = "10707"  # Port of the hosted MySQL instance (stock MySQL default is 3306)
    DB_URI: str = ""
    DB_USERNAME: str = ""
    DB_PASSWORD: str = ""
    DB_NAME: str = ""

    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"
        extra = "allow"  # Allow extra fields


class PineconeConfig(BaseSettings):
    PINECONE_API_KEY: str = ""

    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"
        extra = "allow"  # Allow extra fields


class GPTBotConfig(BaseSettings):
    temperature: float = 0.3
    model: str = "gpt-4o-mini"
    max_tokens: int = 512
    streaming: bool = False
    api_key: str = os.environ.get("OPENAI_API_KEY", "")


# Load configuration
MYSQL_CONFIG = MysqlConfig()
PINECONE_CONFIG = PineconeConfig()
GPTBOT_CONFIG = GPTBotConfig()
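The settings classes above read their values from a .env file or the process environment; engine.py and api/events.py additionally expect REDIS_PASSWORD and the LANGFUSE_* keys. A minimal sketch of wiring these up for local development — variable names come from the fields above, values are placeholders:

import os

# Hypothetical local-development values only.
os.environ.setdefault("DB_HOST", "localhost")
os.environ.setdefault("DB_PORT", "3306")
os.environ.setdefault("DB_USERNAME", "app")
os.environ.setdefault("DB_PASSWORD", "change-me")
os.environ.setdefault("DB_NAME", "medbot")
os.environ.setdefault("PINECONE_API_KEY", "pc-...")
os.environ.setdefault("OPENAI_API_KEY", "sk-...")

from config import MYSQL_CONFIG, GPTBOT_CONFIG  # values are picked up at instantiation
print(MYSQL_CONFIG.DB_HOST, GPTBOT_CONFIG.model)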
core/__init__.py ADDED
File without changes
core/book_enabler/__init__.py ADDED
File without changes
core/chat/__init__.py ADDED
File without changes
core/chat/engine.py ADDED
@@ -0,0 +1,130 @@
from typing import Any
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterCondition,
)

from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.agent.openai import OpenAIAgent
from llama_index.llms.openai import OpenAI
from llama_index.storage.chat_store.redis import RedisChatStore
from llama_index.core.storage.chat_store import SimpleChatStore
from llama_index.core.query_engine import CitationQueryEngine
from llama_index.core import Settings

from config import GPTBOT_CONFIG
from core.prompt import SYSTEM_BOT_TEMPLATE
import redis
import os


class Engine:
    def __init__(self):
        self.llm = OpenAI(
            temperature=GPTBOT_CONFIG.temperature,
            model=GPTBOT_CONFIG.model,
            max_tokens=GPTBOT_CONFIG.max_tokens,
            api_key=GPTBOT_CONFIG.api_key,
        )

        Settings.llm = self.llm

    def initialize_memory_bot(self, user_id="1"):
        redis_client = redis.Redis(
            host="redis-10365.c244.us-east-1-2.ec2.redns.redis-cloud.com",
            port=10365,
            password=os.environ.get("REDIS_PASSWORD"),
        )
        # chat_store = SimpleChatStore()
        chat_store = RedisChatStore(redis_client=redis_client, ttl=3600)  # Needs to be configured
        memory = ChatMemoryBuffer.from_defaults(
            token_limit=3000, chat_store=chat_store, chat_store_key=user_id
        )

        return memory

    def _build_description_bot(self, title, category):
        try:
            prompt = f"Write a detailed description for an OpenAI agent with the title '{title}' and categorized under '{category}'."
            description = self.llm.complete(prompt)

            return description

        except Exception as e:
            return f"Error generating description: {str(e)}"

    def index_to_query_engine(self, title, category, index):
        filters = MetadataFilters(
            filters=[
                MetadataFilter(key="title", value=title),
                MetadataFilter(key="category", value=category),
            ],
            condition=FilterCondition.AND,
        )

        # Create the query engine with the index and filters
        kwargs = {"similarity_top_k": 5, "filters": filters}

        query_engine = index.as_query_engine(**kwargs)

        return query_engine

    def get_citation_engine(self, title, category, index):
        filters = MetadataFilters(
            filters=[
                MetadataFilter(key="title", value=title),
                MetadataFilter(key="category", value=category),
            ],
            condition=FilterCondition.AND,
        )

        # Create the citation engine from a filtered retriever
        kwargs = {"similarity_top_k": 5, "filters": filters}

        retriever = index.as_retriever(**kwargs)

        citation_engine = CitationQueryEngine(retriever=retriever)

        return citation_engine

    def get_chat_engine(self, index, title=None, category=None, type="general"):
        # Build the citation engine and tool description based on the requested type
        if type == "general":
            # query_engine = index.as_query_engine(similarity_top_k=3)
            citation_engine = CitationQueryEngine.from_args(index, similarity_top_k=5)
            description = "A book containing information about medicine"
        else:
            query_engine = self.index_to_query_engine(title, category, index)
            citation_engine = self.get_citation_engine(title, category, index)
            description = self._build_description_bot(title, category)

        metadata = ToolMetadata(
            name="bot-belajar",
            description=description
        )
        print(metadata)

        vector_query_engine = QueryEngineTool(
            query_engine=citation_engine,
            metadata=metadata
        )
        print(vector_query_engine)

        # Initialize the OpenAI agent with the tools
        chat_engine = OpenAIAgent.from_tools(
            tools=[vector_query_engine],
            llm=self.llm,
            memory=self.initialize_memory_bot(),
            system_prompt=SYSTEM_BOT_TEMPLATE,
        )

        return chat_engine

    def get_chat_history(self):
        pass
core/chat/messaging.py ADDED
@@ -0,0 +1,63 @@
# Experimental

from typing import Dict, Any, Optional, List
import asyncio
import logging
from uuid import uuid4
from anyio import ClosedResourceError
from anyio.streams.memory import MemoryObjectSendStream

from llama_index.core.callbacks.base import BaseCallbackHandler, CallbackManager
from llama_index.core.callbacks import CBEventType, EventPayload
from llama_index.core.query_engine.sub_question_query_engine import (
    SubQuestionAnswerPair,
)
from llama_index.core.chat_engine.types import StreamingAgentChatResponse
from pydantic import BaseModel

from core.chat import schema

from db.db import MessageSubProcessSourceEnum
from core.chat.schema import SubProcessMetadataKeysEnum, SubProcessMetadataMap
from core.chat.engine import Engine
from script.build_vector import IndexManager
from service.dto import UserPromptRequest

logger = logging.getLogger(__name__)


class StreamedMessage(BaseModel):
    content: str


async def handle_chat_message(
    user_message: str,
    send_chan: MemoryObjectSendStream,
) -> None:
    async with send_chan:
        engine = Engine()

        index_manager = IndexManager()
        index = index_manager.load_existing_indexes()

        # Retrieve the chat engine with the loaded index (get_chat_engine is synchronous)
        chat_engine = engine.get_chat_engine(index)

        logger.debug("Engine received")
        streaming_chat_response: StreamingAgentChatResponse = (
            await chat_engine.astream_chat(user_message)
        )
        response_str = ""
        async for text in streaming_chat_response.async_response_gen():
            response_str += text
            if send_chan._closed:
                logger.debug(
                    "Received streamed token after send channel closed. Ignoring."
                )
                return
            await send_chan.send(StreamedMessage(content=response_str))

        if response_str.strip() == "":
            await send_chan.send(
                StreamedMessage(
                    content="Sorry, I either wasn't able to understand your question or I don't have an answer for it."
                )
            )
core/chat/schema.py ADDED
@@ -0,0 +1,162 @@
# Experimental

from pydantic import BaseModel, Field, field_validator
from typing import List, Optional, Dict, Union, Any
from enum import Enum
from uuid import UUID
from datetime import datetime
from llama_index.core.schema import BaseNode, NodeWithScore
from llama_index.core.callbacks.schema import EventPayload
from llama_index.core.query_engine.sub_question_query_engine import SubQuestionAnswerPair
from db.db import (
    MessageRoleEnum,
    MessageStatusEnum,
    MessageSubProcessSourceEnum,
    MessageSubProcessStatusEnum,
)

DB_DOC_ID_KEY = "db_document_id"


class Base(BaseModel):
    id: Optional[UUID] = Field(None, description="Unique identifier")
    created_at: Optional[datetime] = Field(None, description="Creation datetime")
    updated_at: Optional[datetime] = Field(None, description="Update datetime")

    class Config:
        orm_mode = True


class BaseMetadataObject(BaseModel):
    class Config:
        orm_mode = True


class Citation(BaseMetadataObject):
    document_id: UUID
    text: str
    page_number: int
    score: Optional[float]

    @field_validator("document_id")
    def validate_document_id(cls, value):
        if value:
            return str(value)
        return value

    @classmethod
    def from_node(cls, node_w_score: NodeWithScore) -> "Citation":
        node: BaseNode = node_w_score.node
        page_number = int(node.source_node.metadata["page_label"])
        document_id = node.source_node.metadata[DB_DOC_ID_KEY]
        return cls(
            document_id=document_id,
            text=node.get_content(),
            page_number=page_number,
            score=node_w_score.score,
        )


class QuestionAnswerPair(BaseMetadataObject):
    """
    A question-answer pair that is used to store the sub-questions and answers
    """

    question: str
    answer: Optional[str]
    citations: Optional[List[Citation]] = None

    @classmethod
    def from_sub_question_answer_pair(
        cls, sub_question_answer_pair: SubQuestionAnswerPair
    ):
        if sub_question_answer_pair.sources is None:
            citations = None
        else:
            citations = [
                Citation.from_node(node_w_score)
                for node_w_score in sub_question_answer_pair.sources
                if node_w_score.node.source_node is not None
                and DB_DOC_ID_KEY in node_w_score.node.source_node.metadata
            ]
            citations = citations or None
        return cls(
            question=sub_question_answer_pair.sub_q.sub_question,
            answer=sub_question_answer_pair.answer,
            citations=citations,
        )


# later will be Union[QuestionAnswerPair, more to add later... ]
class SubProcessMetadataKeysEnum(str, Enum):
    SUB_QUESTION = EventPayload.SUB_QUESTION.value


# keeping the typing pretty loose here, in case there are changes to the metadata data formats.
SubProcessMetadataMap = Dict[Union[SubProcessMetadataKeysEnum, str], Any]


class MessageSubProcess(Base):
    message_id: UUID
    source: MessageSubProcessSourceEnum
    status: MessageSubProcessStatusEnum
    metadata_map: Optional[SubProcessMetadataMap]


class Message(Base):
    conversation_id: UUID
    content: str
    role: MessageRoleEnum
    status: MessageStatusEnum
    sub_processes: List[MessageSubProcess]


class UserMessageCreate(BaseModel):
    content: str


class DocumentMetadataKeysEnum(str, Enum):
    """
    Enum for the keys of the metadata map for a document
    """

    SEC_DOCUMENT = "sec_document"


class SecDocumentTypeEnum(str, Enum):
    """
    Enum for the type of sec document
    """

    TEN_K = "10-K"
    TEN_Q = "10-Q"


class SecDocumentMetadata(BaseModel):
    """
    Metadata for a document that is a sec document
    """

    company_name: str
    company_ticker: str
    doc_type: SecDocumentTypeEnum
    year: int
    quarter: Optional[int]
    accession_number: Optional[str]
    cik: Optional[str]
    period_of_report_date: Optional[datetime]
    filed_as_of_date: Optional[datetime]
    date_as_of_change: Optional[datetime]


DocumentMetadataMap = Dict[Union[DocumentMetadataKeysEnum, str], Any]


class Document(Base):
    url: str
    metadata_map: Optional[DocumentMetadataMap] = None


class Conversation(Base):
    messages: List[Message]
    documents: List[Document]


class ConversationCreate(BaseModel):
    document_ids: List[UUID]
core/journal_reading/__init__.py ADDED
File without changes
core/journal_reading/extractor.py ADDED
File without changes
core/journal_reading/upload.py ADDED
@@ -0,0 +1,30 @@
import tempfile
import os
from llama_parse import LlamaParse
from llama_index.core.node_parser import SimpleNodeParser


class JournalUploader:
    def __init__(self):
        # NOTE: parser_journal expects self.s3_client and self.bucket_name
        # to be attached to the instance before it is called.
        pass

    def parser_journal(self, object_name, local_file_name=None):
        if local_file_name is None:
            local_file_name = "downloaded_pdf_file.pdf"  # Default file name

        try:
            # Create a temporary directory to store the file
            temp_dir = tempfile.mkdtemp()
            file_path = os.path.join(temp_dir, local_file_name)

            with open(file_path, "wb") as temp_file:
                self.s3_client.download_fileobj(self.bucket_name, object_name, temp_file)

            documents = LlamaParse(result_type="markdown").load_data(file_path)

            return documents

        except Exception as e:
            # Handle specific exceptions or fall back to a generic one
            print(f"Error reading PDF file: {e}")
            raise RuntimeError(f"Failed to process the uploaded file: {e}")
core/module_creator/__init__.py ADDED
File without changes
core/parser.py ADDED
@@ -0,0 +1,92 @@
import re


def parse_topics_to_dict(text):
    topics = {}
    lines = text.strip().split("\n")
    current_topic = None

    topic_pattern = re.compile(r"^\d+\.\s+(.*)$")
    sub_topic_pattern = re.compile(r"^\*\s+(.*)$")

    for line in lines:
        line = line.strip()
        if topic_pattern.match(line):
            current_topic = topic_pattern.match(line).group(1)
            topics[current_topic] = []
        elif sub_topic_pattern.match(line):
            sub_topic = sub_topic_pattern.match(line).group(1)
            if current_topic:
                topics[current_topic].append(sub_topic)

    print(topics)
    return topics


def remove_all_sources(text):
    # Construct a regular expression pattern to match all sources
    pattern = r"Source \d+:(.*?)(?=Source \d+:|$)"

    # Use re.DOTALL to make '.' match newlines and re.IGNORECASE for case-insensitive matching
    updated_text = re.sub(pattern, "", text, flags=re.DOTALL)

    return updated_text.strip()


def clean_text(text):
    # Replace multiple spaces with a single space
    text = re.sub(r"\s{2,}", " ", text)
    # Remove newline characters that are not followed by a number (to keep lists or numbered points)
    text = re.sub(r"\n(?!\s*\d)", " ", text)
    # Remove unnecessary punctuation (optional, adjust as needed)
    text = re.sub(r";(?=\S)", "", text)
    # Optional: Remove extra spaces around certain characters
    text = re.sub(r"\s*([,;])\s*", r"\1 ", text)
    # Normalize whitespace to a single space
    text = re.sub(r"\s+", " ", text).strip()

    return text


def update_response(text):
    # Find all the references in the text, e.g., [1], [3], [5]
    responses = re.findall(r"\[\d+\]", text)

    # Extract the numbers from the responses, and remove duplicates
    ref_numbers = sorted(set(int(respon.strip("[]")) for respon in responses))

    # Create a mapping from old reference numbers to new ones
    ref_mapping = {old: new for new, old in enumerate(ref_numbers, start=1)}

    # Replace old responses with the updated responses in the text
    for old, new in ref_mapping.items():
        text = re.sub(rf"\[{old}\]", f"[{new}]", text)

    return text


def renumber_sources(source_list):
    new_sources = []
    for i, source in enumerate(source_list):
        # Extract the content after the colon
        content = source.split(": ", 1)[1]
        # Add the new source number and content
        new_sources.append(f"source {i+1}: {content}")
    return new_sources


def seperate_to_list(text):
    # Step 1: Split the text by line breaks (\n)
    lines = text.split("\n")

    # Step 2: Remove occurrences of "source (number):"
    cleaned_lines = [re.sub(r"Source \d+\:", "", line) for line in lines]

    # Step 3: Split fully capitalized sentences out of each line
    final_output = []
    for line in cleaned_lines:
        # Split any fully capitalized sentence (surrounding non-uppercase text remains intact)
        split_line = re.split(r"([A-Z\s]+[.!?])", line)
        final_output.extend([part.strip() for part in split_line if part.strip()])

    return final_output
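A small illustrative check of the renumbering helpers above (hypothetical strings, not part of the commit): update_response rewrites citation markers to a dense 1..n sequence, and renumber_sources relabels the matching source excerpts to the same order.

from core.parser import update_response, renumber_sources

text = "Hypertension is common [2], and treatment is individualized [5]."
print(update_response(text))
# -> "Hypertension is common [1], and treatment is individualized [2]."

sources = ["Source 2: prevalence figures ...", "Source 5: treatment guidance ..."]
print(renumber_sources(sources))
# -> ["source 1: prevalence figures ...", "source 2: treatment guidance ..."]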
core/prompt.py ADDED
@@ -0,0 +1,122 @@
SYSTEM_BOT_TEMPLATE = """
Kamu adalah Medbot, seorang ahli dalam bidang kedokteran. Tugasmu adalah memberikan jawaban yang informatif dan akurat berdasarkan tools yang tersedia dan jangan menghapus referensi atau angka dalam kurung siku, contoh [1], [2] dan sebagainya.

**Instruksi**:

1. **Jawaban Berdasarkan Tools**: Jika pengguna bertanya tentang topik kedokteran, gunakanlah tools yang tersedia untuk memberikan jawaban. Pastikan jawabanmu relevan dan sesuai dengan informasi dari tools tersebut.

2. **Referensi dan Kutipan**: Jangan menghapus sumber kutipan dari teks yang diberikan. Contohnya, jika teksnya adalah "Ilmu kedokteran sangat dibutuhkan [2]", pastikan untuk menyertakan kutipan sumbernya yaitu [2] dalam jawabanmu.

3. **Ketika Tidak Tahu Jawaban**: Jika pertanyaan pengguna tidak dapat dijawab dengan menggunakan tools ini, sampaikan dengan sopan bahwa kamu tidak memiliki jawaban untuk pertanyaan tersebut. Arahkan pengguna untuk mencari informasi lebih lanjut atau bertanya pada ahli di bidang kedokteran.

4. **Gaya Jawaban**: Berikan jawaban dengan gaya yang ramah dan profesional. Hindari penggunaan poin-poin, dan sampaikan informasi secara naratif agar lebih mudah dipahami. Gunakan kata 'dok' atau 'dokter' untuk merujuk pada dokter, dan hindari kesan monoton dengan menambahkan emotikon jika sesuai.

5. **Penutup**: Akhiri komunikasi dengan kalimat yang friendly, seperti "Semoga informasi ini bermanfaat, dok ✨" atau "Jika ada pertanyaan lain, jangan ragu untuk bertanya ya dok 😊"
"""

SYSTEM_TOPIC_TEMPLATE = """
You are tasked with analyzing a table of contents from a book. Your goal is to identify and extract the main topics and subtopics. Please provide a clear and organized list of these topics and subtopics. The list should reflect the structure and hierarchy presented in the table of contents.
"""

USER_TOPIC_TEMPLATE = """

**Task:** Analyze the table of contents of a book to identify the main topics and relevant subtopics.

**Instructions:**

1. **Main Topics:** Identify the main topics from the table of contents, excluding sections like background, preface, introduction, and references.
2. **Subtopics:** For each main topic, list the related subtopics.

**Output Format:**

1. **Main Topic 1**
* Subtopic 1
* Subtopic 2
* etc.

2. **Main Topic 2**
* Subtopic 1
* Subtopic 2
* etc.

**Important Guidelines:**

- Include only relevant main topics and subtopics.
- Ensure the order of topics and subtopics matches the order displayed in the table of contents.
- Use the correct format and do not include additional information beyond the main topics and subtopics.
"""

REFINED_GET_TOPIC_TEMPLATE = """
Ensure the following topic and subtopic are provided:

{topics}

Follow this format:

1. **Main topic 1**
* Subtopic 1
* Subtopic 2
* etc

2. **Main topic 2**
* Subtopic 1
* Subtopic 2
* etc

etc

Do not add any additional text; only use the specified format.
"""

ADD_METADATA_TEMPLATE = """
**Context for Adding Metadata**

{context_str}

**Context Structure:**

1. **Main Topic 1**
* Subtopic 1
* Subtopic 2
* etc

2. **Main Topic 2**
* Subtopic 1
* Subtopic 2
* etc

**Given:**
- **Topic and Subtopic:** {{extractor_output}}

**Role:**
Your task is to extract and organize metadata for the {class_name}. Follow the instructions below:

**Instructions:**

1. **Extract the Main Topic:**
- **Goal:** Identify the overarching theme or subject from the provided topic and subtopic.
- **How:** Look for a theme broad enough to encompass the document's primary focus while remaining specific enough to reflect its core purpose.
- **Tip:** Ensure the main topic is concise yet descriptive, providing a clear understanding of the document's primary theme. If the content is general or introductory (e.g., background, preface, introduction, references), categorize it accordingly.

2. **Extract the Key Subtopic (if applicable):**
- **Goal:** Determine the most relevant supporting element related to the main topic.
- **How:** Identify a sub-element or detail that provides additional depth or clarification to the main topic.
- **Tip:** Ensure the subtopic directly supports or elaborates on the main topic.

3. **Handle Cases Without a Clear Subtopic:**
- **Goal:** If no distinct subtopic is present, set the subtopic to mirror the main topic.
- **How:** In such cases, consider the main topic comprehensive enough to stand alone without additional subtopics.

4. **Record the Extracted Data:**
- **Goal:** Organize and record the extracted topic and subtopic within the {class_name} class.
- **How:** Structure the entries clearly and precisely as attributes of the class.
- **Tip:** Use precise language to capture the relationship between the main topic and subtopic, ensuring clarity and ease of reference for future use.
"""

SUMMARIZER_SYSTEM_TEMPLATE = """

"""

SUMMARIER_HUMAN_TEMPLATE = """

"""
core/summarization/__init__.py ADDED
File without changes
core/summarization/summarizer.py ADDED
@@ -0,0 +1,135 @@
from io import BytesIO
import os
import base64
import fitz

from fastapi import HTTPException
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterCondition,
)

from llama_index.core import load_index_from_storage
from llama_index.core.storage import StorageContext
from llama_index.llms.openai import OpenAI
from core.parser import parse_topics_to_dict
from llama_index.core.llms import ChatMessage
from core.prompt import (
    SYSTEM_TOPIC_TEMPLATE,
    USER_TOPIC_TEMPLATE,
    REFINED_GET_TOPIC_TEMPLATE,
)

# from langfuse.openai import openai


class SummarizeGenerator:
    def __init__(self, references):

        self.references = references
        self.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=4096)

    def extract_pages(self, content_table):
        try:
            content_bytes = content_table.file.read()
            print(content_bytes)
            # Open the PDF file
            content_table = fitz.open(stream=content_bytes, filetype="pdf")
            print(content_table)
            # content_table = fitz.open(topics_image)
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Error opening PDF file: {e}")

        # Initialize a list to collect base64 encoded images
        pix_encoded_combined = []

        # Iterate over each page to extract images
        for page_number in range(len(content_table)):
            try:
                page = content_table.load_page(page_number)
                pix_encoded = self._extract_image_as_base64(page)
                pix_encoded_combined.append(pix_encoded)
                # print("pix encoded combined", pix_encoded_combined)

            except Exception as e:
                print(f"Error processing page {page_number}: {e}")
                continue  # Skip to the next page if there's an error

        if not pix_encoded_combined:
            raise HTTPException(status_code=404, detail="No images found in the PDF")

        return pix_encoded_combined

    def extract_content_table(self, content_table):
        try:
            images = self.extract_pages(content_table)

            image_messages = [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image}",
                    },
                }
                for image in images
            ]

            messages = [
                ChatMessage(
                    role="system",
                    content=[{"type": "text", "text": SYSTEM_TOPIC_TEMPLATE}],
                ),
                ChatMessage(
                    role="user",
                    content=[
                        {"type": "text", "text": USER_TOPIC_TEMPLATE},
                        *image_messages,
                    ],
                ),
            ]

            extractor_output = self.llm.chat(messages)
            print("extractor output : ", extractor_output)
            refined_extractor_output = self.llm.complete(
                REFINED_GET_TOPIC_TEMPLATE.format(topics=str(extractor_output))
            )

            print("refined extractor output : ", str(refined_extractor_output))

            extractor_dics = dict(parse_topics_to_dict(str(refined_extractor_output)))

            return str(refined_extractor_output), extractor_dics

        except Exception as e:
            raise HTTPException(status_code=500, detail=f"An error occurred: {e}")

    def _extract_image_as_base64(self, page):
        try:
            pix = page.get_pixmap()
            pix_bytes = pix.tobytes()
            return base64.b64encode(pix_bytes).decode("utf-8")
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Error extracting image: {e}")

    def index_summarizer_engine(self, topic, subtopic, index):
        filters = MetadataFilters(
            filters=[
                MetadataFilter(key="title", value=topic),
                MetadataFilter(key="category", value=subtopic),
            ],
            condition=FilterCondition.AND,
        )

        # Create the query engine with the index and filters
        kwargs = {"similarity_top_k": 5, "filters": filters}

        query_engine = index.as_query_engine(**kwargs)

        return query_engine

    def get_summarizer_engine(self, topic, subtopic):
        pass

    def prepare_summaries(self):
        pass
core/tools.py ADDED
File without changes
db/__init__.py ADDED
File without changes
db/db.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ # Experimental
+
+ from sqlalchemy import Column, String, ForeignKey, DateTime
+ from sqlalchemy.dialects.postgresql import UUID, ENUM, JSONB
+ from sqlalchemy.orm import relationship
+ from sqlalchemy.sql import func
+ from enum import Enum
+ from sqlalchemy.ext.declarative import as_declarative, declared_attr
+ from llama_index.core.callbacks.schema import CBEventType
+
+
+ # Model
+ @as_declarative()
+ class Base:
+     id = Column(UUID, primary_key=True, index=True, default=func.uuid_generate_v4())
+     created_at = Column(DateTime, server_default=func.now(), nullable=False)
+     updated_at = Column(
+         DateTime, server_default=func.now(), onupdate=func.now(), nullable=False
+     )
+
+     __name__: str
+
+     # Generate __tablename__ automatically
+     @declared_attr
+     def __tablename__(cls) -> str:
+         return cls.__name__.lower()
+
+ # Enums used by the models below
+ class MessageRoleEnum(str, Enum):
+     user = "user"
+     assistant = "assistant"
+
+
+ class MessageStatusEnum(str, Enum):
+     PENDING = "PENDING"
+     SUCCESS = "SUCCESS"
+     ERROR = "ERROR"
+
+
+ class MessageSubProcessStatusEnum(str, Enum):
+     PENDING = "PENDING"
+     FINISHED = "FINISHED"
+
+
+ # Python doesn't allow enums to be extended, so the extra members are appended here
+ additional_message_subprocess_fields = {
+     "CONSTRUCTED_QUERY_ENGINE": "constructed_query_engine",
+     "SUB_QUESTIONS": "sub_questions",
+ }
+ MessageSubProcessSourceEnum = Enum(
+     "MessageSubProcessSourceEnum",
+     [(event_type.name, event_type.value) for event_type in CBEventType]
+     + list(additional_message_subprocess_fields.items()),
+ )
+
+
+ def to_pg_enum(enum_class) -> ENUM:
+     return ENUM(enum_class, name=enum_class.__name__)
+
+
+ class Document(Base):
+     """
+     A document along with its metadata
+     """
+
+     # URL to the actual document (e.g. a PDF)
+     url = Column(String, nullable=False, unique=True)
+     metadata_map = Column(JSONB, nullable=True)
+     conversations = relationship("ConversationDocument", back_populates="document")
+
+
+ class Conversation(Base):
+     """
+     A conversation with messages and linked documents
+     """
+
+     messages = relationship("Message", back_populates="conversation")
+     conversation_documents = relationship(
+         "ConversationDocument", back_populates="conversation"
+     )
+
+
+ class ConversationDocument(Base):
+     """
+     A many-to-many relationship between a conversation and a document
+     """
+
+     conversation_id = Column(
+         UUID(as_uuid=True), ForeignKey("conversation.id"), index=True
+     )
+     document_id = Column(UUID(as_uuid=True), ForeignKey("document.id"), index=True)
+     conversation = relationship("Conversation", back_populates="conversation_documents")
+     document = relationship("Document", back_populates="conversations")
+
+
+ class Message(Base):
+     """
+     A message in a conversation
+     """
+
+     conversation_id = Column(
+         UUID(as_uuid=True), ForeignKey("conversation.id"), index=True
+     )
+     content = Column(String)
+     role = Column(to_pg_enum(MessageRoleEnum))
+     status = Column(to_pg_enum(MessageStatusEnum), default=MessageStatusEnum.PENDING)
+     conversation = relationship("Conversation", back_populates="messages")
+     sub_processes = relationship("MessageSubProcess", back_populates="message")
+
+
+ class MessageSubProcess(Base):
+     """
+     A record of a sub-process that occurred as part of the generation of a message from an AI assistant
+     """
+
+     message_id = Column(UUID(as_uuid=True), ForeignKey("message.id"), index=True)
+     source = Column(to_pg_enum(MessageSubProcessSourceEnum))
+     message = relationship("Message", back_populates="sub_processes")
+     status = Column(
+         to_pg_enum(MessageSubProcessStatusEnum),
+         default=MessageSubProcessStatusEnum.FINISHED,
+         nullable=False,
+     )
+     metadata_map = Column(JSONB, nullable=True)
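A hedged sketch of materializing this experimental schema. It assumes a local Postgres database, a synchronous driver such as psycopg2 (not pinned in this commit's requirements), and that the `uuid-ossp` extension can be installed so the `uuid_generate_v4()` column default works; the connection URL is an example, not the project's real configuration:

from sqlalchemy import create_engine, text
from db.db import Base  # the declarative base defined above

engine = create_engine("postgresql+psycopg2://user:password@localhost/summarizer")

with engine.begin() as conn:
    # uuid_generate_v4(), used as the id default, lives in the uuid-ossp extension.
    conn.execute(text('CREATE EXTENSION IF NOT EXISTS "uuid-ossp"'))

# Creates the document, conversation, conversationdocument, message and
# messagesubprocess tables derived from the models above.
Base.metadata.create_all(bind=engine)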
db/delete_data.py ADDED
@@ -0,0 +1,22 @@
+ import logging
+ from db.repository import Repository, get_db_conn
+
+ # Setup logging (configure as needed)
+ logging.basicConfig(level=logging.INFO)
+
+
+ class DeleteDatabase(Repository):
+     async def delete_record(self, params):
+         if "id" not in params:
+             raise ValueError("The 'id' parameter is required.")
+         query = """
+             DELETE FROM Metadata
+             WHERE id = :id
+         """
+
+         try:
+             await self._exec(query, params)
+             logging.info(f"Record with id {params['id']} deleted successfully.")
+         except Exception as e:
+             logging.error(f"Error deleting record with id {params['id']}: {e}")
+             raise
db/get_data.py ADDED
@@ -0,0 +1,56 @@
+ import logging
+ from db.repository import Repository, get_db_conn
+
+ # Setup logging (configure as needed)
+ logging.basicConfig(level=logging.INFO)
+
+
+ class GetDatabase(Repository):
+     def __init__(self, db_conn):
+         super().__init__(db_conn)
+
+     async def execute_query(self, query, params=None, fetch_one=False):
+         """
+         Helper function to execute SQL queries and handle exceptions.
+         """
+         try:
+             if fetch_one:
+                 results = await self._fetch_one(query, params)
+             else:
+                 results = await self.get_by_query(query, params)
+             return results if results else None
+         except Exception as e:
+             logging.error(f"An error occurred while executing query: {e}")
+             return None
+
+     async def get_data(self, title):
+         """
+         Fetch the first result matching the given title from the metadata table.
+         """
+         # The databases library expects named placeholders and a dict of values.
+         query = """
+             SELECT * FROM Metadata
+             WHERE title = :title
+             LIMIT 5;
+         """
+
+         try:
+             results = await self.execute_query(query, {"title": title}, fetch_one=True)
+             return results
+         except Exception as e:
+             logging.error(f"An error occurred while fetching data: {e}")
+             return None
+
+     async def get_all_data(self):
+         """
+         Fetch all data from the metadata table.
+         """
+         query = """
+             SELECT * FROM Metadata
+         """
+         results = await self.execute_query(query)
+         return results
db/repository.py ADDED
@@ -0,0 +1,36 @@
+ from databases import Database
+ import datetime
+
+
+ def get_db_conn(config):
+     db_url = f"{config.DB_URI}"
+     return Database(db_url)
+
+
+ class Repository:
+     def __init__(self, db_conn):
+         self.db_conn = db_conn
+
+     async def get_by_query(self, query, param):
+         results = await self.db_conn.fetch_all(query, param)
+         return [dict(result) for result in results]
+
+     async def _fetch_one(self, query, param):
+         result = await self.db_conn.fetch_one(query, param)
+         return dict(result) if result is not None else result
+
+     async def _exec(self, query, param):
+         return await self.db_conn.execute(query, param)
+
+     async def _exec_many(self, query, params):
+         return await self.db_conn.execute_many(query, params)
+
+     def update_params(self, params, update=False):
+         # Stamp createdAt/updatedAt on insert, or only updatedAt on update.
+         current_time = datetime.datetime.now()
+         if not update:
+             params.update({"createdAt": current_time, "updatedAt": current_time})
+         else:
+             params.update({"updatedAt": current_time})
+         return params
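A hedged end-to-end sketch of how these repository classes appear to be wired together. The `config` module exposing `DB_URI` and the exact field values are assumptions; only the classes themselves are defined in this commit:

import asyncio

import config  # assumed to expose DB_URI
from db.repository import get_db_conn
from db.save_data import InsertDatabase
from db.get_data import GetDatabase


async def main():
    db = get_db_conn(config)  # databases.Database instance
    await db.connect()
    try:
        await InsertDatabase(db).insert_data({
            "title": "Example Book",
            "category": "Artificial Intelligence",
            "author": "Jane Doe",
            "year": 2024,
            "publisher": "Example Press",
        })
        rows = await GetDatabase(db).get_data("Example Book")
        print(rows)
    finally:
        await db.disconnect()


asyncio.run(main())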
db/save_data.py ADDED
@@ -0,0 +1,30 @@
+ import logging
+ from dotenv import load_dotenv
+ from db.repository import Repository
+
+ load_dotenv()
+
+
+ class InsertDatabase(Repository):
+
+     # Insert a metadata record asynchronously
+     async def insert_data(self, params):
+         # SQL insert query with named placeholders
+         query = """
+             INSERT INTO Metadata (title, category, author, year, publisher, createdAt, updatedAt)
+             VALUES (:title, :category, :author, :year, :publisher, :createdAt, :updatedAt)
+         """
+
+         # Add createdAt/updatedAt timestamps before binding the parameters
+         reference = self.update_params(params)
+
+         try:
+             # Execute the query with the provided values
+             await self._exec(query, reference)
+             logging.info(
+                 f"Data inserted successfully: {reference['title']}, {reference['author']}"
+             )
+         except Exception as e:
+             # Log any errors that occur during the database insert operation
+             logging.error(f"Failed to insert data: {e}")
+             raise  # Re-raise the exception to allow further handling if needed
db/update_data.py ADDED
@@ -0,0 +1,36 @@
+ import logging
+ from db.repository import Repository, get_db_conn
+
+ # Setup logging (configure as needed)
+ logging.basicConfig(level=logging.INFO)
+
+
+ class UpdateDatabase(Repository):
+     async def update_record(self, reference):
+         if "id" not in reference:
+             raise ValueError("The 'id' parameter is required.")
+         query = """
+             UPDATE Metadata
+             SET title = :title,
+                 category = :category,
+                 author = :author,
+                 year = :year,
+                 publisher = :publisher,
+                 updatedAt = :updatedAt
+             WHERE id = :id
+         """
+
+         # Refresh only the updatedAt timestamp for an update
+         updated_reference = self.update_params(reference, update=True)
+
+         try:
+             await self._exec(query, updated_reference)
+             logging.info(
+                 f"Record with id {updated_reference['id']} updated successfully."
+             )
+         except Exception as e:
+             logging.error(
+                 f"Error updating record with id {updated_reference['id']}: {e}"
+             )
+             raise
requirements.txt ADDED
@@ -0,0 +1,162 @@
+ aiohappyeyeballs==2.4.0
+ aiohttp==3.10.5
+ aiomysql==0.2.0
+ aiosignal==1.3.1
+ annotated-types==0.7.0
+ anyio==4.4.0
+ asgiref==3.8.1
+ attrs==24.2.0
+ backoff==2.2.1
+ bcrypt==4.2.0
+ beautifulsoup4==4.12.3
+ boto3==1.35.24
+ botocore==1.35.24
+ build==1.2.2
+ cachetools==5.5.0
+ certifi==2024.8.30
+ chardet==5.2.0
+ charset-normalizer==3.3.2
+ chroma-hnswlib==0.7.6
+ chromadb==0.5.7
+ click==8.1.7
+ coloredlogs==15.0.1
+ databases==0.9.0
+ dataclasses-json==0.6.7
+ Deprecated==1.2.14
+ dirtyjson==1.0.8
+ distro==1.9.0
+ dnspython==1.16.0
+ fastapi==0.113.0
+ filelock==3.16.1
+ flatbuffers==24.3.25
+ frozenlist==1.4.1
+ fsspec==2024.9.0
+ google-auth==2.34.0
+ googleapis-common-protos==1.65.0
+ greenlet==3.0.3
+ grpcio==1.66.1
+ h11==0.14.0
+ httpcore==1.0.5
+ httptools==0.6.1
+ httpx==0.27.2
+ huggingface-hub==0.25.0
+ humanfriendly==10.0
+ idna==3.8
+ importlib_metadata==8.4.0
+ importlib_resources==6.4.5
+ Jinja2==3.1.4
+ jiter==0.5.0
+ joblib==1.4.2
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ kubernetes==30.1.0
+ langchain==0.3.0
+ langchain-community==0.3.0
+ langchain-core==0.3.1
+ langchain-openai==0.2.0
+ langchain-text-splitters==0.3.0
+ langchainhub==0.1.21
+ langfuse==2.48.1
+ langsmith==0.1.123
+ llama-cloud==0.0.17
+ llama-index==0.11.10
+ llama-index-agent-openai==0.3.1
+ llama-index-callbacks-langfuse==0.2.0
+ llama-index-cli==0.3.1
+ llama-index-core==0.11.10
+ llama-index-embeddings-openai==0.2.4
+ llama-index-indices-managed-llama-cloud==0.3.0
+ llama-index-legacy==0.9.48.post3
+ llama-index-llms-openai==0.2.7
+ llama-index-multi-modal-llms-openai==0.2.0
+ llama-index-program-openai==0.2.0
+ llama-index-question-gen-openai==0.2.0
+ llama-index-readers-file==0.2.1
+ llama-index-readers-llama-parse==0.3.0
+ llama-index-storage-chat-store-redis==0.2.0
+ llama-index-vector-stores-pinecone==0.2.1
+ llama-parse==0.5.2
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ marshmallow==3.22.0
+ mdurl==0.1.2
+ mmh3==5.0.0
+ monotonic==1.6
+ mpmath==1.3.0
+ multidict==6.0.5
+ mypy-extensions==1.0.0
+ mysqlclient==2.2.4
+ nest_asyncio==1.6.0
+ networkx==3.3
+ nltk==3.9.1
+ numpy==1.26.4
+ oauthlib==3.2.2
+ onnxruntime==1.19.2
+ openai==1.43.1
+ opentelemetry-api==1.27.0
+ opentelemetry-exporter-otlp-proto-common==1.27.0
+ opentelemetry-exporter-otlp-proto-grpc==1.27.0
+ opentelemetry-instrumentation==0.48b0
+ opentelemetry-instrumentation-asgi==0.48b0
+ opentelemetry-instrumentation-fastapi==0.48b0
+ opentelemetry-proto==1.27.0
+ opentelemetry-sdk==1.27.0
+ opentelemetry-semantic-conventions==0.48b0
+ opentelemetry-util-http==0.48b0
+ orjson==3.10.7
+ overrides==7.7.0
+ pandas==2.2.2
+ pillow==10.4.0
+ pinecone-client==5.0.1
+ pinecone-plugin-inference==1.0.3
+ pinecone-plugin-interface==0.0.7
+ posthog==3.6.6
+ protobuf==4.25.5
+ protoc-gen-openapiv2==0.0.1
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.1
+ pydantic==2.9.0
+ pydantic-settings==2.4.0
+ pydantic_core==2.23.2
+ pymongo==3.11.0
+ PyMuPDF==1.24.10
+ PyMuPDFb==1.24.10
+ PyMySQL==1.1.1
+ pypdf==4.3.1
+ PyPDF2==3.0.1
+ PyPika==0.48.9
+ pyproject_hooks==1.1.0
+ pyreadline3==3.5.4
+ python-dotenv==1.0.1
+ python-multipart==0.0.9
+ pytz==2024.1
+ PyYAML==6.0.2
+ redis==5.0.8
+ regex==2024.7.24
+ requests==2.32.3
+ requests-oauthlib==2.0.0
+ rich==13.8.1
+ rsa==4.9
+ shellingham==1.5.4
+ sniffio==1.3.1
+ soupsieve==2.6
+ SQLAlchemy==2.0.34
+ sse-starlette==2.1.3
+ starlette==0.38.4
+ striprtf==0.0.26
+ sympy==1.13.3
+ tenacity==8.5.0
+ tiktoken==0.7.0
+ tokenizers==0.20.0
+ tqdm==4.66.5
+ typer==0.12.5
+ types-requests==2.32.0.20240914
+ typing-inspect==0.9.0
+ tzdata==2024.1
+ urllib3==2.2.2
+ uvicorn==0.30.6
+ watchfiles==0.24.0
+ websocket-client==1.8.0
+ websockets==13.0.1
+ wrapt==1.16.0
+ yarl==1.9.11
research/delete.ipynb ADDED
@@ -0,0 +1,110 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import os\n",
+     "work_directory = r\"D:\\Project Multimedika\\Projek 2\\fullstack_summarizer_and_bot_development\\backend\"\n",
+     "os.chdir(work_directory)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "'D:\\\\Project Multimedika\\\\Projek 2\\\\fullstack_summarizer_and_bot_development\\\\backend'"
+       ]
+      },
+      "execution_count": 2,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "%pwd"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "c:\\Users\\hamza\\anaconda3\\envs\\fullstack\\Lib\\site-packages\\pinecone\\data\\index.py:1: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+       "  from tqdm.autonotebook import tqdm\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "<class 'pinecone.core.openapi.data.model.query_response.QueryResponse'>\n",
+       "pinecone database deleted\n"
+      ]
+     }
+    ],
+    "source": [
+     "# pip install \"pinecone[grpc]\"\n",
+     "from pinecone.grpc import PineconeGRPC as Pinecone\n",
+     "from dotenv import load_dotenv\n",
+     "import os\n",
+     "import random\n",
+     "\n",
+     "load_dotenv()\n",
+     "\n",
+     "api_key = os.getenv(\"PINECONE_API_KEY\")\n",
+     "\n",
+     "pc = Pinecone(api_key=api_key)\n",
+     "index = pc.Index(\"summarizer-semantic-index\")\n",
+     "\n",
+     "random_vector = [random.uniform(0, 1) for _ in range(1536)]\n",
+     "results = index.query(\n",
+     "    vector=random_vector,\n",
+     "    top_k=10000,\n",
+     "    filter={\n",
+     "        \"category\": {\"$eq\": \"Artificial Intelligence\"},\n",
+     "    },\n",
+     ")\n",
+     "\n",
+     "ids = set()\n",
+     "print(type(results))\n",
+     "for result in results['matches']:\n",
+     "    ids.add(result['id'])\n",
+     "\n",
+     "index.delete(ids=ids)\n",
+     "print(\"pinecone database deleted\")"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "fullstack",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.9"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
research/summarizer.ipynb ADDED
@@ -0,0 +1,36 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from langchain.chains.summarize import load_summarize_chain\n",
+     "from langchain_community.document_loaders import WebBaseLoader\n",
+     "from langchain_openai import ChatOpenAI\n",
+     "\n",
+     "loader = WebBaseLoader(\"https://lilianweng.github.io/posts/2023-06-23-agent/\")\n",
+     "docs = loader.load()\n",
+     "\n",
+     "llm = ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo-1106\")\n",
+     "chain = load_summarize_chain(llm, chain_type=\"stuff\")\n",
+     "\n",
+     "chain.run(docs)"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "fullstack",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "name": "python",
+    "version": "3.11.9"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
research/test_mongodb.ipynb ADDED
File without changes
research/test_table.md ADDED
@@ -0,0 +1,6 @@
+ | Location    | Date       | Average Temperature (°C) | Maximum Temperature (°C) | Minimum Temperature (°C) |
+ |-------------|------------|--------------------------|--------------------------|--------------------------|
+ | New York    | 2024-09-20 | 22                       | 25                       | 19                       |
+ | Los Angeles | 2024-09-20 | 26                       | 29                       | 23                       |
+ | Chicago     | 2024-09-20 | 20                       | 23                       | 17                       |
+ | Miami       | 2024-09-20 | 28                       | 31                       | 25                       |
script/__init__.py ADDED
File without changes
script/build_vector.py ADDED
@@ -0,0 +1,84 @@
+ from llama_index.core import VectorStoreIndex
+ from llama_index.core import StorageContext
+ from pinecone import Pinecone, ServerlessSpec
+ from llama_index.llms.openai import OpenAI
+ from llama_index.vector_stores.pinecone import PineconeVectorStore
+ from fastapi import HTTPException, status
+ from config import PINECONE_CONFIG
+ import os
+ import json
+
+
+ class IndexManager:
+     def __init__(self):
+         self.vector_index = None
+         self.index_name = "summarizer-semantic-index"
+
+     def _get_pinecone_client(self):
+         """Initialize and return the Pinecone client."""
+         # api_key = os.getenv("PINECONE_API_KEY")
+         api_key = PINECONE_CONFIG.PINECONE_API_KEY
+         if not api_key:
+             raise ValueError(
+                 "Pinecone API key is missing. Please set it in environment variables."
+             )
+         return Pinecone(api_key=api_key)
+
+     def _create_pinecone_index(self, client):
+         """Create the Pinecone index if it doesn't already exist."""
+         if self.index_name not in client.list_indexes().names():
+             client.create_index(
+                 name=self.index_name,
+                 dimension=1536,
+                 metric="cosine",
+                 spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+             )
+         return client.Index(self.index_name)
+
+     def _initialize_vector_store(self, pinecone_index):
+         """Initialize and return the storage context backed by the Pinecone index."""
+         vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
+         return StorageContext.from_defaults(vector_store=vector_store)
+
+     def build_indexes(self, nodes):
+         """Build the vector index from nodes and persist it to Pinecone."""
+         try:
+             client = self._get_pinecone_client()
+             pinecone_index = self._create_pinecone_index(client)
+             storage_context = self._initialize_vector_store(pinecone_index)
+
+             self.vector_index = VectorStoreIndex(nodes, storage_context=storage_context)
+             self.vector_index.set_index_id("vector")
+
+             print(f"Vector Index ID: {self.vector_index.index_id}")
+             print("Vector Index created successfully.")
+
+             response = {
+                 "status": "success",
+                 "message": "Vector Index created successfully.",
+             }
+
+             return json.dumps(response)
+         except HTTPException as http_exc:
+             raise http_exc  # Re-raise HTTPExceptions so FastAPI handles them
+         except Exception as e:
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=f"Error building indexes: {str(e)}",
+             )
+
+     def load_existing_indexes(self):
+         """Load the existing vector index from Pinecone."""
+         try:
+             client = self._get_pinecone_client()
+             pinecone_index = client.Index(self.index_name)
+             print(pinecone_index.describe_index_stats())
+             vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
+             vector_index = VectorStoreIndex.from_vector_store(vector_store)
+
+             print("Existing Vector Index loaded successfully.")
+             return vector_index
+         except Exception as e:
+             print(f"Error loading existing indexes: {e}")
+             raise
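A brief usage sketch for IndexManager. The question text is illustrative, and it assumes the Pinecone and OpenAI credentials the class relies on are already configured:

from script.build_vector import IndexManager

index_manager = IndexManager()

# Reuse the existing "summarizer-semantic-index" rather than rebuilding it.
vector_index = index_manager.load_existing_indexes()

# Simple, unfiltered query over whatever has already been ingested.
query_engine = vector_index.as_query_engine(similarity_top_k=5)
print(query_engine.query("What does the book say about supervised learning?"))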
script/document_uploader.py ADDED
@@ -0,0 +1,115 @@
+ from llama_index.core.ingestion import IngestionPipeline
+ from llama_index.core.extractors import PydanticProgramExtractor
+ from llama_index.embeddings.openai import OpenAIEmbedding
+ from config import PINECONE_CONFIG
+ from pinecone.grpc import PineconeGRPC as Pinecone
+ from service.reader import Reader
+ from script.get_metadata import Metadata
+ from fastapi import UploadFile, HTTPException, status
+
+ from llama_index.core.node_parser import (
+     SentenceSplitter,
+     SemanticSplitterNodeParser,
+ )
+
+ # from script.get_topic import extract_topic
+
+ import logging
+ import random
+
+
+ class Uploader:
+     # def __init__(self, reference, file: UploadFile, content_table: UploadFile):
+     def __init__(self, reference, file: UploadFile):
+         self.file = file
+         # self.content_table = content_table
+         self.reader = Reader()
+         self.reference = reference
+         self.metadata = Metadata(reference)
+
+     async def ingest_documents(self, file: UploadFile):
+         """Load documents from the uploaded file."""
+         documents = await self.reader.read_from_uploadfile(file)
+         print("document successfully ingested")
+
+         return documents
+
+     def check_existing_metadata(self, pinecone_index, title, random_vector):
+         try:
+             result = pinecone_index.query(
+                 vector=random_vector,
+                 top_k=1,
+                 filter={
+                     "title": {"$eq": title},
+                 },
+             )
+             return result["matches"]
+         except Exception as e:
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=f"Error checking existing metadata: {str(e)}",
+             )
+
+     async def process_documents(self):
+         # Ingest documents
+         documents = await self.ingest_documents(self.file)
+
+         # topic_extractor = extract_topic(self.reference, self.content_table)
+
+         embed_model = OpenAIEmbedding()
+
+         # Attach the reference metadata (title, author, category, ...) to every document
+         documents_with_metadata = self.metadata.apply_metadata(documents)
+
+         # document_filtered = self.filter_document(documents_with_metadata)
+
+         # Alternative: run the same splitter through an ingestion pipeline
+         # pipeline = IngestionPipeline(
+         #     transformations=[
+         #         SemanticSplitterNodeParser(
+         #             buffer_size=1,
+         #             breakpoint_percentile_threshold=95,
+         #             embed_model=embed_model,
+         #         ),
+         #         # topic_extractor,
+         #     ]
+         # )
+
+         splitter = SemanticSplitterNodeParser(
+             buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
+         )
+
+         # Split the documents into nodes
+         try:
+             # nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
+             nodes_with_metadata = splitter.get_nodes_from_documents(documents_with_metadata)
+             print("Pipeline processing completed.")
+             return nodes_with_metadata
+         except Exception as e:
+             # Log the error and raise HTTPException for FastAPI
+             logging.error(f"An error occurred in making pipeline: {e}")
+             raise HTTPException(
+                 status_code=500,
+                 detail="An internal server error occurred making pipeline.",
+             )
+
+     def filter_document(self, documents):
+         api_key = PINECONE_CONFIG.PINECONE_API_KEY
+         client = Pinecone(api_key=api_key)
+         pinecone_index = client.Index("test")
+
+         random_vector = [random.uniform(0, 1) for _ in range(1536)]
+
+         # Keep only documents whose title is not already present in the index
+         filtered_documents = []
+         for doc in documents:
+             result = self.check_existing_metadata(
+                 pinecone_index, doc.metadata["title"], random_vector
+             )
+
+             if len(result) == 0:
+                 filtered_documents.append(doc)
+
+         return filtered_documents
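A hedged sketch of how Uploader and IndexManager appear to fit together in an upload endpoint. The route path and reference payload are illustrative, and `Reader.read_from_uploadfile` is assumed to exist as imported above:

from fastapi import FastAPI, UploadFile
from script.document_uploader import Uploader
from script.build_vector import IndexManager

app = FastAPI()


@app.post("/upload")  # hypothetical route
async def upload_book(file: UploadFile):
    reference = {
        "title": "Example Book",
        "author": "Jane Doe",
        "category": "Artificial Intelligence",
        "year": 2024,
        "publisher": "Example Press",
    }
    uploader = Uploader(reference, file)
    nodes = await uploader.process_documents()  # read, tag metadata, split into nodes
    return IndexManager().build_indexes(nodes)  # embed and store the nodes in Pinecone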
script/get_metadata.py ADDED
@@ -0,0 +1,40 @@
+ # Get reference
+
+
+ class Metadata:
+     def __init__(self, reference):
+         self.title = reference["title"]
+         self.author = reference["author"]
+         self.category = reference["category"]
+         self.year = reference["year"]
+         self.publisher = reference["publisher"]
+
+     def add_metadata(self, documents, metadata):
+         """Add metadata to each item (document or node)."""
+         for document in documents:
+             if not hasattr(document, "metadata") or document.metadata is None:
+                 document.metadata = {}
+             document.metadata.update(metadata)
+         print("metadata is added")
+         # self.logger.log_action(f"Metadata added to document {item.id_}", action_type="METADATA")
+
+         return documents
+
+     def _generate_metadata(self):
+         """Generate metadata and return it."""
+         metadata = {
+             "title": self.title,
+             "author": self.author,
+             "category": self.category,
+             "year": self.year,
+             "publisher": self.publisher,
+             "reference": f"{self.author}. ({self.year}). *{self.title}*. {self.publisher}.",  # APA style reference
+         }
+         print("metadata is generated")
+         return metadata
+
+     def apply_metadata(self, documents):
+         """Apply generated metadata to documents."""
+         metadata = self._generate_metadata()
+         print("metadata is applied")
+         return self.add_metadata(documents, metadata)
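A short usage example for the Metadata helper; the reference values and Document construction are illustrative:

from llama_index.core import Document
from script.get_metadata import Metadata

reference = {
    "title": "Example Book",
    "author": "Jane Doe",
    "category": "Artificial Intelligence",
    "year": 2024,
    "publisher": "Example Press",
}
docs = [Document(text="Chapter 1 ..."), Document(text="Chapter 2 ...")]

# Every document now carries the title/category fields used later for filtering,
# plus an APA-style "reference" string.
tagged = Metadata(reference).apply_metadata(docs)
print(tagged[0].metadata["reference"])  # Jane Doe. (2024). *Example Book*. Example Press.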
script/get_topic.py ADDED
@@ -0,0 +1,82 @@
+ import nest_asyncio
+ import os
+ from dotenv import load_dotenv
+ from jinja2 import Template
+ from pydantic import BaseModel, Field
+ from pymongo.mongo_client import MongoClient
+
+ from llama_index.program.openai import OpenAIPydanticProgram
+ from llama_index.core.extractors import PydanticProgramExtractor
+ from llama_index.llms.openai import OpenAI
+
+ from core.prompt import ADD_METADATA_TEMPLATE
+ from core.summarization.summarizer import SummarizeGenerator
+
+ nest_asyncio.apply()
+
+ load_dotenv()
+
+
+ class NodeMetadata(BaseModel):
+     """Metadata for nodes, capturing topic and subtopic from the book."""
+
+     topic: str = Field(
+         ...,
+         description="The main subject or category that the node is associated with, representing a broad theme within the book.",
+     )
+     subtopic: str = Field(
+         ...,
+         description="A more specific aspect or section under the main topic, refining the context of the node within the book.",
+     )
+
+
+ def extract_topic(references, content_table):
+     uri = os.getenv("MONGO_URI")
+     client = MongoClient(uri)
+
+     try:
+         client.admin.command("ping")
+         print("Pinged your deployment. You successfully connected to MongoDB!")
+     except Exception as e:
+         print(e)
+
+     # Access a specific database
+     db = client["summarizer"]
+
+     # Access a collection within the database
+     collection = db["topic_collection"]
+
+     generate_content_table = SummarizeGenerator(references)
+     extractor_output, extractor_dics = generate_content_table.extract_content_table(content_table)
+     print(extractor_output)
+
+     data_to_insert = {
+         "title": references["title"],
+         **extractor_dics,  # Unpack the extracted topic/subtopic dictionary
+     }
+
+     collection.insert_one(data_to_insert)
+
+     add_metadata_template = str(
+         Template(ADD_METADATA_TEMPLATE).render(extractor_output=extractor_output)
+     )
+
+     print("add metadata template : ", add_metadata_template)
+
+     llm = OpenAI(temperature=0.1, model="gpt-4o-mini")
+
+     openai_program = OpenAIPydanticProgram.from_defaults(
+         output_cls=NodeMetadata,
+         prompt_template_str="{input}",
+         extract_template_str=add_metadata_template,
+         llm=llm,
+     )
+
+     topic_extractor = PydanticProgramExtractor(
+         program=openai_program,
+         input_key="input",
+         show_progress=True,
+         extract_template_str=add_metadata_template,
+         llm=llm,
+     )
+
+     return topic_extractor
service/__init__.py ADDED
File without changes
service/aws_loader.py ADDED
@@ -0,0 +1,67 @@
+ import os
+ import boto3
+ import tempfile
+ import fitz
+ from io import BytesIO
+
+ from fastapi import HTTPException
+
+
+ class Loader:
+     def __init__(self):
+         # Create an S3 client with credentials from the environment
+         self.bucket_name = "multimedika"
+         self.s3_client = boto3.client(
+             "s3",
+             aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
+             aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
+             region_name="us-west-2",
+         )
+
+     def upload_to_s3(self, file, object_name, folder_name="summarizer"):
+         try:
+             # If folder_name is provided, prepend it to the object_name
+             if folder_name:
+                 object_name = f"{folder_name}/{object_name}"
+
+             # Create an in-memory file-like object
+             with BytesIO() as file_stream:
+                 # Write the contents of the uploaded file to the stream
+                 file_stream.write(file.file.read())
+                 file_stream.seek(0)  # Move to the beginning of the stream
+
+                 # Upload the file to S3
+                 self.s3_client.upload_fileobj(file_stream, self.bucket_name, object_name)
+
+             print(f"File '{object_name}' successfully uploaded to bucket '{self.bucket_name}'.")
+         except Exception as e:
+             raise HTTPException(status_code=400, detail=f"Error uploading to AWS: {e}")
+
+     def get_file_aws(self, object_name, local_file_name=None):
+         """Downloads a PDF file from S3 and reads it using PyMuPDF."""
+         if local_file_name is None:
+             local_file_name = "downloaded_pdf_file.pdf"  # Default file name
+
+         try:
+             # Create a temporary directory to store the file
+             temp_dir = tempfile.mkdtemp()
+             file_path = os.path.join(temp_dir, local_file_name)
+             # Download the file from S3
+             with open(file_path, "wb") as temp_file:
+                 self.s3_client.download_fileobj(
+                     self.bucket_name, object_name, temp_file
+                 )
+             # Open and read the PDF using PyMuPDF
+             doc = fitz.open(file_path)
+             # Example: Print the number of pages
+             print(f"Number of pages: {doc.page_count}")
+             # Do something with the PDF, like read text
+             for page in doc:
+                 print(page.get_text())
+             # Close the document
+             doc.close()
+             # Clean up the downloaded file if needed
+             os.remove(file_path)
+
+         except Exception as e:
+             raise HTTPException(status_code=400, detail=f"Error getting file from AWS: {e}")
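A hedged usage sketch for the Loader above inside a FastAPI route. The route path is illustrative, and it assumes the AWS credentials referenced by the class are present in the environment:

from fastapi import FastAPI, UploadFile
from service.aws_loader import Loader

app = FastAPI()
loader = Loader()


@app.post("/upload-pdf")  # hypothetical route
async def upload_pdf(file: UploadFile):
    # Store the raw PDF under summarizer/<filename> in the "multimedika" bucket.
    loader.upload_to_s3(file, file.filename)
    return {"status": "uploaded", "object": f"summarizer/{file.filename}"}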