dsmultimedika committed
Commit 9002555
1 Parent(s): b3ea108

Build Application

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
.gitignore ADDED
@@ -0,0 +1,400 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore

# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates

# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs

# Mono auto generated files
mono_crash.*

# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Ww][Ii][Nn]32/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/

# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/

# Visual Studio 2017 auto generated files
Generated\ Files/

# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*

# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml

# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c

# Benchmark Results
BenchmarkDotNet.Artifacts/

# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/

# ASP.NET Scaffolding
ScaffoldingReadMe.txt

# StyleCop
StyleCopReport.xml

# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.log
*.tlog
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc

# Chutzpah Test files
_Chutzpah*

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb

# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap

# Visual Studio Trace Files
*.e2e

# TFS 2012 Local Workspace
$tf/

# Guidance Automation Toolkit
*.gpState

# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user

# TeamCity is a build add-in
_TeamCity*

# DotCover is a Code Coverage Tool
*.dotCover

# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json

# Coverlet is a free, cross platform Code Coverage Tool
coverage*.json
coverage*.xml
coverage*.info

# Visual Studio code coverage results
*.coverage
*.coveragexml

# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*

# MightyMoose
*.mm.*
AutoTest.Net/

# Web workbench (sass)
.sass-cache/

# Installshield output folder
[Ee]xpress/

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish/

# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj

# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/

# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets

# Microsoft Azure Build Output
csx/
*.build.csdef

# Microsoft Azure Emulator
ecf/
rcf/

# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload

# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/

# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs

# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk

# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/

# RIA/Silverlight projects
Generated_Code/

# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak

# SQL Server files
*.mdf
*.ldf
*.ndf

# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl

# Microsoft Fakes
FakesAssemblies/

# GhostDoc plugin setting file
*.GhostDoc.xml

# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/

# Visual Studio 6 build log
*.plg

# Visual Studio 6 workspace options file
*.opt

# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw

# Visual Studio 6 auto-generated project file (contains which files were open etc.)
*.vbp

# Visual Studio 6 workspace and project file (working project files containing files to include in project)
*.dsw
*.dsp

# Visual Studio 6 technical files
*.ncb
*.aps

# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions

# Paket dependency manager
.paket/paket.exe
paket-files/

# FAKE - F# Make
.fake/

# CodeRush personal settings
.cr/personal

# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc

# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config

# Tabs Studio
*.tss

# Telerik's JustMock configuration file
*.jmconfig

# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs

# OpenCover UI analysis results
OpenCover/

# Azure Stream Analytics local run output
ASALocalRun/

# MSBuild Binary and Structured Log
*.binlog

# NVidia Nsight GPU debugger configuration file
*.nvuser

# MFractors (Xamarin productivity tool) working folder
.mfractor/

# Local History for Visual Studio
.localhistory/

# Visual Studio History (VSHistory) files
.vshistory/

# BeatPulse healthcheck temp database
healthchecksdb

# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/

# Ionide (cross platform F# VS Code tools) working folder
.ionide/

# Fody - auto-generated XML schema
FodyWeavers.xsd

# VS Code files for those working on multiple tools
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace

# Local History for Visual Studio Code
.history/

# Windows Installer files from build outputs
*.cab
*.msi
*.msix
*.msm
*.msp

# JetBrains Rider
*.sln.iml

.env
Dockerfile ADDED
@@ -0,0 +1,14 @@
# Use the official Python image from Docker Hub
FROM python:3.11

RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
api/__init__.py ADDED
File without changes
api/auth.py ADDED
File without changes
api/events.py ADDED
@@ -0,0 +1,26 @@
from fastapi import FastAPI
from api.router.topic import db_conn
from llama_index.core import set_global_handler
import os
from dotenv import load_dotenv


load_dotenv()


async def startup() -> None:
    await db_conn.connect()
    os.environ["LANGFUSE_SECRET_KEY"] = os.getenv("LANGFUSE_SECRET_KEY")
    os.environ["LANGFUSE_PUBLIC_KEY"] = os.getenv("LANGFUSE_PUBLIC_KEY")
    os.environ["LANGFUSE_HOST"] = os.getenv("LANGFUSE_HOST")
    set_global_handler("langfuse")


async def shutdown() -> None:
    await db_conn.disconnect()


def register_events(app: FastAPI) -> FastAPI:
    app.add_event_handler("startup", startup)
    app.add_event_handler("shutdown", shutdown)
    return app
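For reference, the same startup/shutdown wiring could also be expressed with FastAPI's lifespan context manager. A minimal sketch, not part of this commit, reusing the startup and shutdown coroutines above:

from contextlib import asynccontextmanager
from fastapi import FastAPI
from api.events import startup, shutdown  # handlers defined in this commit

@asynccontextmanager
async def lifespan(app: FastAPI):
    await startup()   # connect the DB and configure Langfuse
    yield
    await shutdown()  # close the DB connection

app = FastAPI(lifespan=lifespan)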
api/function.py ADDED
@@ -0,0 +1,239 @@
from script.build_vector import IndexManager
from script.document_uploader import Uploader
from db.save_data import InsertDatabase
from db.get_data import GetDatabase
from db.delete_data import DeleteDatabase
from db.update_data import UpdateDatabase
from typing import Any
from fastapi import UploadFile
from fastapi import HTTPException
from core.chat.engine import Engine
from core.parser import clean_text, update_response, renumber_sources, seperate_to_list
from llama_index.core.composability import QASummaryQueryEngineBuilder
from service.dto import BotResponseStreaming, TestStreaming
from service.aws_loader import Loader

import logging
import re


# Configure logging
logging.basicConfig(level=logging.INFO)


# async def data_ingestion(
#     db_conn, reference, file: UploadFile, content_table: UploadFile
# ) -> Any:

async def data_ingestion(db_conn, reference, file: UploadFile) -> Any:

    insert_database = InsertDatabase(db_conn)

    file_name = f"{reference['title']}.pdf"
    aws_loader = Loader()

    file_obj = file
    aws_loader.upload_to_s3(file_obj, file_name)

    print("Uploaded Success")

    try:
        # Insert data into the database
        await insert_database.insert_data(reference)

        # uploader = Uploader(reference, file, content_table)
        uploader = Uploader(reference, file)
        print("uploader : ", uploader)

        nodes_with_metadata = await uploader.process_documents()

        # Build indexes using IndexManager
        index = IndexManager()
        response = index.build_indexes(nodes_with_metadata)

        return response

    except Exception as e:
        # Log the error and raise HTTPException for FastAPI
        logging.error(f"An error occurred in data ingestion: {e}")
        raise HTTPException(
            status_code=500,
            detail="An internal server error occurred in data ingestion.",
        )


async def get_data(db_conn, title="", fetch_all_data=True):
    get_database = GetDatabase(db_conn)
    print(get_database)
    try:
        if fetch_all_data:
            results = await get_database.get_all_data()
            print(results)
            logging.info("Database fetched all data")
            return results
        else:
            results = await get_database.get_data(title)
            logging.info("Database fetched one data")
            return results

    except Exception as e:
        # Log the error and raise HTTPException for FastAPI
        logging.error(f"An error occurred in get data: {e}")
        raise HTTPException(
            status_code=500, detail="An internal server error occurred in get data."
        )


async def update_data(id: int, reference, db_conn):
    update_database = UpdateDatabase(db_conn)
    try:
        reference = reference.model_dump()
        print(reference)
        reference.update({"id": id})
        print(reference)
        await update_database.update_record(reference)
        response = {"status": "Update Success"}
        return response
    except Exception as e:
        # Log the error and raise HTTPException for FastAPI
        logging.error(f"An error occurred in update data: {e}")
        raise HTTPException(
            status_code=500, detail="An internal server error occurred in update data."
        )


async def delete_data(id: int, db_conn):
    delete_database = DeleteDatabase(db_conn)
    try:
        params = {"id": id}
        await delete_database.delete_record(params)
        response = {"status": "Delete Success"}
        return response
    except Exception as e:
        # Log the error and raise HTTPException for FastAPI
        logging.error(f"An error occurred in delete data: {e}")
        raise HTTPException(
            status_code=500, detail="An internal server error occurred in delete data."
        )


def generate_completion_non_streaming(user_request, chat_engine):
    try:
        engine = Engine()
        index_manager = IndexManager()

        # Load existing indexes
        index = index_manager.load_existing_indexes()

        # Retrieve the chat engine with the loaded index
        chat_engine = engine.get_chat_engine(index)

        # Generate completion response
        response = chat_engine.chat(user_request)

        sources = response.sources

        number_reference = list(set(re.findall(r"\[(\d+)\]", str(response))))
        number_reference_sorted = sorted(number_reference, key=int)

        contents = []
        raw_contents = []
        metadata_collection = []
        scores = []

        if number_reference_sorted:
            for number in number_reference_sorted:
                # Convert number to an integer so it can be used as an index
                number = int(number)

                # Make sure sources is not empty and has the required elements
                if sources and len(sources) > 0:
                    node = dict(sources[0])["raw_output"].source_nodes

                    # Make sure number is a valid index
                    if 0 <= number - 1 < len(node):
                        print(node[number - 1].node.get_text())
                        raw_content = seperate_to_list(
                            node[number - 1].node.get_text()
                        )
                        raw_contents.append(raw_content)

                        content = clean_text(node[number - 1].node.get_text())
                        contents.append(content)

                        metadata = dict(node[number - 1].node.metadata)
                        metadata_collection.append(metadata)

                        score = node[number - 1].score
                        scores.append(score)
                    else:
                        print(f"Invalid reference number: {number}")
                else:
                    print("No sources available")
        else:
            print("There are no references")

        response = update_response(str(response))
        contents = renumber_sources(contents)

        # Check the lengths of content and metadata
        num_content = len(contents)
        num_metadata = len(metadata_collection)

        # Add content to metadata
        for i in range(min(num_content, num_metadata)):
            metadata_collection[i]["content"] = re.sub(r"source \d+\:", "", contents[i])

        return str(response), raw_contents, contents, metadata_collection, scores
    except Exception as e:
        # Log the error and raise HTTPException for FastAPI
        logging.error(f"An error occurred in generate text: {e}")
        raise HTTPException(
            status_code=500,
            detail="An internal server error occurred in generate text.",
        )


async def generate_streaming_completion(user_request, chat_engine):
    try:
        engine = Engine()
        index_manager = IndexManager()

        # Load existing indexes
        index = index_manager.load_existing_indexes()

        # Retrieve the chat engine with the loaded index
        chat_engine = engine.get_chat_engine(index)
        # Generate completion response
        response = chat_engine.stream_chat(user_request)

        completed_response = ""

        for gen in response.response_gen:
            completed_response += gen  # Concatenate the new string
            yield BotResponseStreaming(
                content=gen, completed_content=completed_response
            )

        nodes = response.source_nodes
        for node in nodes:
            reference = str(clean_text(node.node.get_text()))
            metadata = dict(node.node.metadata)
            score = float(node.score)
            yield BotResponseStreaming(
                completed_content=completed_response,
                reference=reference,
                metadata=metadata,
                score=score,
            )
    except Exception as e:
        # Streaming responses cannot raise HTTPException mid-stream, so surface the error as an event
        yield {"error": str(e)}
api/router/__init__.py ADDED
File without changes
api/router/bot.py ADDED
@@ -0,0 +1,47 @@
from fastapi import APIRouter
from service.dto import UserPromptRequest, BotResponse

from api.function import (
    generate_streaming_completion,
    generate_completion_non_streaming,
)
from sse_starlette.sse import EventSourceResponse

router = APIRouter(tags=["Bot"])


@router.post("/bot")
async def bot_generator_general(user_prompt_request: UserPromptRequest):

    if user_prompt_request.streaming:
        return EventSourceResponse(
            generate_streaming_completion(
                user_prompt_request.prompt, user_prompt_request.streaming
            )
        )
    else:
        response, raw_references, references, metadata, scores = (
            generate_completion_non_streaming(
                user_prompt_request.prompt, user_prompt_request.streaming
            )
        )

        return BotResponse(
            content=response,
            raw_references=raw_references,
            references=references,
            metadata=metadata,
            scores=scores,
        )


@router.post("/bot/{category_id}/{title}")
async def bot_generator_spesific(
    category_id: int, title: str, user_prompt_request: UserPromptRequest
):
    pass


@router.get("/bot/{category_id}/{title}")
async def get_favourite_data(category_id: int, title: str, human_template):
    pass
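A minimal sketch of exercising the non-streaming path of POST /bot from a client. This is not part of the commit; it assumes the API runs locally on port 7860 (as in the Dockerfile) and that UserPromptRequest carries only the prompt and streaming fields used above.

import requests

payload = {"prompt": "What is hypertension?", "streaming": False}
resp = requests.post("http://localhost:7860/bot", json=payload)
resp.raise_for_status()
body = resp.json()
print(body["content"])     # bot answer with [n] citation markers preserved
print(body["references"])  # renumbered source excerpts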
api/router/health.py ADDED
@@ -0,0 +1,10 @@
from fastapi import Request
from fastapi.responses import JSONResponse
from fastapi.routing import APIRouter

router = APIRouter(tags=["Health"])


@router.get("/_health")
async def health(request: Request):
    return JSONResponse(dict(status="OK"), status_code=200)
api/router/role.py ADDED
@@ -0,0 +1,20 @@
from fastapi import APIRouter


router = APIRouter(tags=["Roles"])


@router.get("/roles")
async def get_data_roles():
    pass


@router.post("/roles")
async def add_data_roles():
    pass


@router.put("/roles/{id}")
async def update_data_roles():
    pass


@router.delete("/roles/{id}")
async def remove_data_roles():
    pass
api/router/topic.py ADDED
@@ -0,0 +1,50 @@
from fastapi import Form, APIRouter, File, UploadFile, HTTPException, Request
from db.repository import get_db_conn
from config import MYSQL_CONFIG
from api.function import data_ingestion, get_data, delete_data, update_data
from service.dto import MetadataRequest

router = APIRouter(tags=["Topics"])

db_conn = get_db_conn(MYSQL_CONFIG)


@router.post("/topic")
async def upload_file(
    title: str = Form(...),
    author: str = Form(...),
    category: str = Form(...),
    year: int = Form(...),
    publisher: str = Form(...),
    file: UploadFile = File(...),
    # content_table: UploadFile = File(...)
):

    reference = {
        "title": title,
        "author": author,
        "category": category,
        "year": year,
        "publisher": publisher,
    }

    # response = await data_ingestion(db_conn, reference, file, content_table)
    response = await data_ingestion(db_conn, reference, file)
    return {"filename": file.filename, "response": response}


@router.get("/topic")
async def get_metadata():
    results = await get_data(db_conn)
    return results


@router.put("/topic/{id}")
async def update_metadata(id: int, reference: MetadataRequest):
    response = await update_data(id, reference, db_conn)
    return response


@router.delete("/topic/{id}")
async def delete_metadata(id: int):
    response = await delete_data(id, db_conn)
    return response
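A minimal sketch of uploading a document through POST /topic. Not part of the commit; it assumes the service is reachable on localhost:7860 and that a local PDF named book.pdf exists. The form field names and the file part match the Form(...) and File(...) parameters of upload_file above; the metadata values are hypothetical.

import requests

form_fields = {
    "title": "Internal Medicine Handbook",  # hypothetical metadata
    "author": "Jane Doe",
    "category": "Medicine",
    "year": "2023",
    "publisher": "Example Press",
}
with open("book.pdf", "rb") as pdf:
    resp = requests.post(
        "http://localhost:7860/topic",
        data=form_fields,
        files={"file": ("book.pdf", pdf, "application/pdf")},
    )
print(resp.json())  # {"filename": "book.pdf", "response": ...}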
api/router/trial.py ADDED
@@ -0,0 +1,20 @@
from fastapi import APIRouter


router = APIRouter(tags=["Trial"])


@router.get("/trial")
async def get_trial_data():
    pass


@router.post("/trial")
async def add_trial_data():
    pass


@router.put("/trial/{id}")
async def update_trial_data():
    pass


@router.delete("/trial/{id}")
async def remove_trial_data():
    pass
api/router/user.py ADDED
@@ -0,0 +1,20 @@
from fastapi import APIRouter


router = APIRouter(tags=["User"])


@router.post("/login")
async def login_user():
    pass


@router.post("/register")
async def register_user():
    pass


@router.post("/forgot_password")
async def forget_password():
    pass


@router.post("/change_password")
async def change_password():
    pass
api/util/util.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,41 @@
from fastapi.applications import FastAPI
from api.router import health, topic, user, bot, trial, role
from fastapi.middleware.cors import CORSMiddleware
from api.events import register_events
from utils.utils import pipe


def create_instance() -> FastAPI:
    return FastAPI()


def add_middleware(app: FastAPI) -> FastAPI:
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    return app


def init_database(app: FastAPI) -> FastAPI:
    return app


def register_routers(app: FastAPI) -> FastAPI:
    app.include_router(user.router)
    app.include_router(topic.router)
    app.include_router(bot.router)
    app.include_router(trial.router)
    app.include_router(role.router)
    app.include_router(health.router)

    return app


def init_app() -> FastAPI:
    app: FastAPI = pipe(
        create_instance(), add_middleware, init_database, register_events, register_routers
    )
    return app


app = init_app()
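utils/utils.py is not among the 50 files shown in this view, so the pipe helper imported above is not visible here. A minimal sketch of what such a left-to-right composition helper typically looks like — an assumption, not the committed implementation:

from functools import reduce

def pipe(value, *functions):
    """Thread `value` through `functions` left to right: pipe(x, f, g) == g(f(x))."""
    return reduce(lambda acc, fn: fn(acc), functions, value)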
config.py ADDED
@@ -0,0 +1,37 @@
from pydantic_settings import BaseSettings
import os


class MysqlConfig(BaseSettings):
    DB_HOST: str = ""
    DB_PORT: str = "10707"  # Port of the hosted MySQL instance (stock MySQL default is 3306)
    DB_URI: str = ""
    DB_USERNAME: str = ""
    DB_PASSWORD: str = ""
    DB_NAME: str = ""

    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"
        extra = "allow"  # Allow extra fields


class PineconeConfig(BaseSettings):
    PINECONE_API_KEY: str = ""

    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"
        extra = "allow"  # Allow extra fields


class GPTBotConfig(BaseSettings):
    temperature: float = 0.3
    model: str = "gpt-4o-mini"
    max_tokens: int = 512
    streaming: bool = False
    api_key: str = os.environ.get("OPENAI_API_KEY", "")


# Load configuration
MYSQL_CONFIG = MysqlConfig()
PINECONE_CONFIG = PineconeConfig()
GPTBOT_CONFIG = GPTBotConfig()
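The settings classes above read their values from a .env file or the process environment; engine.py and api/events.py additionally expect REDIS_PASSWORD and the LANGFUSE_* keys. A minimal sketch of wiring these up for local development — variable names come from the fields above, values are placeholders:

import os

# Hypothetical local-development values only.
os.environ.setdefault("DB_HOST", "localhost")
os.environ.setdefault("DB_PORT", "3306")
os.environ.setdefault("DB_USERNAME", "app")
os.environ.setdefault("DB_PASSWORD", "change-me")
os.environ.setdefault("DB_NAME", "medbot")
os.environ.setdefault("PINECONE_API_KEY", "pc-...")
os.environ.setdefault("OPENAI_API_KEY", "sk-...")

from config import MYSQL_CONFIG, GPTBOT_CONFIG  # values are picked up at instantiation
print(MYSQL_CONFIG.DB_HOST, GPTBOT_CONFIG.model)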
core/__init__.py ADDED
File without changes
core/book_enabler/__init__.py ADDED
File without changes
core/chat/__init__.py ADDED
File without changes
core/chat/engine.py ADDED
@@ -0,0 +1,130 @@
from typing import Any
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterCondition,
)

from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.agent.openai import OpenAIAgent
from llama_index.llms.openai import OpenAI
from llama_index.storage.chat_store.redis import RedisChatStore
from llama_index.core.storage.chat_store import SimpleChatStore
from llama_index.core.query_engine import CitationQueryEngine
from llama_index.core import Settings

from config import GPTBOT_CONFIG
from core.prompt import SYSTEM_BOT_TEMPLATE
import redis
import os


class Engine:
    def __init__(self):
        self.llm = OpenAI(
            temperature=GPTBOT_CONFIG.temperature,
            model=GPTBOT_CONFIG.model,
            max_tokens=GPTBOT_CONFIG.max_tokens,
            api_key=GPTBOT_CONFIG.api_key,
        )

        Settings.llm = self.llm

    def initialize_memory_bot(self, user_id="1"):
        redis_client = redis.Redis(
            host="redis-10365.c244.us-east-1-2.ec2.redns.redis-cloud.com",
            port=10365,
            password=os.environ.get("REDIS_PASSWORD"),
        )
        # chat_store = SimpleChatStore()
        chat_store = RedisChatStore(redis_client=redis_client, ttl=3600)  # Needs to be configured
        memory = ChatMemoryBuffer.from_defaults(
            token_limit=3000, chat_store=chat_store, chat_store_key=user_id
        )

        return memory

    def _build_description_bot(self, title, category):
        try:
            prompt = f"Write a detailed description for an OpenAI agent with the title '{title}' and categorized under '{category}'."
            description = self.llm.complete(prompt)

            return description

        except Exception as e:
            return f"Error generating description: {str(e)}"

    def index_to_query_engine(self, title, category, index):
        filters = MetadataFilters(
            filters=[
                MetadataFilter(key="title", value=title),
                MetadataFilter(key="category", value=category),
            ],
            condition=FilterCondition.AND,
        )

        # Create the query engine with the index and filters
        kwargs = {"similarity_top_k": 5, "filters": filters}

        query_engine = index.as_query_engine(**kwargs)

        return query_engine

    def get_citation_engine(self, title, category, index):
        filters = MetadataFilters(
            filters=[
                MetadataFilter(key="title", value=title),
                MetadataFilter(key="category", value=category),
            ],
            condition=FilterCondition.AND,
        )

        # Create the citation engine from a filtered retriever
        kwargs = {"similarity_top_k": 5, "filters": filters}

        retriever = index.as_retriever(**kwargs)

        citation_engine = CitationQueryEngine(retriever=retriever)

        return citation_engine

    def get_chat_engine(self, index, title=None, category=None, type="general"):
        # Build the citation engine and tool description based on the requested type
        if type == "general":
            # query_engine = index.as_query_engine(similarity_top_k=3)
            citation_engine = CitationQueryEngine.from_args(index, similarity_top_k=5)
            description = "A book containing information about medicine"
        else:
            query_engine = self.index_to_query_engine(title, category, index)
            citation_engine = self.get_citation_engine(title, category, index)
            description = self._build_description_bot(title, category)

        metadata = ToolMetadata(
            name="bot-belajar",
            description=description
        )
        print(metadata)

        vector_query_engine = QueryEngineTool(
            query_engine=citation_engine,
            metadata=metadata
        )
        print(vector_query_engine)

        # Initialize the OpenAI agent with the tools
        chat_engine = OpenAIAgent.from_tools(
            tools=[vector_query_engine],
            llm=self.llm,
            memory=self.initialize_memory_bot(),
            system_prompt=SYSTEM_BOT_TEMPLATE,
        )

        return chat_engine

    def get_chat_history(self):
        pass
core/chat/messaging.py ADDED
@@ -0,0 +1,63 @@
# Experimental

from typing import Dict, Any, Optional, List
import asyncio
import logging
from uuid import uuid4
from anyio import ClosedResourceError
from anyio.streams.memory import MemoryObjectSendStream

from llama_index.core.callbacks.base import BaseCallbackHandler, CallbackManager
from llama_index.core.callbacks import CBEventType, EventPayload
from llama_index.core.query_engine.sub_question_query_engine import (
    SubQuestionAnswerPair,
)
from llama_index.core.chat_engine.types import StreamingAgentChatResponse
from pydantic import BaseModel

from core.chat import schema

from db.db import MessageSubProcessSourceEnum
from core.chat.schema import SubProcessMetadataKeysEnum, SubProcessMetadataMap
from core.chat.engine import Engine
from script.build_vector import IndexManager
from service.dto import UserPromptRequest

logger = logging.getLogger(__name__)


class StreamedMessage(BaseModel):
    content: str


async def handle_chat_message(
    user_message: str,
    send_chan: MemoryObjectSendStream,
) -> None:
    async with send_chan:
        engine = Engine()

        index_manager = IndexManager()
        index = index_manager.load_existing_indexes()

        # Retrieve the chat engine with the loaded index (get_chat_engine is synchronous)
        chat_engine = engine.get_chat_engine(index)

        logger.debug("Engine received")
        streaming_chat_response: StreamingAgentChatResponse = (
            await chat_engine.astream_chat(user_message)
        )
        response_str = ""
        async for text in streaming_chat_response.async_response_gen():
            response_str += text
            if send_chan._closed:
                logger.debug(
                    "Received streamed token after send channel closed. Ignoring."
                )
                return
            await send_chan.send(StreamedMessage(content=response_str))

        if response_str.strip() == "":
            await send_chan.send(
                StreamedMessage(
                    content="Sorry, I either wasn't able to understand your question or I don't have an answer for it."
                )
            )
core/chat/schema.py ADDED
@@ -0,0 +1,162 @@
# Experimental

from pydantic import BaseModel, Field, field_validator
from typing import List, Optional, Dict, Union, Any
from enum import Enum
from uuid import UUID
from datetime import datetime
from llama_index.core.schema import BaseNode, NodeWithScore
from llama_index.core.callbacks.schema import EventPayload
from llama_index.core.query_engine.sub_question_query_engine import SubQuestionAnswerPair
from db.db import (
    MessageRoleEnum,
    MessageStatusEnum,
    MessageSubProcessSourceEnum,
    MessageSubProcessStatusEnum,
)

DB_DOC_ID_KEY = "db_document_id"


class Base(BaseModel):
    id: Optional[UUID] = Field(None, description="Unique identifier")
    created_at: Optional[datetime] = Field(None, description="Creation datetime")
    updated_at: Optional[datetime] = Field(None, description="Update datetime")

    class Config:
        orm_mode = True


class BaseMetadataObject(BaseModel):
    class Config:
        orm_mode = True


class Citation(BaseMetadataObject):
    document_id: UUID
    text: str
    page_number: int
    score: Optional[float]

    @field_validator("document_id")
    def validate_document_id(cls, value):
        if value:
            return str(value)
        return value

    @classmethod
    def from_node(cls, node_w_score: NodeWithScore) -> "Citation":
        node: BaseNode = node_w_score.node
        page_number = int(node.source_node.metadata["page_label"])
        document_id = node.source_node.metadata[DB_DOC_ID_KEY]
        return cls(
            document_id=document_id,
            text=node.get_content(),
            page_number=page_number,
            score=node_w_score.score,
        )


class QuestionAnswerPair(BaseMetadataObject):
    """
    A question-answer pair that is used to store the sub-questions and answers
    """

    question: str
    answer: Optional[str]
    citations: Optional[List[Citation]] = None

    @classmethod
    def from_sub_question_answer_pair(
        cls, sub_question_answer_pair: SubQuestionAnswerPair
    ):
        if sub_question_answer_pair.sources is None:
            citations = None
        else:
            citations = [
                Citation.from_node(node_w_score)
                for node_w_score in sub_question_answer_pair.sources
                if node_w_score.node.source_node is not None
                and DB_DOC_ID_KEY in node_w_score.node.source_node.metadata
            ]
            citations = citations or None
        return cls(
            question=sub_question_answer_pair.sub_q.sub_question,
            answer=sub_question_answer_pair.answer,
            citations=citations,
        )


# later will be Union[QuestionAnswerPair, more to add later... ]
class SubProcessMetadataKeysEnum(str, Enum):
    SUB_QUESTION = EventPayload.SUB_QUESTION.value


# keeping the typing pretty loose here, in case there are changes to the metadata data formats.
SubProcessMetadataMap = Dict[Union[SubProcessMetadataKeysEnum, str], Any]


class MessageSubProcess(Base):
    message_id: UUID
    source: MessageSubProcessSourceEnum
    status: MessageSubProcessStatusEnum
    metadata_map: Optional[SubProcessMetadataMap]


class Message(Base):
    conversation_id: UUID
    content: str
    role: MessageRoleEnum
    status: MessageStatusEnum
    sub_processes: List[MessageSubProcess]


class UserMessageCreate(BaseModel):
    content: str


class DocumentMetadataKeysEnum(str, Enum):
    """
    Enum for the keys of the metadata map for a document
    """

    SEC_DOCUMENT = "sec_document"


class SecDocumentTypeEnum(str, Enum):
    """
    Enum for the type of sec document
    """

    TEN_K = "10-K"
    TEN_Q = "10-Q"


class SecDocumentMetadata(BaseModel):
    """
    Metadata for a document that is a sec document
    """

    company_name: str
    company_ticker: str
    doc_type: SecDocumentTypeEnum
    year: int
    quarter: Optional[int]
    accession_number: Optional[str]
    cik: Optional[str]
    period_of_report_date: Optional[datetime]
    filed_as_of_date: Optional[datetime]
    date_as_of_change: Optional[datetime]


DocumentMetadataMap = Dict[Union[DocumentMetadataKeysEnum, str], Any]


class Document(Base):
    url: str
    metadata_map: Optional[DocumentMetadataMap] = None


class Conversation(Base):
    messages: List[Message]
    documents: List[Document]


class ConversationCreate(BaseModel):
    document_ids: List[UUID]
core/journal_reading/__init__.py ADDED
File without changes
core/journal_reading/extractor.py ADDED
File without changes
core/journal_reading/upload.py ADDED
@@ -0,0 +1,30 @@
import tempfile
import os
from llama_parse import LlamaParse
from llama_index.core.node_parser import SimpleNodeParser


class JournalUploader:
    def __init__(self):
        # NOTE: parser_journal expects self.s3_client and self.bucket_name
        # to be attached to the instance before it is called.
        pass

    def parser_journal(self, object_name, local_file_name=None):
        if local_file_name is None:
            local_file_name = "downloaded_pdf_file.pdf"  # Default file name

        try:
            # Create a temporary directory to store the file
            temp_dir = tempfile.mkdtemp()
            file_path = os.path.join(temp_dir, local_file_name)

            with open(file_path, "wb") as temp_file:
                self.s3_client.download_fileobj(self.bucket_name, object_name, temp_file)

            documents = LlamaParse(result_type="markdown").load_data(file_path)

            return documents

        except Exception as e:
            # Handle specific exceptions or fall back to a generic one
            print(f"Error reading PDF file: {e}")
            raise RuntimeError(f"Failed to process the uploaded file: {e}")
core/module_creator/__init__.py ADDED
File without changes
core/parser.py ADDED
@@ -0,0 +1,92 @@
import re


def parse_topics_to_dict(text):
    topics = {}
    lines = text.strip().split("\n")
    current_topic = None

    topic_pattern = re.compile(r"^\d+\.\s+(.*)$")
    sub_topic_pattern = re.compile(r"^\*\s+(.*)$")

    for line in lines:
        line = line.strip()
        if topic_pattern.match(line):
            current_topic = topic_pattern.match(line).group(1)
            topics[current_topic] = []
        elif sub_topic_pattern.match(line):
            sub_topic = sub_topic_pattern.match(line).group(1)
            if current_topic:
                topics[current_topic].append(sub_topic)

    print(topics)
    return topics


def remove_all_sources(text):
    # Construct a regular expression pattern to match all sources
    pattern = r"Source \d+:(.*?)(?=Source \d+:|$)"

    # Use re.DOTALL to make '.' match newlines and re.IGNORECASE for case-insensitive matching
    updated_text = re.sub(pattern, "", text, flags=re.DOTALL)

    return updated_text.strip()


def clean_text(text):
    # Replace multiple spaces with a single space
    text = re.sub(r"\s{2,}", " ", text)
    # Remove newline characters that are not followed by a number (to keep lists or numbered points)
    text = re.sub(r"\n(?!\s*\d)", " ", text)
    # Remove unnecessary punctuation (optional, adjust as needed)
    text = re.sub(r";(?=\S)", "", text)
    # Optional: Remove extra spaces around certain characters
    text = re.sub(r"\s*([,;])\s*", r"\1 ", text)
    # Normalize whitespace to a single space
    text = re.sub(r"\s+", " ", text).strip()

    return text


def update_response(text):
    # Find all the references in the text, e.g., [1], [3], [5]
    responses = re.findall(r"\[\d+\]", text)

    # Extract the numbers from the responses, and remove duplicates
    ref_numbers = sorted(set(int(respon.strip("[]")) for respon in responses))

    # Create a mapping from old reference numbers to new ones
    ref_mapping = {old: new for new, old in enumerate(ref_numbers, start=1)}

    # Replace old responses with the updated responses in the text
    for old, new in ref_mapping.items():
        text = re.sub(rf"\[{old}\]", f"[{new}]", text)

    return text


def renumber_sources(source_list):
    new_sources = []
    for i, source in enumerate(source_list):
        # Extract the content after the colon
        content = source.split(": ", 1)[1]
        # Add the new source number and content
        new_sources.append(f"source {i+1}: {content}")
    return new_sources


def seperate_to_list(text):
    # Step 1: Split the text by line breaks (\n)
    lines = text.split("\n")

    # Step 2: Remove occurrences of "source (number):"
    cleaned_lines = [re.sub(r"Source \d+\:", "", line) for line in lines]

    # Step 3: Split fully capitalized sentences out of each line
    final_output = []
    for line in cleaned_lines:
        # Split any fully capitalized sentence (surrounding non-uppercase text remains intact)
        split_line = re.split(r"([A-Z\s]+[.!?])", line)
        final_output.extend([part.strip() for part in split_line if part.strip()])

    return final_output
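A small illustrative check of the renumbering helpers above (hypothetical strings, not part of the commit): update_response rewrites citation markers to a dense 1..n sequence, and renumber_sources relabels the matching source excerpts to the same order.

from core.parser import update_response, renumber_sources

text = "Hypertension is common [2], and treatment is individualized [5]."
print(update_response(text))
# -> "Hypertension is common [1], and treatment is individualized [2]."

sources = ["Source 2: prevalence figures ...", "Source 5: treatment guidance ..."]
print(renumber_sources(sources))
# -> ["source 1: prevalence figures ...", "source 2: treatment guidance ..."]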
core/prompt.py ADDED
@@ -0,0 +1,122 @@
SYSTEM_BOT_TEMPLATE = """
Kamu adalah Medbot, seorang ahli dalam bidang kedokteran. Tugasmu adalah memberikan jawaban yang informatif dan akurat berdasarkan tools yang tersedia dan jangan menghapus referensi atau angka dalam kurung siku, contoh [1], [2] dan sebagainya.

**Instruksi**:

1. **Jawaban Berdasarkan Tools**: Jika pengguna bertanya tentang topik kedokteran, gunakanlah tools yang tersedia untuk memberikan jawaban. Pastikan jawabanmu relevan dan sesuai dengan informasi dari tools tersebut.

2. **Referensi dan Kutipan**: Jangan menghapus sumber kutipan dari teks yang diberikan. Contohnya, jika teksnya adalah "Ilmu kedokteran sangat dibutuhkan [2]", pastikan untuk menyertakan kutipan sumbernya yaitu [2] dalam jawabanmu.

3. **Ketika Tidak Tahu Jawaban**: Jika pertanyaan pengguna tidak dapat dijawab dengan menggunakan tools ini, sampaikan dengan sopan bahwa kamu tidak memiliki jawaban untuk pertanyaan tersebut. Arahkan pengguna untuk mencari informasi lebih lanjut atau bertanya pada ahli di bidang kedokteran.

4. **Gaya Jawaban**: Berikan jawaban dengan gaya yang ramah dan profesional. Hindari penggunaan poin-poin, dan sampaikan informasi secara naratif agar lebih mudah dipahami. Gunakan kata 'dok' atau 'dokter' untuk merujuk pada dokter, dan hindari kesan monoton dengan menambahkan emotikon jika sesuai.

5. **Penutup**: Akhiri komunikasi dengan kalimat yang friendly, seperti "Semoga informasi ini bermanfaat, dok ✨" atau "Jika ada pertanyaan lain, jangan ragu untuk bertanya ya dok 😊"
"""

SYSTEM_TOPIC_TEMPLATE = """
You are tasked with analyzing a table of contents from a book. Your goal is to identify and extract the main topics and subtopics. Please provide a clear and organized list of these topics and subtopics. The list should reflect the structure and hierarchy presented in the table of contents.
"""

USER_TOPIC_TEMPLATE = """

**Task:** Analyze the table of contents of a book to identify the main topics and relevant subtopics.

**Instructions:**

1. **Main Topics:** Identify the main topics from the table of contents, excluding sections like background, preface, introduction, and references.
2. **Subtopics:** For each main topic, list the related subtopics.

**Output Format:**

1. **Main Topic 1**
* Subtopic 1
* Subtopic 2
* etc.

2. **Main Topic 2**
* Subtopic 1
* Subtopic 2
* etc.

**Important Guidelines:**

- Include only relevant main topics and subtopics.
- Ensure the order of topics and subtopics matches the order displayed in the table of contents.
- Use the correct format and do not include additional information beyond the main topics and subtopics.
"""

REFINED_GET_TOPIC_TEMPLATE = """
Ensure the following topic and subtopic are provided:

{topics}

Follow this format:

1. **Main topic 1**
* Subtopic 1
* Subtopic 2
* etc

2. **Main topic 2**
* Subtopic 1
* Subtopic 2
* etc

etc

Do not add any additional text; only use the specified format.
"""

ADD_METADATA_TEMPLATE = """
**Context for Adding Metadata**

{context_str}

**Context Structure:**

1. **Main Topic 1**
* Subtopic 1
* Subtopic 2
* etc

2. **Main Topic 2**
* Subtopic 1
* Subtopic 2
* etc

**Given:**
- **Topic and Subtopic:** {{extractor_output}}

**Role:**
Your task is to extract and organize metadata for the {class_name}. Follow the instructions below:

**Instructions:**

1. **Extract the Main Topic:**
- **Goal:** Identify the overarching theme or subject from the provided topic and subtopic.
- **How:** Look for a theme broad enough to encompass the document's primary focus while remaining specific enough to reflect its core purpose.
- **Tip:** Ensure the main topic is concise yet descriptive, providing a clear understanding of the document's primary theme. If the content is general or introductory (e.g., background, preface, introduction, references), categorize it accordingly.

2. **Extract the Key Subtopic (if applicable):**
- **Goal:** Determine the most relevant supporting element related to the main topic.
- **How:** Identify a sub-element or detail that provides additional depth or clarification to the main topic.
- **Tip:** Ensure the subtopic directly supports or elaborates on the main topic.

3. **Handle Cases Without a Clear Subtopic:**
- **Goal:** If no distinct subtopic is present, set the subtopic to mirror the main topic.
- **How:** In such cases, consider the main topic comprehensive enough to stand alone without additional subtopics.

4. **Record the Extracted Data:**
- **Goal:** Organize and record the extracted topic and subtopic within the {class_name} class.
- **How:** Structure the entries clearly and precisely as attributes of the class.
- **Tip:** Use precise language to capture the relationship between the main topic and subtopic, ensuring clarity and ease of reference for future use.
"""

SUMMARIZER_SYSTEM_TEMPLATE = """

"""

SUMMARIER_HUMAN_TEMPLATE = """

"""
core/summarization/__init__.py ADDED
File without changes
core/summarization/summarizer.py ADDED
@@ -0,0 +1,135 @@
from io import BytesIO
import os
import base64
import fitz

from fastapi import HTTPException
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterCondition,
)

from llama_index.core import load_index_from_storage
from llama_index.core.storage import StorageContext
from llama_index.llms.openai import OpenAI
from core.parser import parse_topics_to_dict
from llama_index.core.llms import ChatMessage
from core.prompt import (
    SYSTEM_TOPIC_TEMPLATE,
    USER_TOPIC_TEMPLATE,
    REFINED_GET_TOPIC_TEMPLATE,
)

# from langfuse.openai import openai


class SummarizeGenerator:
    def __init__(self, references):

        self.references = references
        self.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=4096)

    def extract_pages(self, content_table):
        try:
            content_bytes = content_table.file.read()
            print(content_bytes)
            # Open the PDF file
            content_table = fitz.open(stream=content_bytes, filetype="pdf")
            print(content_table)
            # content_table = fitz.open(topics_image)
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Error opening PDF file: {e}")

        # Initialize a list to collect base64 encoded images
        pix_encoded_combined = []

        # Iterate over each page to extract images
        for page_number in range(len(content_table)):
            try:
                page = content_table.load_page(page_number)
                pix_encoded = self._extract_image_as_base64(page)
                pix_encoded_combined.append(pix_encoded)
                # print("pix encoded combined", pix_encoded_combined)

            except Exception as e:
                print(f"Error processing page {page_number}: {e}")
                continue  # Skip to the next page if there's an error

        if not pix_encoded_combined:
            raise HTTPException(status_code=404, detail="No images found in the PDF")

        return pix_encoded_combined

    def extract_content_table(self, content_table):
        try:
            images = self.extract_pages(content_table)

            image_messages = [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image}",
                    },
                }
                for image in images
            ]

            messages = [
                ChatMessage(
                    role="system",
                    content=[{"type": "text", "text": SYSTEM_TOPIC_TEMPLATE}],
                ),
                ChatMessage(
                    role="user",
                    content=[
                        {"type": "text", "text": USER_TOPIC_TEMPLATE},
                        *image_messages,
                    ],
                ),
            ]

            extractor_output = self.llm.chat(messages)
            print("extractor output : ", extractor_output)
            refined_extractor_output = self.llm.complete(
                REFINED_GET_TOPIC_TEMPLATE.format(topics=str(extractor_output))
            )

            print("refined extractor output : ", str(refined_extractor_output))

            extractor_dics = dict(parse_topics_to_dict(str(refined_extractor_output)))

            return str(refined_extractor_output), extractor_dics

        except Exception as e:
            raise HTTPException(status_code=500, detail=f"An error occurred: {e}")

    def _extract_image_as_base64(self, page):
        try:
            pix = page.get_pixmap()
            pix_bytes = pix.tobytes()
            return base64.b64encode(pix_bytes).decode("utf-8")
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Error extracting image: {e}")

    def index_summarizer_engine(self, topic, subtopic, index):
        filters = MetadataFilters(
            filters=[
                MetadataFilter(key="title", value=topic),
                MetadataFilter(key="category", value=subtopic),
            ],
            condition=FilterCondition.AND,
        )

        # Create the query engine with the index and filters
        kwargs = {"similarity_top_k": 5, "filters": filters}

        query_engine = index.as_query_engine(**kwargs)

        return query_engine

    def get_summarizer_engine(self, topic, subtopic):
        pass

    def prepare_summaries(self):
        pass
core/tools.py ADDED
File without changes
db/__init__.py ADDED
File without changes
db/db.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ # Experimental
+
+ from sqlalchemy import Column, String, ForeignKey, DateTime
+ from sqlalchemy.dialects.postgresql import UUID, ENUM, JSONB
+ from sqlalchemy.orm import relationship
+ from sqlalchemy.sql import func
+ from enum import Enum
+ from sqlalchemy.ext.declarative import as_declarative, declared_attr
+ from llama_index.core.callbacks.schema import CBEventType
+
+
+ # Model
+ @as_declarative()
+ class Base:
+     id = Column(UUID, primary_key=True, index=True, default=func.uuid_generate_v4())
+     created_at = Column(DateTime, server_default=func.now(), nullable=False)
+     updated_at = Column(
+         DateTime, server_default=func.now(), onupdate=func.now(), nullable=False
+     )
+
+     __name__: str
+
+     # Generate __tablename__ automatically
+     @declared_attr
+     def __tablename__(cls) -> str:
+         return cls.__name__.lower()
+
+ # Enums used by the models below
+ class MessageRoleEnum(str, Enum):
+     user = "user"
+     assistant = "assistant"
+
+
+ class MessageStatusEnum(str, Enum):
+     PENDING = "PENDING"
+     SUCCESS = "SUCCESS"
+     ERROR = "ERROR"
+
+
+ class MessageSubProcessStatusEnum(str, Enum):
+     PENDING = "PENDING"
+     FINISHED = "FINISHED"
+
+
+ # Python doesn't allow enums to be extended, so the extra members are appended here
+ additional_message_subprocess_fields = {
+     "CONSTRUCTED_QUERY_ENGINE": "constructed_query_engine",
+     "SUB_QUESTIONS": "sub_questions",
+ }
+ MessageSubProcessSourceEnum = Enum(
+     "MessageSubProcessSourceEnum",
+     [(event_type.name, event_type.value) for event_type in CBEventType]
+     + list(additional_message_subprocess_fields.items()),
+ )
+
+
+ def to_pg_enum(enum_class) -> ENUM:
+     return ENUM(enum_class, name=enum_class.__name__)
+
+
+ class Document(Base):
+     """
+     A document along with its metadata
+     """
+
+     # URL to the actual document (e.g. a PDF)
+     url = Column(String, nullable=False, unique=True)
+     metadata_map = Column(JSONB, nullable=True)
+     conversations = relationship("ConversationDocument", back_populates="document")
+
+
+ class Conversation(Base):
+     """
+     A conversation with messages and linked documents
+     """
+
+     messages = relationship("Message", back_populates="conversation")
+     conversation_documents = relationship(
+         "ConversationDocument", back_populates="conversation"
+     )
+
+
+ class ConversationDocument(Base):
+     """
+     A many-to-many relationship between a conversation and a document
+     """
+
+     conversation_id = Column(
+         UUID(as_uuid=True), ForeignKey("conversation.id"), index=True
+     )
+     document_id = Column(UUID(as_uuid=True), ForeignKey("document.id"), index=True)
+     conversation = relationship("Conversation", back_populates="conversation_documents")
+     document = relationship("Document", back_populates="conversations")
+
+
+ class Message(Base):
+     """
+     A message in a conversation
+     """
+
+     conversation_id = Column(
+         UUID(as_uuid=True), ForeignKey("conversation.id"), index=True
+     )
+     content = Column(String)
+     role = Column(to_pg_enum(MessageRoleEnum))
+     status = Column(to_pg_enum(MessageStatusEnum), default=MessageStatusEnum.PENDING)
+     conversation = relationship("Conversation", back_populates="messages")
+     sub_processes = relationship("MessageSubProcess", back_populates="message")
+
+
+ class MessageSubProcess(Base):
+     """
+     A record of a sub-process that occurred as part of the generation of a message from an AI assistant
+     """
+
+     message_id = Column(UUID(as_uuid=True), ForeignKey("message.id"), index=True)
+     source = Column(to_pg_enum(MessageSubProcessSourceEnum))
+     message = relationship("Message", back_populates="sub_processes")
+     status = Column(
+         to_pg_enum(MessageSubProcessStatusEnum),
+         default=MessageSubProcessStatusEnum.FINISHED,
+         nullable=False,
+     )
+     metadata_map = Column(JSONB, nullable=True)
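A hedged sketch of materializing this experimental schema. It assumes a local Postgres database, a synchronous driver such as psycopg2 (not pinned in this commit's requirements), and that the `uuid-ossp` extension can be installed so the `uuid_generate_v4()` column default works; the connection URL is an example, not the project's real configuration:

from sqlalchemy import create_engine, text
from db.db import Base  # the declarative base defined above

engine = create_engine("postgresql+psycopg2://user:password@localhost/summarizer")

with engine.begin() as conn:
    # uuid_generate_v4(), used as the id default, lives in the uuid-ossp extension.
    conn.execute(text('CREATE EXTENSION IF NOT EXISTS "uuid-ossp"'))

# Creates the document, conversation, conversationdocument, message and
# messagesubprocess tables derived from the models above.
Base.metadata.create_all(bind=engine)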
db/delete_data.py ADDED
@@ -0,0 +1,22 @@
+ import logging
+ from db.repository import Repository, get_db_conn
+
+ # Setup logging (configure as needed)
+ logging.basicConfig(level=logging.INFO)
+
+
+ class DeleteDatabase(Repository):
+     async def delete_record(self, params):
+         if "id" not in params:
+             raise ValueError("The 'id' parameter is required.")
+         query = """
+             DELETE FROM Metadata
+             WHERE id = :id
+         """
+
+         try:
+             await self._exec(query, params)
+             logging.info(f"Record with id {params['id']} deleted successfully.")
+         except Exception as e:
+             logging.error(f"Error deleting record with id {params['id']}: {e}")
+             raise
db/get_data.py ADDED
@@ -0,0 +1,56 @@
+ import logging
+ from db.repository import Repository, get_db_conn
+
+ # Setup logging (configure as needed)
+ logging.basicConfig(level=logging.INFO)
+
+
+ class GetDatabase(Repository):
+     def __init__(self, db_conn):
+         super().__init__(db_conn)
+
+     async def execute_query(self, query, params=None, fetch_one=False):
+         """
+         Helper function to execute SQL queries and handle exceptions.
+         """
+         try:
+             if fetch_one:
+                 results = await self._fetch_one(query, params)
+             else:
+                 results = await self.get_by_query(query, params)
+             return results if results else None
+         except Exception as e:
+             logging.error(f"An error occurred while executing query: {e}")
+             return None
+
+     async def get_data(self, title):
+         """
+         Fetch the first result matching the given title from the metadata table.
+         """
+         # The databases library expects named placeholders and a dict of values.
+         query = """
+             SELECT * FROM Metadata
+             WHERE title = :title
+             LIMIT 5;
+         """
+
+         try:
+             results = await self.execute_query(query, {"title": title}, fetch_one=True)
+             return results
+         except Exception as e:
+             logging.error(f"An error occurred while fetching data: {e}")
+             return None
+
+     async def get_all_data(self):
+         """
+         Fetch all data from the metadata table.
+         """
+         query = """
+             SELECT * FROM Metadata
+         """
+         results = await self.execute_query(query)
+         return results
db/repository.py ADDED
@@ -0,0 +1,36 @@
+ from databases import Database
+ import datetime
+
+
+ def get_db_conn(config):
+     db_url = f"{config.DB_URI}"
+     return Database(db_url)
+
+
+ class Repository:
+     def __init__(self, db_conn):
+         self.db_conn = db_conn
+
+     async def get_by_query(self, query, param):
+         results = await self.db_conn.fetch_all(query, param)
+         return [dict(result) for result in results]
+
+     async def _fetch_one(self, query, param):
+         result = await self.db_conn.fetch_one(query, param)
+         return dict(result) if result is not None else result
+
+     async def _exec(self, query, param):
+         return await self.db_conn.execute(query, param)
+
+     async def _exec_many(self, query, params):
+         return await self.db_conn.execute_many(query, params)
+
+     def update_params(self, params, update=False):
+         # Stamp createdAt/updatedAt on insert, or only updatedAt on update.
+         current_time = datetime.datetime.now()
+         if not update:
+             params.update({"createdAt": current_time, "updatedAt": current_time})
+         else:
+             params.update({"updatedAt": current_time})
+         return params
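A hedged end-to-end sketch of how these repository classes appear to be wired together. The `config` module exposing `DB_URI` and the exact field values are assumptions; only the classes themselves are defined in this commit:

import asyncio

import config  # assumed to expose DB_URI
from db.repository import get_db_conn
from db.save_data import InsertDatabase
from db.get_data import GetDatabase


async def main():
    db = get_db_conn(config)  # databases.Database instance
    await db.connect()
    try:
        await InsertDatabase(db).insert_data({
            "title": "Example Book",
            "category": "Artificial Intelligence",
            "author": "Jane Doe",
            "year": 2024,
            "publisher": "Example Press",
        })
        rows = await GetDatabase(db).get_data("Example Book")
        print(rows)
    finally:
        await db.disconnect()


asyncio.run(main())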
db/save_data.py ADDED
@@ -0,0 +1,30 @@
+ import logging
+ from dotenv import load_dotenv
+ from db.repository import Repository
+
+ load_dotenv()
+
+
+ class InsertDatabase(Repository):
+
+     # Insert a metadata record asynchronously
+     async def insert_data(self, params):
+         # SQL insert query with named placeholders
+         query = """
+             INSERT INTO Metadata (title, category, author, year, publisher, createdAt, updatedAt)
+             VALUES (:title, :category, :author, :year, :publisher, :createdAt, :updatedAt)
+         """
+
+         # Add createdAt/updatedAt timestamps before binding the parameters
+         reference = self.update_params(params)
+
+         try:
+             # Execute the query with the provided values
+             await self._exec(query, reference)
+             logging.info(
+                 f"Data inserted successfully: {reference['title']}, {reference['author']}"
+             )
+         except Exception as e:
+             # Log any errors that occur during the database insert operation
+             logging.error(f"Failed to insert data: {e}")
+             raise  # Re-raise the exception to allow further handling if needed
db/update_data.py ADDED
@@ -0,0 +1,36 @@
+ import logging
+ from db.repository import Repository, get_db_conn
+
+ # Setup logging (configure as needed)
+ logging.basicConfig(level=logging.INFO)
+
+
+ class UpdateDatabase(Repository):
+     async def update_record(self, reference):
+         if "id" not in reference:
+             raise ValueError("The 'id' parameter is required.")
+         query = """
+             UPDATE Metadata
+             SET title = :title,
+                 category = :category,
+                 author = :author,
+                 year = :year,
+                 publisher = :publisher,
+                 updatedAt = :updatedAt
+             WHERE id = :id
+         """
+
+         # Refresh only the updatedAt timestamp for an update
+         updated_reference = self.update_params(reference, update=True)
+
+         try:
+             await self._exec(query, updated_reference)
+             logging.info(
+                 f"Record with id {updated_reference['id']} updated successfully."
+             )
+         except Exception as e:
+             logging.error(
+                 f"Error updating record with id {updated_reference['id']}: {e}"
+             )
+             raise
requirements.txt ADDED
@@ -0,0 +1,162 @@
+ aiohappyeyeballs==2.4.0
+ aiohttp==3.10.5
+ aiomysql==0.2.0
+ aiosignal==1.3.1
+ annotated-types==0.7.0
+ anyio==4.4.0
+ asgiref==3.8.1
+ attrs==24.2.0
+ backoff==2.2.1
+ bcrypt==4.2.0
+ beautifulsoup4==4.12.3
+ boto3==1.35.24
+ botocore==1.35.24
+ build==1.2.2
+ cachetools==5.5.0
+ certifi==2024.8.30
+ chardet==5.2.0
+ charset-normalizer==3.3.2
+ chroma-hnswlib==0.7.6
+ chromadb==0.5.7
+ click==8.1.7
+ coloredlogs==15.0.1
+ databases==0.9.0
+ dataclasses-json==0.6.7
+ Deprecated==1.2.14
+ dirtyjson==1.0.8
+ distro==1.9.0
+ dnspython==1.16.0
+ fastapi==0.113.0
+ filelock==3.16.1
+ flatbuffers==24.3.25
+ frozenlist==1.4.1
+ fsspec==2024.9.0
+ google-auth==2.34.0
+ googleapis-common-protos==1.65.0
+ greenlet==3.0.3
+ grpcio==1.66.1
+ h11==0.14.0
+ httpcore==1.0.5
+ httptools==0.6.1
+ httpx==0.27.2
+ huggingface-hub==0.25.0
+ humanfriendly==10.0
+ idna==3.8
+ importlib_metadata==8.4.0
+ importlib_resources==6.4.5
+ Jinja2==3.1.4
+ jiter==0.5.0
+ joblib==1.4.2
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ kubernetes==30.1.0
+ langchain==0.3.0
+ langchain-community==0.3.0
+ langchain-core==0.3.1
+ langchain-openai==0.2.0
+ langchain-text-splitters==0.3.0
+ langchainhub==0.1.21
+ langfuse==2.48.1
+ langsmith==0.1.123
+ llama-cloud==0.0.17
+ llama-index==0.11.10
+ llama-index-agent-openai==0.3.1
+ llama-index-callbacks-langfuse==0.2.0
+ llama-index-cli==0.3.1
+ llama-index-core==0.11.10
+ llama-index-embeddings-openai==0.2.4
+ llama-index-indices-managed-llama-cloud==0.3.0
+ llama-index-legacy==0.9.48.post3
+ llama-index-llms-openai==0.2.7
+ llama-index-multi-modal-llms-openai==0.2.0
+ llama-index-program-openai==0.2.0
+ llama-index-question-gen-openai==0.2.0
+ llama-index-readers-file==0.2.1
+ llama-index-readers-llama-parse==0.3.0
+ llama-index-storage-chat-store-redis==0.2.0
+ llama-index-vector-stores-pinecone==0.2.1
+ llama-parse==0.5.2
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ marshmallow==3.22.0
+ mdurl==0.1.2
+ mmh3==5.0.0
+ monotonic==1.6
+ mpmath==1.3.0
+ multidict==6.0.5
+ mypy-extensions==1.0.0
+ mysqlclient==2.2.4
+ nest_asyncio==1.6.0
+ networkx==3.3
+ nltk==3.9.1
+ numpy==1.26.4
+ oauthlib==3.2.2
+ onnxruntime==1.19.2
+ openai==1.43.1
+ opentelemetry-api==1.27.0
+ opentelemetry-exporter-otlp-proto-common==1.27.0
+ opentelemetry-exporter-otlp-proto-grpc==1.27.0
+ opentelemetry-instrumentation==0.48b0
+ opentelemetry-instrumentation-asgi==0.48b0
+ opentelemetry-instrumentation-fastapi==0.48b0
+ opentelemetry-proto==1.27.0
+ opentelemetry-sdk==1.27.0
+ opentelemetry-semantic-conventions==0.48b0
+ opentelemetry-util-http==0.48b0
+ orjson==3.10.7
+ overrides==7.7.0
+ pandas==2.2.2
+ pillow==10.4.0
+ pinecone-client==5.0.1
+ pinecone-plugin-inference==1.0.3
+ pinecone-plugin-interface==0.0.7
+ posthog==3.6.6
+ protobuf==4.25.5
+ protoc-gen-openapiv2==0.0.1
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.1
+ pydantic==2.9.0
+ pydantic-settings==2.4.0
+ pydantic_core==2.23.2
+ pymongo==3.11.0
+ PyMuPDF==1.24.10
+ PyMuPDFb==1.24.10
+ PyMySQL==1.1.1
+ pypdf==4.3.1
+ PyPDF2==3.0.1
+ PyPika==0.48.9
+ pyproject_hooks==1.1.0
+ pyreadline3==3.5.4
+ python-dotenv==1.0.1
+ python-multipart==0.0.9
+ pytz==2024.1
+ PyYAML==6.0.2
+ redis==5.0.8
+ regex==2024.7.24
+ requests==2.32.3
+ requests-oauthlib==2.0.0
+ rich==13.8.1
+ rsa==4.9
+ shellingham==1.5.4
+ sniffio==1.3.1
+ soupsieve==2.6
+ SQLAlchemy==2.0.34
+ sse-starlette==2.1.3
+ starlette==0.38.4
+ striprtf==0.0.26
+ sympy==1.13.3
+ tenacity==8.5.0
+ tiktoken==0.7.0
+ tokenizers==0.20.0
+ tqdm==4.66.5
+ typer==0.12.5
+ types-requests==2.32.0.20240914
+ typing-inspect==0.9.0
+ tzdata==2024.1
+ urllib3==2.2.2
+ uvicorn==0.30.6
+ watchfiles==0.24.0
+ websocket-client==1.8.0
+ websockets==13.0.1
+ wrapt==1.16.0
+ yarl==1.9.11
research/delete.ipynb ADDED
@@ -0,0 +1,110 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import os\n",
+     "work_directory = r\"D:\\Project Multimedika\\Projek 2\\fullstack_summarizer_and_bot_development\\backend\"\n",
+     "os.chdir(work_directory)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "'D:\\\\Project Multimedika\\\\Projek 2\\\\fullstack_summarizer_and_bot_development\\\\backend'"
+       ]
+      },
+      "execution_count": 2,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "%pwd"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "c:\\Users\\hamza\\anaconda3\\envs\\fullstack\\Lib\\site-packages\\pinecone\\data\\index.py:1: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+       "  from tqdm.autonotebook import tqdm\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "<class 'pinecone.core.openapi.data.model.query_response.QueryResponse'>\n",
+       "pinecone database deleted\n"
+      ]
+     }
+    ],
+    "source": [
+     "# pip install \"pinecone[grpc]\"\n",
+     "from pinecone.grpc import PineconeGRPC as Pinecone\n",
+     "from dotenv import load_dotenv\n",
+     "import os\n",
+     "import random\n",
+     "\n",
+     "load_dotenv()\n",
+     "\n",
+     "api_key = os.getenv(\"PINECONE_API_KEY\")\n",
+     "\n",
+     "pc = Pinecone(api_key=api_key)\n",
+     "index = pc.Index(\"summarizer-semantic-index\")\n",
+     "\n",
+     "random_vector = [random.uniform(0, 1) for _ in range(1536)]\n",
+     "results = index.query(\n",
+     "    vector=random_vector,\n",
+     "    top_k=10000,\n",
+     "    filter={\n",
+     "        \"category\": {\"$eq\": \"Artificial Intelligence\"},\n",
+     "    },\n",
+     ")\n",
+     "\n",
+     "ids = set()\n",
+     "print(type(results))\n",
+     "for result in results['matches']:\n",
+     "    ids.add(result['id'])\n",
+     "\n",
+     "index.delete(ids=ids)\n",
+     "print(\"pinecone database deleted\")"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "fullstack",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.9"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
research/summarizer.ipynb ADDED
@@ -0,0 +1,36 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from langchain.chains.summarize import load_summarize_chain\n",
+     "from langchain_community.document_loaders import WebBaseLoader\n",
+     "from langchain_openai import ChatOpenAI\n",
+     "\n",
+     "loader = WebBaseLoader(\"https://lilianweng.github.io/posts/2023-06-23-agent/\")\n",
+     "docs = loader.load()\n",
+     "\n",
+     "llm = ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo-1106\")\n",
+     "chain = load_summarize_chain(llm, chain_type=\"stuff\")\n",
+     "\n",
+     "chain.run(docs)"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "fullstack",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "name": "python",
+    "version": "3.11.9"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
research/test_mongodb.ipynb ADDED
File without changes
research/test_table.md ADDED
@@ -0,0 +1,6 @@
+ | Location    | Date       | Average Temperature (°C) | Maximum Temperature (°C) | Minimum Temperature (°C) |
+ |-------------|------------|--------------------------|--------------------------|--------------------------|
+ | New York    | 2024-09-20 | 22                       | 25                       | 19                       |
+ | Los Angeles | 2024-09-20 | 26                       | 29                       | 23                       |
+ | Chicago     | 2024-09-20 | 20                       | 23                       | 17                       |
+ | Miami       | 2024-09-20 | 28                       | 31                       | 25                       |
script/__init__.py ADDED
File without changes
script/build_vector.py ADDED
@@ -0,0 +1,84 @@
+ from llama_index.core import VectorStoreIndex
+ from llama_index.core import StorageContext
+ from pinecone import Pinecone, ServerlessSpec
+ from llama_index.llms.openai import OpenAI
+ from llama_index.vector_stores.pinecone import PineconeVectorStore
+ from fastapi import HTTPException, status
+ from config import PINECONE_CONFIG
+ import os
+ import json
+
+
+ class IndexManager:
+     def __init__(self):
+         self.vector_index = None
+         self.index_name = "summarizer-semantic-index"
+
+     def _get_pinecone_client(self):
+         """Initialize and return the Pinecone client."""
+         # api_key = os.getenv("PINECONE_API_KEY")
+         api_key = PINECONE_CONFIG.PINECONE_API_KEY
+         if not api_key:
+             raise ValueError(
+                 "Pinecone API key is missing. Please set it in environment variables."
+             )
+         return Pinecone(api_key=api_key)
+
+     def _create_pinecone_index(self, client):
+         """Create the Pinecone index if it doesn't already exist."""
+         if self.index_name not in client.list_indexes().names():
+             client.create_index(
+                 name=self.index_name,
+                 dimension=1536,
+                 metric="cosine",
+                 spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+             )
+         return client.Index(self.index_name)
+
+     def _initialize_vector_store(self, pinecone_index):
+         """Initialize and return the storage context backed by the Pinecone index."""
+         vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
+         return StorageContext.from_defaults(vector_store=vector_store)
+
+     def build_indexes(self, nodes):
+         """Build the vector index from nodes and persist it to Pinecone."""
+         try:
+             client = self._get_pinecone_client()
+             pinecone_index = self._create_pinecone_index(client)
+             storage_context = self._initialize_vector_store(pinecone_index)
+
+             self.vector_index = VectorStoreIndex(nodes, storage_context=storage_context)
+             self.vector_index.set_index_id("vector")
+
+             print(f"Vector Index ID: {self.vector_index.index_id}")
+             print("Vector Index created successfully.")
+
+             response = {
+                 "status": "success",
+                 "message": "Vector Index created successfully.",
+             }
+
+             return json.dumps(response)
+         except HTTPException as http_exc:
+             raise http_exc  # Re-raise HTTPExceptions so FastAPI handles them
+         except Exception as e:
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=f"Error building indexes: {str(e)}",
+             )
+
+     def load_existing_indexes(self):
+         """Load the existing vector index from Pinecone."""
+         try:
+             client = self._get_pinecone_client()
+             pinecone_index = client.Index(self.index_name)
+             print(pinecone_index.describe_index_stats())
+             vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
+             vector_index = VectorStoreIndex.from_vector_store(vector_store)
+
+             print("Existing Vector Index loaded successfully.")
+             return vector_index
+         except Exception as e:
+             print(f"Error loading existing indexes: {e}")
+             raise
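A brief usage sketch for IndexManager. The question text is illustrative, and it assumes the Pinecone and OpenAI credentials the class relies on are already configured:

from script.build_vector import IndexManager

index_manager = IndexManager()

# Reuse the existing "summarizer-semantic-index" rather than rebuilding it.
vector_index = index_manager.load_existing_indexes()

# Simple, unfiltered query over whatever has already been ingested.
query_engine = vector_index.as_query_engine(similarity_top_k=5)
print(query_engine.query("What does the book say about supervised learning?"))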
script/document_uploader.py ADDED
@@ -0,0 +1,115 @@
+ from llama_index.core.ingestion import IngestionPipeline
+ from llama_index.core.extractors import PydanticProgramExtractor
+ from llama_index.embeddings.openai import OpenAIEmbedding
+ from config import PINECONE_CONFIG
+ from pinecone.grpc import PineconeGRPC as Pinecone
+ from service.reader import Reader
+ from script.get_metadata import Metadata
+ from fastapi import UploadFile, HTTPException, status
+
+ from llama_index.core.node_parser import (
+     SentenceSplitter,
+     SemanticSplitterNodeParser,
+ )
+
+ # from script.get_topic import extract_topic
+
+ import logging
+ import random
+
+
+ class Uploader:
+     # def __init__(self, reference, file: UploadFile, content_table: UploadFile):
+     def __init__(self, reference, file: UploadFile):
+         self.file = file
+         # self.content_table = content_table
+         self.reader = Reader()
+         self.reference = reference
+         self.metadata = Metadata(reference)
+
+     async def ingest_documents(self, file: UploadFile):
+         """Load documents from the uploaded file."""
+         documents = await self.reader.read_from_uploadfile(file)
+         print("document successfully ingested")
+
+         return documents
+
+     def check_existing_metadata(self, pinecone_index, title, random_vector):
+         try:
+             result = pinecone_index.query(
+                 vector=random_vector,
+                 top_k=1,
+                 filter={
+                     "title": {"$eq": title},
+                 },
+             )
+             return result["matches"]
+         except Exception as e:
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=f"Error checking existing metadata: {str(e)}",
+             )
+
+     async def process_documents(self):
+         # Ingest documents
+         documents = await self.ingest_documents(self.file)
+
+         # topic_extractor = extract_topic(self.reference, self.content_table)
+
+         embed_model = OpenAIEmbedding()
+
+         # Attach the reference metadata (title, author, category, ...) to every document
+         documents_with_metadata = self.metadata.apply_metadata(documents)
+
+         # document_filtered = self.filter_document(documents_with_metadata)
+
+         # Alternative: run the same splitter through an ingestion pipeline
+         # pipeline = IngestionPipeline(
+         #     transformations=[
+         #         SemanticSplitterNodeParser(
+         #             buffer_size=1,
+         #             breakpoint_percentile_threshold=95,
+         #             embed_model=embed_model,
+         #         ),
+         #         # topic_extractor,
+         #     ]
+         # )
+
+         splitter = SemanticSplitterNodeParser(
+             buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
+         )
+
+         # Split the documents into nodes
+         try:
+             # nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
+             nodes_with_metadata = splitter.get_nodes_from_documents(documents_with_metadata)
+             print("Pipeline processing completed.")
+             return nodes_with_metadata
+         except Exception as e:
+             # Log the error and raise HTTPException for FastAPI
+             logging.error(f"An error occurred in making pipeline: {e}")
+             raise HTTPException(
+                 status_code=500,
+                 detail="An internal server error occurred making pipeline.",
+             )
+
+     def filter_document(self, documents):
+         api_key = PINECONE_CONFIG.PINECONE_API_KEY
+         client = Pinecone(api_key=api_key)
+         pinecone_index = client.Index("test")
+
+         random_vector = [random.uniform(0, 1) for _ in range(1536)]
+
+         # Keep only documents whose title is not already present in the index
+         filtered_documents = []
+         for doc in documents:
+             result = self.check_existing_metadata(
+                 pinecone_index, doc.metadata["title"], random_vector
+             )
+
+             if len(result) == 0:
+                 filtered_documents.append(doc)
+
+         return filtered_documents
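A hedged sketch of how Uploader and IndexManager appear to fit together in an upload endpoint. The route path and reference payload are illustrative, and `Reader.read_from_uploadfile` is assumed to exist as imported above:

from fastapi import FastAPI, UploadFile
from script.document_uploader import Uploader
from script.build_vector import IndexManager

app = FastAPI()


@app.post("/upload")  # hypothetical route
async def upload_book(file: UploadFile):
    reference = {
        "title": "Example Book",
        "author": "Jane Doe",
        "category": "Artificial Intelligence",
        "year": 2024,
        "publisher": "Example Press",
    }
    uploader = Uploader(reference, file)
    nodes = await uploader.process_documents()  # read, tag metadata, split into nodes
    return IndexManager().build_indexes(nodes)  # embed and store the nodes in Pinecone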
script/get_metadata.py ADDED
@@ -0,0 +1,40 @@
+ # Get reference
+
+
+ class Metadata:
+     def __init__(self, reference):
+         self.title = reference["title"]
+         self.author = reference["author"]
+         self.category = reference["category"]
+         self.year = reference["year"]
+         self.publisher = reference["publisher"]
+
+     def add_metadata(self, documents, metadata):
+         """Add metadata to each item (document or node)."""
+         for document in documents:
+             if not hasattr(document, "metadata") or document.metadata is None:
+                 document.metadata = {}
+             document.metadata.update(metadata)
+         print("metadata is added")
+         # self.logger.log_action(f"Metadata added to document {item.id_}", action_type="METADATA")
+
+         return documents
+
+     def _generate_metadata(self):
+         """Generate metadata and return it."""
+         metadata = {
+             "title": self.title,
+             "author": self.author,
+             "category": self.category,
+             "year": self.year,
+             "publisher": self.publisher,
+             "reference": f"{self.author}. ({self.year}). *{self.title}*. {self.publisher}.",  # APA style reference
+         }
+         print("metadata is generated")
+         return metadata
+
+     def apply_metadata(self, documents):
+         """Apply generated metadata to documents."""
+         metadata = self._generate_metadata()
+         print("metadata is applied")
+         return self.add_metadata(documents, metadata)
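A short usage example for the Metadata helper; the reference values and Document construction are illustrative:

from llama_index.core import Document
from script.get_metadata import Metadata

reference = {
    "title": "Example Book",
    "author": "Jane Doe",
    "category": "Artificial Intelligence",
    "year": 2024,
    "publisher": "Example Press",
}
docs = [Document(text="Chapter 1 ..."), Document(text="Chapter 2 ...")]

# Every document now carries the title/category fields used later for filtering,
# plus an APA-style "reference" string.
tagged = Metadata(reference).apply_metadata(docs)
print(tagged[0].metadata["reference"])  # Jane Doe. (2024). *Example Book*. Example Press.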
script/get_topic.py ADDED
@@ -0,0 +1,82 @@
+ import nest_asyncio
+ import os
+ from dotenv import load_dotenv
+ from jinja2 import Template
+ from pydantic import BaseModel, Field
+ from pymongo.mongo_client import MongoClient
+
+ from llama_index.program.openai import OpenAIPydanticProgram
+ from llama_index.core.extractors import PydanticProgramExtractor
+ from llama_index.llms.openai import OpenAI
+
+ from core.prompt import ADD_METADATA_TEMPLATE
+ from core.summarization.summarizer import SummarizeGenerator
+
+ nest_asyncio.apply()
+
+ load_dotenv()
+
+
+ class NodeMetadata(BaseModel):
+     """Metadata for nodes, capturing topic and subtopic from the book."""
+
+     topic: str = Field(
+         ...,
+         description="The main subject or category that the node is associated with, representing a broad theme within the book.",
+     )
+     subtopic: str = Field(
+         ...,
+         description="A more specific aspect or section under the main topic, refining the context of the node within the book.",
+     )
+
+
+ def extract_topic(references, content_table):
+     uri = os.getenv("MONGO_URI")
+     client = MongoClient(uri)
+
+     try:
+         client.admin.command("ping")
+         print("Pinged your deployment. You successfully connected to MongoDB!")
+     except Exception as e:
+         print(e)
+
+     # Access a specific database
+     db = client["summarizer"]
+
+     # Access a collection within the database
+     collection = db["topic_collection"]
+
+     generate_content_table = SummarizeGenerator(references)
+     extractor_output, extractor_dics = generate_content_table.extract_content_table(content_table)
+     print(extractor_output)
+
+     data_to_insert = {
+         "title": references["title"],
+         **extractor_dics,  # Unpack the extracted topic/subtopic dictionary
+     }
+
+     collection.insert_one(data_to_insert)
+
+     add_metadata_template = str(
+         Template(ADD_METADATA_TEMPLATE).render(extractor_output=extractor_output)
+     )
+
+     print("add metadata template : ", add_metadata_template)
+
+     llm = OpenAI(temperature=0.1, model="gpt-4o-mini")
+
+     openai_program = OpenAIPydanticProgram.from_defaults(
+         output_cls=NodeMetadata,
+         prompt_template_str="{input}",
+         extract_template_str=add_metadata_template,
+         llm=llm,
+     )
+
+     topic_extractor = PydanticProgramExtractor(
+         program=openai_program,
+         input_key="input",
+         show_progress=True,
+         extract_template_str=add_metadata_template,
+         llm=llm,
+     )
+
+     return topic_extractor
service/__init__.py ADDED
File without changes
service/aws_loader.py ADDED
@@ -0,0 +1,67 @@
+ import os
+ import boto3
+ import tempfile
+ import fitz
+ from io import BytesIO
+
+ from fastapi import HTTPException
+
+
+ class Loader:
+     def __init__(self):
+         # Create an S3 client with credentials from the environment
+         self.bucket_name = "multimedika"
+         self.s3_client = boto3.client(
+             "s3",
+             aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
+             aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
+             region_name="us-west-2",
+         )
+
+     def upload_to_s3(self, file, object_name, folder_name="summarizer"):
+         try:
+             # If folder_name is provided, prepend it to the object_name
+             if folder_name:
+                 object_name = f"{folder_name}/{object_name}"
+
+             # Create an in-memory file-like object
+             with BytesIO() as file_stream:
+                 # Write the contents of the uploaded file to the stream
+                 file_stream.write(file.file.read())
+                 file_stream.seek(0)  # Move to the beginning of the stream
+
+                 # Upload the file to S3
+                 self.s3_client.upload_fileobj(file_stream, self.bucket_name, object_name)
+
+             print(f"File '{object_name}' successfully uploaded to bucket '{self.bucket_name}'.")
+         except Exception as e:
+             raise HTTPException(status_code=400, detail=f"Error uploading to AWS: {e}")
+
+     def get_file_aws(self, object_name, local_file_name=None):
+         """Downloads a PDF file from S3 and reads it using PyMuPDF."""
+         if local_file_name is None:
+             local_file_name = "downloaded_pdf_file.pdf"  # Default file name
+
+         try:
+             # Create a temporary directory to store the file
+             temp_dir = tempfile.mkdtemp()
+             file_path = os.path.join(temp_dir, local_file_name)
+             # Download the file from S3
+             with open(file_path, "wb") as temp_file:
+                 self.s3_client.download_fileobj(
+                     self.bucket_name, object_name, temp_file
+                 )
+             # Open and read the PDF using PyMuPDF
+             doc = fitz.open(file_path)
+             # Example: Print the number of pages
+             print(f"Number of pages: {doc.page_count}")
+             # Do something with the PDF, like read text
+             for page in doc:
+                 print(page.get_text())
+             # Close the document
+             doc.close()
+             # Clean up the downloaded file if needed
+             os.remove(file_path)
+
+         except Exception as e:
+             raise HTTPException(status_code=400, detail=f"Error getting file from AWS: {e}")
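A hedged usage sketch for the Loader above inside a FastAPI route. The route path is illustrative, and it assumes the AWS credentials referenced by the class are present in the environment:

from fastapi import FastAPI, UploadFile
from service.aws_loader import Loader

app = FastAPI()
loader = Loader()


@app.post("/upload-pdf")  # hypothetical route
async def upload_pdf(file: UploadFile):
    # Store the raw PDF under summarizer/<filename> in the "multimedika" bucket.
    loader.upload_to_s3(file, file.filename)
    return {"status": "uploaded", "object": f"summarizer/{file.filename}"}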