diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..ab62dc3a19fdccd705dc3b868b181a2a1d69b7d8 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,29 @@ +FROM python:3.11-slim + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + gcc libglib2.0-0 libsm6 libxext6 libxrender-dev \ + git \ + && rm -rf /var/lib/apt/lists/* + +RUN useradd -m -u 1000 user +USER user +ENV PATH="/home/user/.local/bin:$PATH" +WORKDIR /app + +# 1. Copy requirements +COPY --chown=user ./requirements.txt requirements.txt + +# 2. Upgrade pip and install the main dependencies +RUN pip install --no-cache-dir --upgrade pip \ + && pip install --no-cache-dir --upgrade -r requirements.txt + +# # 3. Install the CPU-only torch build (required; install it first so the other packages pick it up) +# RUN pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu + +# # 4. Install langchain_docling (without pulling in torch again) +# RUN pip install --no-cache-dir langchain_docling + +# 5. Copy the source code into the image +COPY --chown=user . /app +CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"] diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..a1a3c79c7d4b32345010f09a06f8f49f626ca6ce --- /dev/null +++ b/app.py @@ -0,0 +1,13 @@ +from dotenv import load_dotenv + +load_dotenv(override=True) + +from src.apis.create_app import create_app, api_router +import uvicorn + + +app = create_app() + +app.include_router(api_router) +if __name__ == "__main__": + uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6412b64a542b0cc912dc3d5734b9aaa0a8d20ef --- /dev/null +++ b/requirements.txt @@ -0,0 +1,28 @@ +langgraph +langchain +langchain-community +python-dotenv +fastapi +motor +langchain-google-genai +langchain-openai +langchain-pinecone +PyMuPDF +pytz +uvicorn[standard] +python-multipart +langchain_experimental +duckduckgo-search +pydantic[email] +python_jose==3.3.0 +pillow +python-docx +langchain-text-splitters +PyJWT==2.8.0 +docx2txt +gitpython +tiktoken +google-genai +langchain-core +youtube-transcript-api +youtube-comment-downloader \ No newline at end of file diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..d305152f5a60910fad7b4c659e5ba0b86d822e32 Binary files /dev/null and b/src/.DS_Store differ diff --git a/src/agents/agent_transcript/__pycache__/flow.cpython-311.pyc b/src/agents/agent_transcript/__pycache__/flow.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de4ad8a35168e34d7274627b9905dfb14cf60254 Binary files /dev/null and b/src/agents/agent_transcript/__pycache__/flow.cpython-311.pyc differ diff --git a/src/agents/agent_transcript/__pycache__/func.cpython-311.pyc b/src/agents/agent_transcript/__pycache__/func.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..46db2d873356e353b9d2c150a3536394f98be119 Binary files /dev/null and b/src/agents/agent_transcript/__pycache__/func.cpython-311.pyc differ diff --git a/src/agents/agent_transcript/__pycache__/prompt.cpython-311.pyc b/src/agents/agent_transcript/__pycache__/prompt.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f079df2e1ca7d7e4f838eae9ac3b42bcf0a87a4 Binary files /dev/null and b/src/agents/agent_transcript/__pycache__/prompt.cpython-311.pyc differ diff --git
a/src/agents/agent_transcript/__pycache__/tools.cpython-311.pyc b/src/agents/agent_transcript/__pycache__/tools.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d00f93e4184e2910ec7081a3964970f3d1d9934d Binary files /dev/null and b/src/agents/agent_transcript/__pycache__/tools.cpython-311.pyc differ diff --git a/src/agents/agent_transcript/flow.py b/src/agents/agent_transcript/flow.py new file mode 100644 index 0000000000000000000000000000000000000000..26c55e73874559054ba8ccce56101d20ae51c640 --- /dev/null +++ b/src/agents/agent_transcript/flow.py @@ -0,0 +1,74 @@ +from langgraph.graph import StateGraph, START, END +from .func import ( + State, + trim_history, + extract_transcript_and_comment, + script_structure_analyzer, + comment_insight_extractor, + scientific_fact_finder, + script_re_outline, + script_writer_init, + script_writer_single, + should_continue_writing, + script_writer_end, +) +from langgraph.graph.state import CompiledStateGraph + + +class AgentTranscript: + def __init__(self): + self.builder = StateGraph(State) + + def node(self): + self.builder.add_node("trim_history", trim_history) + self.builder.add_node( + "extract_transcript_and_comment", extract_transcript_and_comment + ) + self.builder.add_node("script_structure_analyzer", script_structure_analyzer) + self.builder.add_node("comment_insight_extractor", comment_insight_extractor) + self.builder.add_node("scientific_fact_finder", scientific_fact_finder) + self.builder.add_node("script_re_outline", script_re_outline) + self.builder.add_node("script_writer_init", script_writer_init) + self.builder.add_node("script_writer_single", script_writer_single) + self.builder.add_node("script_writer_end", script_writer_end) + + def edge(self): + self.builder.add_edge(START, "trim_history") + self.builder.add_edge("trim_history", "extract_transcript_and_comment") + self.builder.add_edge( + "extract_transcript_and_comment", "script_structure_analyzer" + ) + self.builder.add_edge("script_structure_analyzer", "comment_insight_extractor") + self.builder.add_edge("comment_insight_extractor", "scientific_fact_finder") + self.builder.add_edge("scientific_fact_finder", "script_re_outline") + self.builder.add_edge("script_re_outline", "script_writer_init") + + # Conditional routing for script writing + self.builder.add_conditional_edges( + "script_writer_init", + should_continue_writing, + { + "script_writer_single": "script_writer_single", + "script_writer_end": "script_writer_end" + } + ) + + self.builder.add_conditional_edges( + "script_writer_single", + should_continue_writing, + { + "script_writer_single": "script_writer_single", + "script_writer_end": "script_writer_end" + } + ) + + self.builder.add_edge("script_writer_end", END) + + def __call__(self) -> CompiledStateGraph: + self.node() + self.edge() + + return self.builder.compile() + + +script_writer_agent = AgentTranscript()() diff --git a/src/agents/agent_transcript/func.py b/src/agents/agent_transcript/func.py new file mode 100644 index 0000000000000000000000000000000000000000..fde40f09295d280d1c33d7c7454103f9ee016e54 --- /dev/null +++ b/src/agents/agent_transcript/func.py @@ -0,0 +1,176 @@ +from typing import TypedDict, List +from langchain_core.messages import AnyMessage, HumanMessage, AIMessage +from langgraph.graph.message import add_messages +from typing import Sequence, Annotated +from langchain_core.messages import RemoveMessage
+from src.utils.helper import extract_transcript, extract_comment +from .prompt import * + + +class State(TypedDict): + video_link: str + messages: Annotated[Sequence[AnyMessage], add_messages] + transcript: str + comment: str + script_structure_analyzer_response: str + comment_insight_extractor_response: str + research_insight_response: str + script_re_outline_response: str + script_writer_response: List[str] + target_word_count: int + script_count: int + current_script_index: int + + +def trim_history(state: State): + history = state.get("messages", []) + + if len(history) > 20: + num_to_remove = len(history) - 20 + remove_messages = [ + RemoveMessage(id=history[i].id) for i in range(num_to_remove) + ] + return { + "messages": remove_messages, + } + + return {} + + +def extract_transcript_and_comment(state: State): + transcript = extract_transcript(state["video_link"]) + comment = extract_comment(state["video_link"]) + + # Calculate script count based on target word count + # Assume each script is around 1,000 words + avg_words_per_script = 1000 + script_count = max(1, state.get("target_word_count", 8000) // avg_words_per_script) + + return { + "transcript": transcript, + "comment": comment, + "script_count": script_count, + "messages": HumanMessage( + content=f"Will generate {script_count} scripts for a target of {state.get('target_word_count', 8000)} words" + ), + } + + +def script_structure_analyzer(state: State): + transcript = state["transcript"] + response = chain_script_structure_analyzer.invoke({"script": transcript}) + return { + "script_structure_analyzer_response": response.content, + "messages": HumanMessage( + content="Script Structure Analyzer Response: " + response.content + ), + } + + +def comment_insight_extractor(state: State): + response = chain_comment_insight_extractor.invoke( + { + "comment": state["comment"], + "script_structure_analyzer_response": state[ + "script_structure_analyzer_response" + ], + } + ) + return { + "comment_insight_extractor_response": response.content, + "messages": HumanMessage( + content="Comment Insight Extractor Response: " + response.content + ), + } + + +def scientific_fact_finder(state: State): + input_message = {} + input_message["messages"] = [ + { + "role": "user", + "content": f"""Hãy tìm 3-5 nghiên cứu khoa học thực tế (PubMed, JAMA, Circulation, Nutrients…), +Tóm tắt số liệu, trích nguồn, gợi ý số liệu phù hợp cho từng đoạn trong script mới.
Dựa trên các thông tin sau: +Script Structure Analyzer Response: {state["script_structure_analyzer_response"]} +Comment Insight Extractor Response: {state["comment_insight_extractor_response"]} +""", + } + ] + response = scientific_fact_agent.invoke(input_message) + research_insight = response["messages"][-1].content + return { + "research_insight_response": research_insight, + "messages": HumanMessage( + content="Scientific Fact Finder Response: " + research_insight + ), + } + + +def script_re_outline(state: State): + response = chain_script_re_outline.invoke({"messages": state["messages"]}) + return { + "script_re_outline_response": response.content, + "messages": HumanMessage( + content="Script Re-Outline Response: " + response.content + ), + } + + +def script_writer_init(state: State): + """Initialize script writing process""" + return { + "script_writer_response": [], + "current_script_index": 0, + "messages": HumanMessage(content="Starting script generation process..."), + } + + +def script_writer_single(state: State): + """Generate a single script""" + current_index = state.get("current_script_index", 0) + script_count = state.get("script_count", 10) + target_word_count = state.get("target_word_count", 8000) + words_per_script = target_word_count // script_count if script_count > 0 else 1000 + + # Get existing scripts + script_out = list(state.get("script_writer_response", [])) + current_messages = list(state["messages"]) + + # Add word count guidance to the prompt + if current_index == 0: + word_prompt = f"Hãy viết script đầu tiên với khoảng {words_per_script} từ." + else: + word_prompt = f"ok, viết cho tôi phần tiếp theo, bám sát cấu trúc, khoảng {words_per_script} từ cho script này, các công thức tạo cảm xúc và đừng quên đối tượng khán giả là người Mỹ, giới tính nữ, trên 20 tuổi, bắt đầu, trình bày thành dạng câu văn liền mạch, dùng để làm văn nói cho video YouTube, không dùng icon" + + current_messages.append(HumanMessage(content=word_prompt)) + + # Generate script + response = chain_script_writer.invoke({"messages": current_messages}) + script_out.append(response.content) + + # Add response to message history + current_messages.append(AIMessage(content=response.content)) + + return { + "script_writer_response": script_out, + "current_script_index": current_index + 1, + "messages": current_messages + + [ + HumanMessage(content=f"Script {current_index + 1}/{script_count} completed") + ], + } + + +def should_continue_writing(state: State): + """Check if we should continue writing more scripts""" + current_index = state.get("current_script_index", 0) + script_count = state.get("script_count", 10) + return ( + "script_writer_single" if current_index < script_count else "script_writer_end" + ) + + +def script_writer_end(state: State): + """Finalize script writing""" + script_count = len(state.get("script_writer_response", [])) + return {"messages": HumanMessage(content=f"All {script_count} scripts completed!")} diff --git a/src/agents/agent_transcript/prompt.py b/src/agents/agent_transcript/prompt.py new file mode 100644 index 0000000000000000000000000000000000000000..65daa6e802abd21f67f3d3fede1c454db8fa52f2 --- /dev/null +++ b/src/agents/agent_transcript/prompt.py @@ -0,0 +1,181 @@ +from langchain_core.prompts import ChatPromptTemplate +from src.config.llm import llm_2_0 as llm, llm_2_5_flash_preview +from langchain_community.tools import DuckDuckGoSearchResults +from langgraph.prebuilt import create_react_agent + +duckduckgo_search =
DuckDuckGoSearchResults(max_results=10, output_format="json") + +script_structure_analyzer_prompt = ChatPromptTemplate.from_messages( + [ + ( + "system", + """ +Vai trò: Bạn là Script Structure Analyzer trong một workflow của một nhóm các agent. +Instruction: +- Tự động phân tích kịch bản gốc, tách các phần (Mở bài, Thân bài, Điểm chốt, CTA) +- Xác định công thức cấu trúc (AIDA, PAS, BFB,...) +- Trích xuất hook, câu chuyển đoạn, CTA +- Phát hiện điểm mạnh/yếu/chỗ lạc nhịp + +Input: Script gốc +Output: +- Outline: + - Mở bài + - Thân bài + - Điểm chốt + - CTA +- Công thức cấu trúc + - AIDA: Attention, Interest, Desire, Action + - PAS: Problem, Agitation, Solution + - BFB: Belief, Feeling, Behavior +- Hook +- Câu chuyển đoạn +- CTA +- Điểm mạnh/yếu/chỗ lạc nhịp +""", + ), + ("user", "input script: {script}"), + ] +) + +comment_insight_extractor_prompt = ChatPromptTemplate.from_messages( + [ + ( + "system", + """ +Vai trò: Bạn là Comment Insight Extractor trong một workflow của một nhóm các agent phân tích youtube video. +Instruction: +- Đọc, phân tích tất cả comment, trích xuất insight +- lọc ra các câu hỏi lặp lại, nỗi sợ/mong muốn/lợi ích/ngôn ngữ quen thuộc +- So sánh insight với script gốc và xác định thiếu sót. + +Input: +- Output từ Script Structure Analyzer Agent Youtube Video +- Comment + +Output: +- Insights Table: + - Insight + - Original Comment + - Pain or Benefit + - Suggest for Script +- Missing From Script +- Repeated Questions +- Audience Language + +""", + ), + ("user", "input comment: {comment}"), + ( + "user", + "input script_structure_analyzer_response: {script_structure_analyzer_response}", + ), + ] +) +scientific_fact_finder_prompt = ChatPromptTemplate.from_messages( + [ + ( + "system", + """ +Vai trò: Bạn là Scientific Fact Finder trong một workflow của một nhóm các agent phân tích youtube video. +Instruction: +- Tự động research 3-5 nghiên cứu khoa học thực tế (PubMed, JAMA, Circulation, Nutrients…), tóm tắt số liệu, trích nguồn, gợi ý số liệu phù hợp cho từng đoạn trong script mới. +- So sánh fact science với script gốc và xác định thiếu sót. + +Input: +- Output từ Script Structure Analyzer Agent Youtube Video +- Output từ Comment Insight Extractor Agent Youtube Video + +Output List: +- Title: Tên nghiên cứu +- Summary: Tóm tắt nghiên cứu +- Source: Nguồn nghiên cứu +- Relevant for Section: Relevant cho section nào trong script mới +""", + ), + ("placeholder", "{messages}"), + ] +) + +script_re_outline_prompt = ChatPromptTemplate.from_messages( + [ + ( + "system", + """ +Vai trò: Bạn là Script Re-Outline Agent trong một workflow của một nhóm các agent. +Instruction: +Kết hợp outline cũ, insight từ comment, fact từ research để lập outline mới: Hook mới, thứ tự section mới, CTA mới, các ý chuyển mạch rõ ràng, phân bổ fact/nghiên cứu vào các section. + +Input: +- Output từ Script Structure Analyzer Agent +- Output từ Comment Insight Extractor Agent +- Output từ Scientific Fact Finder Agent + +Output: + +- Outline mới: (Section, summary, suggested length, facts to include) + - Hook mở bài + - Thân bài 1 + - Thân bài 2 + - Điểm chốt + - CTA +- CTA position +- Transitions +- Order Logic +""", + ), + ("placeholder", "{messages}"), + ] +) + +script_writer_prompt = ChatPromptTemplate.from_messages( + [ + ( + "system", + """ +Vai trò: Bạn là Script Writer dựa trên các nội dung, insight được cung cấp. 
+Instruction: +- Viết lại từng phần dựa theo outline mới, dữ liệu nghiên cứu, insight comment, giữ văn liền mạch - cảm xúc - kể chuyện, format cho video YouTube (dạng văn nói, không dùng icon, chỉ text). +- Viết theo hội thoại chỉ có một người nói, không có người khác. + +Input: +- Output từ Script Re-Outline Agent (Important) +- Output từ Scientific Fact Finder Agent +- Output từ Comment Insight Extractor Agent + +Processing: +- Sau khi viết 1 phần, ngừng ngay. +- Output phải liền mạch, không có gạch đầu dòng. +- Tone giọng thân thiện, kể chuyện, truyền cảm xúc, không dùng icon, chỉ dùng text. +- Cài hook cảm xúc, ví dụ thực tế +- Kể mở ra CTA hoặc dẫn sang phần tiếp theo. +- Có câu hỏi tu từ nhẹ nhàng +- Nhắc lại lợi ích quan trọng +- So sánh "thay vì... thì..." để khán giả thấy rõ "why" +- Không dùng icon, emoji +- Kết thúc phải là kết thúc mở để người dùng có thể yêu cầu viết tiếp thay vì kết thúc sau khi hoàn thành đủ hook, thân bài, điểm chốt, CTA. +Output: +- Title: Tên của phần nội dung +- Content: Script content + +Lưu ý: Chỉ gen ra một phần nội dung. +- Nếu user nhập 'ok, viết cho tôi phần tiếp theo, bám sát cấu trúc, số lượng từ cho mỗi mục trong outline, các công thức tạo cảm xúc và đừng quên đối tượng khán giả là người Mỹ, giới tính nữ, trên 20 tuổi, bắt đầu, trình bày thành dạng câu văn liền mạch, dùng để làm văn nói cho video YouTube, không dùng icon' thì tiếp tục viết tiếp. + +""", + ), + ("placeholder", "{messages}"), + ] +) + + +chain_script_structure_analyzer = script_structure_analyzer_prompt | llm +chain_comment_insight_extractor = comment_insight_extractor_prompt | llm +scientific_fact_agent = create_react_agent( + model=llm, + tools=[duckduckgo_search], + prompt=scientific_fact_finder_prompt, +) + + +chain_script_re_outline = script_re_outline_prompt | llm +chain_script_writer = script_writer_prompt | llm_2_5_flash_preview diff --git a/src/agents/agent_transcript/tools.py b/src/agents/agent_transcript/tools.py new file mode 100644 index 0000000000000000000000000000000000000000..c87186f5437547e0e9a0e657339caad01b30012a --- /dev/null +++ b/src/agents/agent_transcript/tools.py @@ -0,0 +1,54 @@ +# from langchain_core.tools import tool +# from src.utils.helper import convert_list_context_source_to_str +# from src.utils.logger import logger +# from langchain_core.runnables import RunnableConfig +# from langchain_experimental.utilities import PythonREPL +# from langchain_community.tools import DuckDuckGoSearchRun + + +# duckduckgo_search = DuckDuckGoSearchRun(max_results=10, output_format="json") + +# python_exec = PythonREPL() + + +# @tool +# def retrieve_document(query: str, config: RunnableConfig): +# """Prefer retrieving documents from the vector store when the question relates to the chatbot's role.
+ +# Args: +# query (str): The user's query, in Vietnamese +# Returns: +# str: Retrieved documents +# """ +# configuration = config.get("configurable", {}) +# bot_id = configuration.get("bot_id", None) +# if not bot_id: +# logger.error("Bot ID is not found") +# return {"context_str": "", "selected_documents": [], "selected_ids": []} +# retriever = test_rag_vector_store.as_retriever( +# search_type="similarity_score_threshold", +# search_kwargs={"k": 5, "score_threshold": 0.3}, +# ) +# documents = retriever.invoke(query, filter={"bot_id": bot_id}) +# selected_documents = [doc.__dict__ for doc in documents] +# selected_ids = [doc["id"] for doc in selected_documents] +# context_str = convert_list_context_source_to_str(documents) + +# return { +# "context_str": context_str, +# "selected_documents": selected_documents, +# "selected_ids": selected_ids, +# } + + +# @tool +# def python_repl(code: str): +# """ +# A Python shell. Use this to execute python commands. Input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`. + +# Args: +# code (str): Python code to execute +# Returns: +# str: Output of the Python code +# """ +# return python_exec.run(code) diff --git a/src/agents/base/flow.py b/src/agents/base/flow.py new file mode 100644 index 0000000000000000000000000000000000000000..98470dd664cfcf086a21063ab8c8491763b0cad2 --- /dev/null +++ b/src/agents/base/flow.py @@ -0,0 +1,23 @@ +from langgraph.graph import StateGraph, START, END +from .func import State +from langgraph.graph.state import CompiledStateGraph + + +class PrimaryChatBot: + def __init__(self): + self.builder = StateGraph(State) + + @staticmethod + def routing(state: State): + pass + + def node(self): + pass + + def edge(self): + pass + + def __call__(self) -> CompiledStateGraph: + self.node() + self.edge() + return self.builder.compile() diff --git a/src/agents/base/func.py b/src/agents/base/func.py new file mode 100644 index 0000000000000000000000000000000000000000..c1a8c978737a1c7fb2919b7c3fd9581a1234d5fe --- /dev/null +++ b/src/agents/base/func.py @@ -0,0 +1,4 @@ +from typing import TypedDict + +class State(TypedDict): + pass \ No newline at end of file diff --git a/src/apis/.DS_Store b/src/apis/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..f7dad2034a4ac252c5fc74ffd764625d53097111 Binary files /dev/null and b/src/apis/.DS_Store differ diff --git a/src/apis/__pycache__/create_app.cpython-311.pyc b/src/apis/__pycache__/create_app.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b70e659a1dd6d9e095bfa4904192600d6fb8751 Binary files /dev/null and b/src/apis/__pycache__/create_app.cpython-311.pyc differ diff --git a/src/apis/controllers/__pycache__/category_controller.cpython-311.pyc b/src/apis/controllers/__pycache__/category_controller.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7bc3f3b32debbfc55207e45c743c3ab348a1bd32 Binary files /dev/null and b/src/apis/controllers/__pycache__/category_controller.cpython-311.pyc differ diff --git a/src/apis/controllers/__pycache__/order_controller.cpython-311.pyc b/src/apis/controllers/__pycache__/order_controller.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d2092e9b27737a12b77ce1d3c7be0e0820a173b Binary files /dev/null and b/src/apis/controllers/__pycache__/order_controller.cpython-311.pyc differ diff --git
a/src/apis/controllers/__pycache__/service_controller.cpython-311.pyc b/src/apis/controllers/__pycache__/service_controller.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f9a625bf6799b9d3d4fb65b72082fb0e1da6ffe Binary files /dev/null and b/src/apis/controllers/__pycache__/service_controller.cpython-311.pyc differ diff --git a/src/apis/controllers/__pycache__/user_controller.cpython-311.pyc b/src/apis/controllers/__pycache__/user_controller.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4010402ac8f2001997c11b53eacb0e336e920c13 Binary files /dev/null and b/src/apis/controllers/__pycache__/user_controller.cpython-311.pyc differ diff --git a/src/apis/controllers/__pycache__/user_service_controller.cpython-311.pyc b/src/apis/controllers/__pycache__/user_service_controller.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b7402ab0a4b640e5e7d4925d6b4fb769408fe3b Binary files /dev/null and b/src/apis/controllers/__pycache__/user_service_controller.cpython-311.pyc differ diff --git a/src/apis/create_app.py b/src/apis/create_app.py new file mode 100644 index 0000000000000000000000000000000000000000..cacae0e8ab528a9d0667b6e57435b11777c2122f --- /dev/null +++ b/src/apis/create_app.py @@ -0,0 +1,23 @@ +from fastapi import FastAPI, APIRouter +from fastapi.middleware.cors import CORSMiddleware +from src.apis.routers.gen_script import router as gen_script_router + + +api_router = APIRouter() +api_router.include_router(gen_script_router) + + +def create_app(): + app = FastAPI( + docs_url="/", + title="AI Service ABAOXOMTIEU", + ) + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + + return app diff --git a/src/apis/interfaces/__pycache__/auth_interface.cpython-311.pyc b/src/apis/interfaces/__pycache__/auth_interface.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a40c0beef38148bfb5fdb41e352e09e92467af0 Binary files /dev/null and b/src/apis/interfaces/__pycache__/auth_interface.cpython-311.pyc differ diff --git a/src/apis/interfaces/__pycache__/chat_interface.cpython-311.pyc b/src/apis/interfaces/__pycache__/chat_interface.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b73453f226c2cbd730bedb819e346ee299de05ce Binary files /dev/null and b/src/apis/interfaces/__pycache__/chat_interface.cpython-311.pyc differ diff --git a/src/apis/interfaces/__pycache__/file_processing_interface.cpython-311.pyc b/src/apis/interfaces/__pycache__/file_processing_interface.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e13f71374f83a293f4f5eac99c5c35d50ff7a74b Binary files /dev/null and b/src/apis/interfaces/__pycache__/file_processing_interface.cpython-311.pyc differ diff --git a/src/apis/middlewares/__pycache__/auth_middleware.cpython-311.pyc b/src/apis/middlewares/__pycache__/auth_middleware.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a97b7e2a5d7ea7a6636d8b17757d091e1043d42c Binary files /dev/null and b/src/apis/middlewares/__pycache__/auth_middleware.cpython-311.pyc differ diff --git a/src/apis/middlewares/auth_middleware.py b/src/apis/middlewares/auth_middleware.py new file mode 100644 index 0000000000000000000000000000000000000000..766145c717dcf1586c7dabe4727a25874317607e --- /dev/null +++ b/src/apis/middlewares/auth_middleware.py @@ -0,0 +1,40 @@ +from typing import 
Annotated +from fastapi import Depends +from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer +from fastapi.responses import JSONResponse +from src.apis.providers.jwt_provider import jwt_provider as jwt +from src.apis.models.user_models import get_user +from src.config.mongo import UserCRUD +from bson import ObjectId +from jose import JWTError +from src.utils.logger import logger + +security = HTTPBearer() + + +async def get_current_user( + credentials: Annotated[HTTPAuthorizationCredentials, Depends(security)] +): + + try: + token = credentials.credentials + if not token: + return JSONResponse( + content={"msg": "Authentication failed"}, status_code=401 + ) + payload = jwt.decrypt(token) + user_id: str = payload.get("id") + if not user_id: + return JSONResponse( + content={"msg": "Authentication failed"}, status_code=401 + ) + user = await UserCRUD.read_one({"_id": ObjectId(user_id)}) + if not user: + return JSONResponse( + content={"msg": "Authentication failed"}, status_code=401 + ) + user_email = user.get("email", None) + logger.info(f"Request of user: {user_email}") + return get_user(user) + except JWTError: + return JSONResponse(content={"msg": "Authentication failed"}, status_code=401) diff --git a/src/apis/models/BaseDocument.py b/src/apis/models/BaseDocument.py new file mode 100644 index 0000000000000000000000000000000000000000..80e7202d050fbf8f7c3a035f6f729acf15f08091 --- /dev/null +++ b/src/apis/models/BaseDocument.py @@ -0,0 +1,17 @@ +from pydantic import BaseModel, Field +from typing import Optional +from datetime import datetime +from src.utils.logger import get_date_time + + +class BaseDocument(BaseModel): + created_at: Optional[datetime] = Field( + default_factory=lambda: get_date_time().replace(tzinfo=None) + ) + updated_at: Optional[datetime] = Field( + default_factory=lambda: get_date_time().replace(tzinfo=None) + ) + expire_at: Optional[datetime] = None + + class Config: + arbitrary_types_allowed = True diff --git a/src/apis/models/__pycache__/BaseDocument.cpython-311.pyc b/src/apis/models/__pycache__/BaseDocument.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc1c20c4a37bd87e4628fb864898ace367b7efd6 Binary files /dev/null and b/src/apis/models/__pycache__/BaseDocument.cpython-311.pyc differ diff --git a/src/apis/models/__pycache__/bot_models.cpython-311.pyc b/src/apis/models/__pycache__/bot_models.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77834d7c9d738b8be37e43462511329afea50d87 Binary files /dev/null and b/src/apis/models/__pycache__/bot_models.cpython-311.pyc differ diff --git a/src/apis/models/__pycache__/category_models.cpython-311.pyc b/src/apis/models/__pycache__/category_models.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f61512a965d12fdf668d82c189f0795a466a6d3b Binary files /dev/null and b/src/apis/models/__pycache__/category_models.cpython-311.pyc differ diff --git a/src/apis/models/__pycache__/grade_models.cpython-311.pyc b/src/apis/models/__pycache__/grade_models.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90c81bdcdaf18f5afec737ef7fdb13de8fe3685b Binary files /dev/null and b/src/apis/models/__pycache__/grade_models.cpython-311.pyc differ diff --git a/src/apis/models/__pycache__/order_models.cpython-311.pyc b/src/apis/models/__pycache__/order_models.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ed10afbe63d723c2190ed7b0b57990b3f83f036 Binary files
/dev/null and b/src/apis/models/__pycache__/order_models.cpython-311.pyc differ diff --git a/src/apis/models/__pycache__/service_model.cpython-311.pyc b/src/apis/models/__pycache__/service_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d3fad387ec3a4c73f09e4c3782ea1cb93832cec Binary files /dev/null and b/src/apis/models/__pycache__/service_model.cpython-311.pyc differ diff --git a/src/apis/models/__pycache__/service_provide.cpython-311.pyc b/src/apis/models/__pycache__/service_provide.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0d33fed2eb856c0cf7e8e98428b227d277d4247 Binary files /dev/null and b/src/apis/models/__pycache__/service_provide.cpython-311.pyc differ diff --git a/src/apis/models/__pycache__/user_models.cpython-311.pyc b/src/apis/models/__pycache__/user_models.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73bb118c42705cf91a7f22bfa4c34cb2f87673e6 Binary files /dev/null and b/src/apis/models/__pycache__/user_models.cpython-311.pyc differ diff --git a/src/apis/providers/__pycache__/jwt_provider.cpython-311.pyc b/src/apis/providers/__pycache__/jwt_provider.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2861658c080e5b602b7891d48c80f95e29ced0cb Binary files /dev/null and b/src/apis/providers/__pycache__/jwt_provider.cpython-311.pyc differ diff --git a/src/apis/routers/__pycache__/api_testing_router.cpython-311.pyc b/src/apis/routers/__pycache__/api_testing_router.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bac175fe7aa51851a9b18d7ef10742037f70c841 Binary files /dev/null and b/src/apis/routers/__pycache__/api_testing_router.cpython-311.pyc differ diff --git a/src/apis/routers/__pycache__/auth_router.cpython-311.pyc b/src/apis/routers/__pycache__/auth_router.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34d9d9f2a1c98cae46a3652c2acb9e9191b4fe81 Binary files /dev/null and b/src/apis/routers/__pycache__/auth_router.cpython-311.pyc differ diff --git a/src/apis/routers/__pycache__/custom_chatbot_router.cpython-311.pyc b/src/apis/routers/__pycache__/custom_chatbot_router.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6d6242df1d30e5b75f25c8e744a844f066f1cda Binary files /dev/null and b/src/apis/routers/__pycache__/custom_chatbot_router.cpython-311.pyc differ diff --git a/src/apis/routers/__pycache__/file_processing_router.cpython-311.pyc b/src/apis/routers/__pycache__/file_processing_router.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c707ca8627748aa446411068574717fb7be9c11 Binary files /dev/null and b/src/apis/routers/__pycache__/file_processing_router.cpython-311.pyc differ diff --git a/src/apis/routers/__pycache__/gen_script.cpython-311.pyc b/src/apis/routers/__pycache__/gen_script.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3001832a5e0585e4dc5639537e8b817b2612971 Binary files /dev/null and b/src/apis/routers/__pycache__/gen_script.cpython-311.pyc differ diff --git a/src/apis/routers/__pycache__/grade_code_router.cpython-311.pyc b/src/apis/routers/__pycache__/grade_code_router.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f51ac652bed2d4546132db9352659885f87dc040 Binary files /dev/null and b/src/apis/routers/__pycache__/grade_code_router.cpython-311.pyc differ diff --git 
a/src/apis/routers/__pycache__/graded_assignment_router.cpython-311.pyc b/src/apis/routers/__pycache__/graded_assignment_router.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7de017d54a4bf2cd06082e11f5b6a3fdb50c8d4 Binary files /dev/null and b/src/apis/routers/__pycache__/graded_assignment_router.cpython-311.pyc differ diff --git a/src/apis/routers/__pycache__/image_generation.cpython-311.pyc b/src/apis/routers/__pycache__/image_generation.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bcb759b9f5df33914569dcaac42ce694391139a0 Binary files /dev/null and b/src/apis/routers/__pycache__/image_generation.cpython-311.pyc differ diff --git a/src/apis/routers/__pycache__/rag_agent_template.cpython-311.pyc b/src/apis/routers/__pycache__/rag_agent_template.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..256dcb54c420fec4f4c3e01ab08f0d8ec5ea5529 Binary files /dev/null and b/src/apis/routers/__pycache__/rag_agent_template.cpython-311.pyc differ diff --git a/src/apis/routers/__pycache__/vector_store_router.cpython-311.pyc b/src/apis/routers/__pycache__/vector_store_router.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..728a1c693d3caa7537cbab24d35b7653c0b9a26c Binary files /dev/null and b/src/apis/routers/__pycache__/vector_store_router.cpython-311.pyc differ diff --git a/src/apis/routers/gen_script.py b/src/apis/routers/gen_script.py new file mode 100644 index 0000000000000000000000000000000000000000..9882ba1cfe4273a06e160ba2fe112e6dc7dca656 --- /dev/null +++ b/src/apis/routers/gen_script.py @@ -0,0 +1,156 @@ +from fastapi import APIRouter +from fastapi.responses import StreamingResponse +from langchain_core.messages import AIMessageChunk +from langchain_core.runnables import RunnableConfig +from src.agents.agent_transcript.flow import script_writer_agent +from src.utils.logger import logger +from pydantic import BaseModel +import json + + +class GenScriptRequest(BaseModel): + video_link: str + target_word_count: int = 50000 # Default: 50,000 words + + +router = APIRouter() + + +async def message_generator( + input_graph: dict, + config: RunnableConfig, +): + try: + last_output_state = None + + try: + async for event in script_writer_agent.astream( + input=input_graph, stream_mode=["messages", "values"], config=config + ): + try: + event_type, event_message = event + logger.info(f"Event type: {event_type}") + + if event_type == "messages": + message, metadata = event_message + if isinstance(message, AIMessageChunk): + # Stream AI message chunks + node = metadata.get("langgraph_node") + chunk_data = { + "type": "message_chunk", + "content": message.content, + "metadata": metadata, + "node_step": node, + } + logger.info(f"Chunk data: {chunk_data}") + yield f"data: {json.dumps(chunk_data, default=str)}\n\n" + + elif event_type == "values": + # Stream state updates + state_data = {"type": "state_update", "state": event_message} + last_output_state = event_message + + # Handle specific data extractions + if "transcript" in event_message and event_message["transcript"]: + transcript_data = { + "type": "transcript_extracted", + "transcript": event_message["transcript"][:500] + "..."
if len(event_message["transcript"]) > 500 else event_message["transcript"], + "full_length": len(event_message["transcript"]) + } + yield f"data: {json.dumps(transcript_data)}\n\n" + + if "comment" in event_message and event_message["comment"]: + comment_data = { + "type": "comment_extracted", + "comment": event_message["comment"][:500] + "..." if len(event_message["comment"]) > 500 else event_message["comment"], + "full_length": len(event_message["comment"]) + } + yield f"data: {json.dumps(comment_data)}\n\n" + + if "script_count" in event_message: + script_count_data = { + "type": "script_count_calculated", + "script_count": event_message["script_count"], + "target_word_count": event_message.get("target_word_count", 8000) + } + yield f"data: {json.dumps(script_count_data)}\n\n" + + # Handle individual script updates + if "script_writer_response" in event_message and "current_script_index" in event_message: + current_scripts = event_message["script_writer_response"] + current_index = event_message["current_script_index"] + script_count = event_message.get("script_count", 10) + + if current_scripts: + individual_script_data = { + "type": "individual_script", + "script_index": current_index, + "script_content": current_scripts[-1] if current_scripts else "", + "progress": f"{current_index}/{script_count}", + "scripts": current_scripts + } + yield f"data: {json.dumps(individual_script_data)}\n\n" + + yield f"data: {json.dumps(state_data, default=str)}\n\n" + + except Exception as e: + logger.error(f"Error processing event: {e}") + error_data = {"type": "error", "message": str(e)} + yield f"data: {json.dumps(error_data)}\n\n" + + except Exception as e: + logger.error(f"Error in streaming: {e}") + error_data = {"type": "error", "message": str(e)} + yield f"data: {json.dumps(error_data)}\n\n" + + # Send final result + if last_output_state: + final_data = { + "type": "final_result", + "scripts": last_output_state.get("script_writer_response", []), + "total_scripts": len( + last_output_state.get("script_writer_response", []) + ), + } + yield f"data: {json.dumps(final_data, default=str)}\n\n" + + except Exception as e: + logger.error(f"Fatal error in message_generator: {e}") + yield f"data: {json.dumps({'type': 'fatal_error', 'message': str(e)})}\n\n" + + +@router.post("/gen-script") +async def gen_script(request: GenScriptRequest): + """ + Generate scripts with streaming response + """ + config = RunnableConfig() + input_graph = { + "video_link": request.video_link, + "target_word_count": request.target_word_count + } + + return StreamingResponse( + message_generator(input_graph, config), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + }, + ) + + +@router.post("/gen-script-sync") +def gen_script_sync(request: GenScriptRequest): + """ + Generate scripts with synchronous response (non-streaming) + """ + response = script_writer_agent.invoke({ + "video_link": request.video_link, + "target_word_count": request.target_word_count + }) + return { + "scripts": response.get("script_writer_response", []), + "total_scripts": len(response.get("script_writer_response", [])), + "full_response": response, + } diff --git a/src/config/__pycache__/cloudinary.cpython-311.pyc b/src/config/__pycache__/cloudinary.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef70cdcac66d4993e775bb7b9ed7ca3ac41cad44 Binary files /dev/null and b/src/config/__pycache__/cloudinary.cpython-311.pyc differ diff
--git a/src/config/__pycache__/constants.cpython-311.pyc b/src/config/__pycache__/constants.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..458675012121d79f27d30258feb4912b655fd7f7 Binary files /dev/null and b/src/config/__pycache__/constants.cpython-311.pyc differ diff --git a/src/config/__pycache__/direct_chain.cpython-311.pyc b/src/config/__pycache__/direct_chain.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53cd67b7f83224a57d144a6840009ff08bfe9f32 Binary files /dev/null and b/src/config/__pycache__/direct_chain.cpython-311.pyc differ diff --git a/src/config/__pycache__/llm.cpython-311.pyc b/src/config/__pycache__/llm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1af5a88a3a033dd76d538a820be1830397843d0 Binary files /dev/null and b/src/config/__pycache__/llm.cpython-311.pyc differ diff --git a/src/config/__pycache__/mongo.cpython-311.pyc b/src/config/__pycache__/mongo.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6789abd3f5bc120672a427353042ddb243254736 Binary files /dev/null and b/src/config/__pycache__/mongo.cpython-311.pyc differ diff --git a/src/config/__pycache__/vector_store.cpython-311.pyc b/src/config/__pycache__/vector_store.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..147c1ffea35a694d713de19dabcb2bb7b6b5c2ca Binary files /dev/null and b/src/config/__pycache__/vector_store.cpython-311.pyc differ diff --git a/src/config/llm.py b/src/config/llm.py new file mode 100644 index 0000000000000000000000000000000000000000..09d6a50ce53611a33a27866a2a70938f30f28432 --- /dev/null +++ b/src/config/llm.py @@ -0,0 +1,67 @@ +from langchain_google_genai import ChatGoogleGenerativeAI +from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings +from src.utils.logger import logger +from langchain_core.language_models.chat_models import BaseChatModel + +# Default model instances +llm_2_0 = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=1) +llm_2_5_flash_preview = ChatGoogleGenerativeAI( + model="gemini-2.5-flash-preview-05-20", temperature=1, thinking_budget=None +) +llm_2_0_flash_lite = ChatGoogleGenerativeAI( + model="gemini-2.0-flash-lite", temperature=1 +) +# Default embeddings model +embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") + + +def get_llm( + model_name: str = "gemini-2.0-flash", + api_key: str = None, + include_thoughts: bool = False, + reasoning: bool = False, +) -> BaseChatModel: + """ + Get LLM instance based on model name and optional API key. 
+ + Args: + model_name: Name of the model to use + api_key: Optional API key for authentication + + Returns: + Configured ChatGoogleGenerativeAI instance + + Raises: + ValueError: If model name is not supported + """ + if api_key: + logger.warning("Using custom API key") + if model_name == "gemini-2.0-flash": + return ChatGoogleGenerativeAI( + model=model_name, + temperature=1, + google_api_key=api_key, + ) + elif model_name == "gemini-2.5-flash-preview-05-20": + return ChatGoogleGenerativeAI( + model=model_name, + temperature=1, + google_api_key=api_key, + include_thoughts=include_thoughts, + thinking_budget=None if reasoning else 0, + ) + elif model_name == "gemini-2.0-flash-lite": + return ChatGoogleGenerativeAI( + model=model_name, + temperature=1, + google_api_key=api_key, + ) + + if model_name == "gemini-2.0-flash": + return llm_2_0 + elif model_name == "gemini-2.5-flash-preview-05-20": + return llm_2_5_flash_preview + elif model_name == "gemini-2.0-flash-lite": + return llm_2_0_flash_lite + + raise ValueError(f"Unknown model: {model_name}") diff --git a/src/config/mongo.py b/src/config/mongo.py new file mode 100644 index 0000000000000000000000000000000000000000..e654bf7c55ab566c869e4780ac8d03670784c130 --- /dev/null +++ b/src/config/mongo.py @@ -0,0 +1,190 @@ +from src.utils.logger import logger +from motor.motor_asyncio import AsyncIOMotorClient +from pydantic import BaseModel +from typing import Type, Dict, List, Optional +from bson import ObjectId +from motor.motor_asyncio import AsyncIOMotorCollection +from datetime import datetime, timezone, timedelta +from src.utils.logger import get_date_time +import os + +client: AsyncIOMotorClient = AsyncIOMotorClient(os.getenv("MONGO_CONNECTION_STR")) +# database = client["custom_gpt"] +database = client["ai_store"] + + +class MongoCRUD: + def __init__( + self, + collection: AsyncIOMotorCollection, + model: Type[BaseModel], + ttl_seconds: Optional[int] = None, + ): + self.collection = collection + self.model = model + self.ttl_seconds = ttl_seconds + self._index_created = False + + async def _ensure_ttl_index(self): + """Ensure TTL index exists""" + if self.ttl_seconds is not None and not self._index_created: + await self.collection.create_index("expire_at", expireAfterSeconds=0) + self._index_created = True + + def _order_fields(self, doc: Dict) -> Dict: + """Order fields in the document to ensure created_at and updated_at are at the end.""" + ordered_doc = { + k: doc[k] for k in doc if k not in ["created_at", "updated_at", "expire_at"] + } + if "id" in doc: + ordered_doc["_id"] = ObjectId(doc["id"]) + if "created_at" in doc: + ordered_doc["created_at"] = doc["created_at"] + if "updated_at" in doc: + ordered_doc["updated_at"] = doc["updated_at"] + if "expire_at" in doc: + ordered_doc["expire_at"] = doc["expire_at"] + return ordered_doc + + async def create(self, data: Dict) -> str: + """Create a new document in the collection asynchronously, optionally using a user-specified ID.""" + await self._ensure_ttl_index() + now = get_date_time().replace(tzinfo=None) + data["created_at"] = now + data["updated_at"] = now + if self.ttl_seconds is not None: + data["expire_at"] = now + timedelta(seconds=self.ttl_seconds) + document = self.model(**data).model_dump(exclude_unset=True) + ordered_document = self._order_fields(document) + result = await self.collection.insert_one(ordered_document) + return str(result.inserted_id) + + async def read(self, query: Dict, sort: List[tuple] = None) -> List[Dict]: + """Read documents from the collection based on 
a query asynchronously.""" + cursor = self.collection.find(query) + + # Apply sorting if provided + if sort: + cursor = cursor.sort(sort) + + docs = [] + async for doc in cursor: + docs.append( + { + "_id": str(doc["_id"]), + **self._order_fields(self.model(**doc).model_dump(exclude={"id"})), + } + ) + return docs + + async def read_one(self, query: Dict) -> Optional[Dict]: + """Read a single document from the collection based on a query asynchronously.""" + doc = await self.collection.find_one(query) + if doc: + doc["_id"] = str(doc["_id"]) + return { + "_id": doc["_id"], + **self._order_fields(self.model(**doc).model_dump(exclude={"id"})), + } + return None + + async def update(self, query: Dict, data: Dict, upsert: bool = False) -> int: + await self._ensure_ttl_index() + + if any(key.startswith("$") for key in data.keys()): + update_data = data + else: + data["updated_at"] = get_date_time().replace(tzinfo=None) + if self.ttl_seconds is not None: + data["expire_at"] = data["updated_at"] + timedelta( + seconds=self.ttl_seconds + ) + update_data = { + "$set": self._order_fields( + self.model(**data).model_dump(exclude_unset=True) + ) + } + + result = await self.collection.update_many(query, update_data, upsert=upsert) + return result.modified_count + + async def delete(self, query: Dict) -> int: + """Delete documents from the collection based on a query asynchronously.""" + result = await self.collection.delete_many(query) + return result.deleted_count + + async def delete_one(self, query: Dict) -> int: + """Delete a single document from the collection based on a query asynchronously.""" + result = await self.collection.delete_one(query) + return result.deleted_count + + async def find_by_id(self, id: str) -> Optional[Dict]: + """Find a document by its ID asynchronously.""" + return await self.read_one({"_id": ObjectId(id)}) + + async def find_all(self) -> List[Dict]: + """Find all documents in the collection asynchronously.""" + return await self.read({}) + + async def find_many( + self, filter: Dict, skip: int = 0, limit: int = 0, sort: List[tuple] = None + ) -> List[Dict]: + """ + Find documents based on filter with pagination support. 
+ + Args: + filter: MongoDB query filter + skip: Number of documents to skip + limit: Maximum number of documents to return (0 means no limit) + sort: Optional sorting parameters [(field_name, direction)] + where direction is 1 for ascending, -1 for descending + + Returns: + List of documents matching the filter + """ + cursor = self.collection.find(filter) + + # Apply pagination + if skip > 0: + cursor = cursor.skip(skip) + if limit > 0: + cursor = cursor.limit(limit) + + # Apply sorting if provided + if sort: + cursor = cursor.sort(sort) + + docs = [] + async for doc in cursor: + # Convert _id to string and prepare document + doc_id = str(doc["_id"]) + doc_copy = {**doc} + doc_copy["_id"] = doc_id + + # Process through model validation + try: + validated_doc = self.model(**doc_copy).model_dump(exclude={"id"}) + docs.append({"_id": doc_id, **self._order_fields(validated_doc)}) + except Exception as e: + logger.error(f"Error validating document {doc_id}: {str(e)}") + # Include document even if validation fails, but with original data + docs.append( + {"_id": doc_id, **{k: v for k, v in doc.items() if k != "_id"}} + ) + + return docs + + +from src.apis.models.bot_models import Bot +from src.apis.models.user_models import User +from src.apis.models.grade_models import GradedAssignment +from src.apis.models.service_provide import ServiceProvider +from src.apis.models.category_models import Category +from src.apis.models.order_models import Order + +bot_crud = MongoCRUD(database["bot"], Bot) +UserCRUD = MongoCRUD(database["user"], User) +GradedAssignmentCRUD = MongoCRUD(database["graded_assignments"], GradedAssignment) +ServiceCRUD = MongoCRUD(database["services"], ServiceProvider) +CategoryCRUD = MongoCRUD(database["categories"], Category) +OrderCRUD = MongoCRUD(database["orders"], Order) diff --git a/src/utils/__pycache__/helper.cpython-311.pyc b/src/utils/__pycache__/helper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4891c19e2e923f259bf53057a396af05bf89fb1 Binary files /dev/null and b/src/utils/__pycache__/helper.cpython-311.pyc differ diff --git a/src/utils/__pycache__/logger.cpython-311.pyc b/src/utils/__pycache__/logger.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be8b17aaba0aca77b132ab00da474a1523445db0 Binary files /dev/null and b/src/utils/__pycache__/logger.cpython-311.pyc differ diff --git a/src/utils/helper.py b/src/utils/helper.py new file mode 100644 index 0000000000000000000000000000000000000000..76a86bbc4d909d4ad4d18dd728bb2959548088be --- /dev/null +++ b/src/utils/helper.py @@ -0,0 +1,129 @@ +from langchain_core.documents import Document +from typing import Union, Dict, Any +from langchain_core.messages import BaseMessage, trim_messages +from langchain_core.runnables import RunnableLambda +from langgraph.prebuilt import ToolNode +from langchain_core.messages import ToolMessage +import base64 +from fastapi import UploadFile +from typing import TypeVar +from youtube_transcript_api import YouTubeTranscriptApi +from youtube_comment_downloader import YoutubeCommentDownloader +from src.utils.logger import logger + +State = TypeVar("State", bound=Dict[str, Any]) + + +def fake_token_counter(messages: Union[list[BaseMessage], BaseMessage]) -> int: + if isinstance(messages, list): + return sum(len(str(message.content).split()) for message in messages) + return len(str(messages.content).split()) + + +def convert_list_context_source_to_str(contexts: list[Document]): + formatted_str = "" + for i, context in 
enumerate(contexts): + formatted_str += f"Document index {i}:\nContent: {context.page_content}\n" + formatted_str += "----------------------------------------------\n\n" + return formatted_str + + +def trim_messages_function(messages: list[BaseMessage], max_tokens: int = 100000): + if len(messages) <= 1: + return messages + messages = trim_messages( + messages, + strategy="last", + token_counter=fake_token_counter, + max_tokens=max_tokens, + start_on="human", + # end_on="ai", + include_system=False, + allow_partial=False, + ) + return messages + + +def create_tool_node_with_fallback(tools: list) -> dict: + return ToolNode(tools).with_fallbacks( + [RunnableLambda(handle_tool_error)], exception_key="error" + ) + + +def handle_tool_error(state: State) -> dict: + error = state.get("error") + tool_messages = state["messages"][-1] + return { + "messages": [ + ToolMessage( + content=f"Error: {repr(error)}\n please fix your mistakes.", + tool_call_id=tc["id"], + ) + for tc in tool_messages.tool_calls + ] + } + + +async def preprocess_messages(query: str, attachs: list[UploadFile]): + messages: dict = { + "role": "user", + "content": [], + } + if query: + messages["content"].append( + { + "type": "text", + "text": query, + } + ) + if attachs: + for attach in attachs: + if ( + attach.content_type == "image/jpeg" + or attach.content_type == "image/png" + ): + content = await attach.read() + encoded_string = base64.b64encode(content).decode("utf-8") + messages["content"].append( + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{encoded_string}", + }, + } + ) + if attach.content_type == "application/pdf": + content = await attach.read() + encoded_string = base64.b64encode(content).decode("utf-8") + messages["content"].append( + { + "type": "file", + "source_type": "base64", + "mime_type": "application/pdf", + "data": f"{encoded_string}", + "citations": {"enabled": True}, + } + ) + return messages + + +def extract_transcript(video_link: str): + ytt_api = YouTubeTranscriptApi() + # extract the video id from the link, dropping any extra query parameters + video_id = video_link.split("v=")[1].split("&")[0] + transcript = ytt_api.fetch(video_id) + transcript_str = "" + for trans in transcript: + transcript_str += trans.text + " " + logger.info(f"Transcript: {transcript_str}") + return transcript_str + + +def extract_comment(video_link: str): + ytd_api = YoutubeCommentDownloader() + comments = ytd_api.get_comments_from_url(video_link) + comments_str = "" + for comment in comments: + comments_str += comment["text"] + " " + logger.info(f"Comments: {comments_str}") + return comments_str diff --git a/src/utils/logger.py b/src/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..287621a82e032760b7722c1d1c7b04a031100808 --- /dev/null +++ b/src/utils/logger.py @@ -0,0 +1,66 @@ +from datetime import datetime +import pytz +import logging +import os +from pathlib import Path + + + +def get_date_time(): + return datetime.now(pytz.timezone("Asia/Ho_Chi_Minh")) + + + +class CoreCFG: + PROJECT_NAME = "SCHEDULE AI" + BOT_NAME = str("SCHEDULE AI") + + +DATE_TIME = get_date_time().date() +BASE_DIR = os.path.dirname(Path(__file__).parent.parent) +LOG_DIR = os.path.join(BASE_DIR, "logs") + + +class CustomFormatter(logging.Formatter): + green = "\x1b[0;32m" + grey = "\x1b[38;5;248m" + yellow = "\x1b[38;5;229m" + red = "\x1b[31;20m" + bold_red = "\x1b[31;1m" + blue = "\x1b[38;5;31m" + white =
"\x1b[38;5;255m" + reset = "\x1b[38;5;15m" + + base_format = f"{grey}%(asctime)s | %(name)s | %(threadName)s | {{level_color}}%(levelname)-8s{grey} | {blue}%(module)s:%(lineno)d{grey} - {white}%(message)s" + + FORMATS = { + logging.INFO: base_format.format(level_color=green), + logging.WARNING: base_format.format(level_color=yellow), + logging.ERROR: base_format.format(level_color=red), + logging.CRITICAL: base_format.format(level_color=bold_red), + } + + def format(self, record): + log_fmt = self.FORMATS.get(record.levelno) + formatter = logging.Formatter(log_fmt) + return formatter.format(record) + + +def custom_logger(app_name="APP"): + logger_r = logging.getLogger(name=app_name) + # Set the timezone to Ho_Chi_Minh + tz = pytz.timezone("Asia/Ho_Chi_Minh") + + logging.Formatter.converter = lambda *args: datetime.now(tz).timetuple() + + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + ch.setFormatter(CustomFormatter()) + + logger_r.setLevel(logging.INFO) + logger_r.addHandler(ch) + + return logger_r + + +logger = custom_logger(app_name=CoreCFG.PROJECT_NAME) \ No newline at end of file