|
from __future__ import annotations |
|
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Type |
|
import logging |
|
import json |
|
import os |
|
from datetime import datetime |
|
import hashlib |
|
import csv |
|
import requests |
|
import re |
|
import html |
|
import markdown2 |
|
import torch |
|
import sys |
|
import gc |
|
from pygments.lexers import guess_lexer, ClassNotFound |
|
import time |
|
import json |
|
import operator |
|
from typing import Annotated, Sequence, TypedDict |
|
import pprint |
|
|
|
import gradio as gr |
|
from pypinyin import lazy_pinyin |
|
import tiktoken |
|
import mdtex2html |
|
from markdown import markdown |
|
from pygments import highlight |
|
from pygments.lexers import guess_lexer,get_lexer_by_name |
|
from pygments.formatters import HtmlFormatter |
|
|
|
from langchain.chains import LLMChain, RetrievalQA |
|
from langgraph.graph import END, StateGraph |
|
from langchain_openai import ChatOpenAI |
|
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader, UnstructuredWordDocumentLoader, DirectoryLoader |
|
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader |
|
from langchain.document_loaders.generic import GenericLoader |
|
from langchain.document_loaders.parsers import OpenAIWhisperParser |
|
from langchain.schema import AIMessage, HumanMessage |
|
from langchain_community.llms import HuggingFaceHub |
|
from langchain_community.llms import HuggingFaceTextGenInference |
|
from langchain_community.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings, HuggingFaceInferenceAPIEmbeddings |
|
from langchain_community.tools import DuckDuckGoSearchRun |
|
from langchain.retrievers.tavily_search_api import TavilySearchAPIRetriever |
|
from typing import Dict, TypedDict |
|
from langchain_core.messages import BaseMessage |
|
from langchain_openai import OpenAIEmbeddings |
|
from langchain.prompts import PromptTemplate |
|
|
|
|
|
from langchain import hub |
|
from langchain.output_parsers.openai_tools import PydanticToolsParser |
|
from langchain.prompts import PromptTemplate |
|
from langchain.schema import Document |
|
from langchain_community.tools.tavily_search import TavilySearchResults |
|
from langchain_community.vectorstores import Chroma |
|
from langchain_core.messages import BaseMessage, FunctionMessage |
|
from langchain_core.output_parsers import StrOutputParser |
|
from langchain_core.pydantic_v1 import BaseModel, Field |
|
from langchain_core.runnables import RunnablePassthrough |
|
from langchain_core.utils.function_calling import convert_to_openai_tool |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_community.vectorstores import Chroma |
|
from chromadb.errors import InvalidDimensionException |
|
import io |
|
from PIL import Image, ImageDraw, ImageOps, ImageFont |
|
import base64 |
|
from tempfile import NamedTemporaryFile |
|
|
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.tokenize import word_tokenize |
|
from nltk.stem import WordNetLemmatizer |
|
nltk.download('punkt') |
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import numpy as np |
|
|
|
from reportlab.lib.pagesizes import inch, A4 |
|
from reportlab.platypus import SimpleDocTemplate, Frame, Spacer |
|
from reportlab.lib import colors |
|
from reportlab.lib.units import mm |
|
from reportlab.platypus import Paragraph, SimpleDocTemplate, Frame, Image, Table, ListFlowable, ListItem |
|
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle |
|
from reportlab.lib.units import cm |
|
|
|
|
|
# Configure the root logger at import time: INFO level, and every record
# carries its timestamp, level and originating file/line.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
    level=logging.INFO,
)
|
|
|
|
|
|
|
|
|
|
|
# Reference phrases (German) that express "I don't know"; used as the
# comparison corpus by is_response_similar().
ANTWORT_WEISS_NICHT = [
    "ich weiß nicht.",
    "ich weiß das nicht",
    "Ich habe dazu keine Antwort",
    "Ich bin nicht sicher",
    "Ich kann das nicht beantworten",
    "Es tut mir leid, aber ich kenne keinen",
    "Es tut mir leid, aber ich kann die Frage nicht beantworten.",
    "Es tut mir leid, aber ich kann die Frage nicht beantworten, da ich zu der Frage keine spezifischen Informatioen habe",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
# Shared system instruction (German) that is embedded into every prompt below.
# The former leading backslash ("\Antworte") was an invalid escape sequence
# that leaked a literal "\" into the prompt; it has been removed.
template = """Antworte in deutsch, wenn es nicht explizit anders gefordert wird. Wenn du die Antwort nicht kennst, antworte direkt, dass du es nicht weißt.
Versuche nicht es zu umschreiben. Versuche nicht, die Antwort zu erfinden oder aufzumocken. Halte die Antwort kurz aber ausführlich genug und exakt."""

# Plain question answering. A separating space is inserted after `template`
# so the last sentence and "Frage:" do not run together.
llm_template = "Beantworte die Frage am Ende. " + template + " Frage: {question} "

# Headline generation: summarise a text in at most three words.
llm_template2 = "Fasse folgenden Text als Überschrift mit maximal 3 Worten zusammen. Text: {question} "

# Retrieval-augmented answering; expects {context} and {question}.
# Separating spaces are inserted on both sides of `template`.
rag_template = (
    "Nutze die folgenden Kontexte (Beginnend mit dem Wort 'Kontext:') aus Teilen aus den angehängten Dokumenten, "
    "um die Frage (Beginnend mit dem Wort 'Frage: ') am Ende zu beantworten. "
    "Wenn du die Frage aus dem folgenden Kontext nicht beantworten kannst, "
    "dann versuche eine Beantwortung aus deinen eigenen trainierten Daten zu finden. "
    "Mache das kenntlich, ob du dich auf den hier angehängten Kontext beziehst oder ob du anhand deiner Daten antwortest. "
    "Wenn du dich auf den angegebenen Kontext beziehst, gib unbedingt den Namen des Dokumentes an, auf den du dich beziehst."
    + " "
    + template
    + " Kontext: {context} Frage: {question}"
)
|
|
|
|
|
|
|
# PromptTemplate wrappers around the raw template strings.
# Plain LLM question answering; fills {question}.
LLM_CHAIN_PROMPT = PromptTemplate(
    template=llm_template,
    input_variables=["question"],
)

# Headline generation; fills {question} with the text to summarise.
LLM_CHAIN_PROMPT2 = PromptTemplate(
    template=llm_template2,
    input_variables=["question"],
)

# Retrieval-augmented answering; fills {context} and {question}.
RAG_CHAIN_PROMPT = PromptTemplate(
    template=rag_template,
    input_variables=["context", "question"],
)
|
|
|
|
|
|
|
|
|
# Filesystem locations used by this module.
PATH_WORK = "."                      # prefix that is concatenated with CHROMA_DIR
CHROMA_DIR = "/chroma/kkg"           # persist directory of the Chroma store (relative to PATH_WORK)
CHROMA_PDF = "./chroma/kkg/pdf"      # directory scanned for PDF documents
CHROMA_WORD = "./chroma/kkg/word"    # directory scanned for Word documents
CHROMA_EXCEL = "./chroma/kkg/excel"
YOUTUBE_DIR = "/youtube"
HISTORY_PFAD = "/data/history"
|
|
|
|
|
|
|
# Sample remote sources (PDF, web page, YouTube videos).
PDF_URL = "https://arxiv.org/pdf/2303.08774.pdf"
WEB_URL = "https://openai.com/research/gpt-4"
YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
|
|
|
|
|
# Web pages that document_loading_splitting() fetches and indexes.
# A comma was missing after the first entry, which made Python concatenate
# the first two literals into one invalid URL.
urls = [
    "https://kkg.hamburg.de/unser-leitbild/",
    "https://kkg.hamburg.de/unsere-schulcharta/",
    "https://kkg.hamburg.de/koordination-unterrichtsentwicklung/",
    "https://kkg.hamburg.de/konzept-medien-und-it-am-kkg/",
]
|
|
|
|
|
|
|
|
|
|
|
def is_response_similar(response, threshold=0.7):
    """Tell whether a short answer resembles one of the "I don't know" phrases.

    Args:
        response: The answer text to check.
        threshold: Cosine-similarity value that must be exceeded.

    Returns:
        True if ``response`` is shorter than 160 characters and its TF-IDF
        cosine similarity to at least one entry of ANTWORT_WEISS_NICHT is
        greater than ``threshold``; otherwise False.
    """
    # Answers of 160 characters or more are never classified as a refusal.
    if len(response) >= 160:
        return False

    # Fit TF-IDF on the reference phrases plus the response (last row).
    corpus = ANTWORT_WEISS_NICHT + [response]
    tfidf = TfidfVectorizer().fit_transform(corpus)

    # Compare the response row against every reference row.
    scores = cosine_similarity(tfidf[-1], tfidf[:-1])
    return bool(np.max(scores) > threshold)
|
|
|
|
|
|
|
|
|
|
|
def normalise_prompt(prompt):
    """Lower-case and tokenize a prompt, keeping only alphanumeric tokens.

    Args:
        prompt: The raw prompt text.

    Returns:
        The surviving tokens joined by single spaces. The result is also
        printed to stdout.
    """
    words = word_tokenize(prompt.lower())

    cleaned = []
    for token in words:
        # Drop punctuation-only / mixed tokens, then strip non-word characters.
        if token.isalnum():
            cleaned.append(re.sub(r'\W+', '', token))

    normalized_prompt = ' '.join(cleaned)
    print("normaiserd prompt..................................")
    print(normalized_prompt)
    return normalized_prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_directory_loader(file_type, directory_path):
    """Build a DirectoryLoader for one file type below ``directory_path``.

    Args:
        file_type: Either '.pdf' or '.word'; it selects the loader class and
            is appended to the recursive glob pattern.
        directory_path: Directory to scan.

    Returns:
        A DirectoryLoader configured with the matching loader class.

    Raises:
        KeyError: If ``file_type`` is not one of the supported keys.
    """
    # NOTE(review): the glob for '.word' is "**/*.word"; confirm that the Word
    # files really carry that suffix (rather than .doc/.docx).
    loader_by_suffix = {
        '.pdf': PyPDFLoader,
        '.word': UnstructuredWordDocumentLoader,
    }
    pattern = f"**/*{file_type}"
    loader_cls = loader_by_suffix[file_type]
    return DirectoryLoader(path=directory_path, glob=pattern, loader_cls=loader_cls)
|
|
|
|
|
def document_loading_splitting():
    """Load all configured documents and split them into chunks.

    Loads the PDF directory, the Word directory and every page listed in
    ``urls``; the combined list is ordered web pages, then PDFs, then Word
    documents.

    Returns:
        The chunks produced by a tiktoken-based RecursiveCharacterTextSplitter
        (chunk_size=1500, chunk_overlap=250).
    """
    # Create both directory loaders first, then read from disk.
    pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
    word_loader = create_directory_loader('.word', CHROMA_WORD)
    pdf_documents = pdf_loader.load()
    word_documents = word_loader.load()

    # Fetch the web pages and flatten the per-URL results into one list.
    collected = []
    for url in urls:
        collected.extend(WebBaseLoader(url).load())

    collected.extend(pdf_documents)
    collected.extend(word_documents)

    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1500, chunk_overlap=250
    )
    return splitter.split_documents(collected)
|
|
|
|
|
|
|
def document_storage_chroma(splits):
    """Embed the given chunks into a Chroma store and expose a retriever.

    Args:
        splits: Document chunks to embed.

    Returns:
        A ``(vectorstore, retriever)`` tuple; the retriever is created with
        ``search_kwargs={"k": 5}``.
    """
    embedding_fn = OpenAIEmbeddings(disallowed_special = ())
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embedding_fn,
        persist_directory=PATH_WORK + CHROMA_DIR,
    )
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    return vectorstore, retriever
|
|
|
|
|
def grade_documents_direct(prompt, documents):
    """Keep only the documents that an LLM grades as relevant to ``prompt``.

    Each document's ``page_content`` is sent, together with the question, to
    a tool-calling chat model that must answer through the ``grade`` tool.

    Args:
        prompt: The user question.
        documents: Retrieved documents exposing ``page_content``.

    Returns:
        The documents whose ``binary_score`` equals "ja", in input order.
    """
    print("---CHECK RELEVANCE---")

    # Tool schema for the grader's answer. No docstring on purpose: the class
    # definition is what gets converted into the tool description.
    class grade(BaseModel):
        binary_score: str = Field(description="Relevanz Bewertung 'ja' oder 'nein'")

    grader_llm = ChatOpenAI(temperature=0.3, model="gpt-3.5-turbo-1106", streaming=True)

    # Force the model to answer by calling the "grade" tool.
    grade_tool_oai = convert_to_openai_tool(grade)
    llm_with_tool = grader_llm.bind(
        tools=[convert_to_openai_tool(grade_tool_oai)],
        tool_choice={"type": "function", "function": {"name": "grade"}},
    )

    # Turns the tool call back into ``grade`` instances.
    parser_tool = PydanticToolsParser(tools=[grade])

    prompt_gesamt = PromptTemplate(
        input_variables=["context", "question"],
        template="""Du bist ein Bewerter, der die Relevanz von einem erhaltenen Dokument zu einer Nutzeranfrage bewerten soll. \n
Hier ist das erhaltene Dokument (Kontext): \n\n {context} \n\n
Hier ist die Nutzeranfrage (Frage): {question} \n
Wenn das erhaltene Dokument Keywörter oder semantische Bedeutung in Bezug auf die Nutzeranfrage hat, bewerte es als relevant. \n
Gib eine binäre Bewertung von 'ja' oder 'nein' Bewertung, um anzuzeigen ob das Dokuemnt relevant ist zur Nutzeranfrage oder nicht.""",
    )

    chain = prompt_gesamt | llm_with_tool | parser_tool

    filtered_docs = []
    for candidate in documents:
        score = chain.invoke({"question": prompt, "context": candidate.page_content})
        verdict = score[0].binary_score
        if verdict == "ja":
            print("---Bewertung: Dokument ist relevant---")
            filtered_docs.append(candidate)
        else:
            print("---Bewertung: Dokument irrelevant---")

    return filtered_docs
|
|
|
|
|
def transform_query_direct(question):
    """Ask an LLM to rephrase ``question`` for document retrieval.

    Args:
        question: The original user question.

    Returns:
        The model's rewritten question as a string.
    """
    print("---TRANSFORM QUERY---")

    rewrite_prompt = PromptTemplate(
        input_variables=["question"],
        template="""Du generierst Fragen, die optimiert sind für das Retrieval von Dokumenten. \n
Schaue auf den input und versuche die zugrundeliegende Absicht / Bedeutung zu bewerten. \n
Hier ist die ursprüngliche Frage:
\n ------- \n
{question}
\n ------- \n
Formuliere eine verbesserte Frage: """,
    )

    rewriter = ChatOpenAI(temperature=0, model="gpt-4-0125-preview", streaming=True)

    # prompt -> model -> plain string
    pipeline = rewrite_prompt | rewriter | StrOutputParser()
    return pipeline.invoke({"question": question})
|
|
|
|
|
|
|
|
|
|
|
|
|
def llm_chain(llm, prompt):
    """Answer ``prompt`` with ``llm`` using LLM_CHAIN_PROMPT.

    Args:
        llm: The language model to run.
        prompt: The question inserted into the template.

    Returns:
        The result of running the chain.
    """
    chain = LLMChain(llm=llm, prompt=LLM_CHAIN_PROMPT)
    return chain.run({"question": prompt})
|
|
|
|
|
def llm_chain2(llm, prompt):
    """Run ``llm`` on ``prompt`` using LLM_CHAIN_PROMPT2.

    Args:
        llm: The language model to run.
        prompt: The text inserted into the template as ``question``.

    Returns:
        The result of running the chain.
    """
    chain = LLMChain(llm=llm, prompt=LLM_CHAIN_PROMPT2)
    return chain.run({"question": prompt})
|
|
|
|
|
def rag_chain(llm, prompt, retriever):
    """Answer ``prompt`` with retrieval, grading and one query rewrite.

    Steps:
      1. Retrieve documents for ``prompt`` and grade them for relevance.
      2. If fewer than two documents pass, rewrite the question, retrieve
         again and grade the new hits.
      3. Answer with RAG_CHAIN_PROMPT when graded documents exist,
         otherwise fall back to LLM_CHAIN_PROMPT.

    Args:
        llm: The language model that produces the final answer.
        prompt: The user question.
        retriever: Object exposing ``get_relevant_documents``.

    Returns:
        The result of running the selected chain.
    """
    graded = []
    candidates = retriever.get_relevant_documents(prompt)
    print("releant docs1......................")
    print(candidates)
    if len(candidates) > 0:
        graded = grade_documents_direct(prompt, candidates)

    neu_prompt = prompt
    if len(graded) < 2:
        # Too little relevant material: retry with a rewritten question.
        neu_prompt = transform_query_direct(prompt)
        candidates = retriever.get_relevant_documents(neu_prompt)
        if len(candidates) > 0:
            print("releant docs2......................")
            print(candidates)
            graded = grade_documents_direct(neu_prompt, candidates)

    if len(graded) > 0:
        chain_prompt = RAG_CHAIN_PROMPT
        inputs = {"context": graded, "question": neu_prompt}
    else:
        # No relevant documents: plain LLM answer.
        chain_prompt = LLM_CHAIN_PROMPT
        inputs = {"question": neu_prompt}

    return LLMChain(llm=llm, prompt=chain_prompt).run(inputs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def rag_chain2(prompt, db, k=3):
    """Build a prompt string that embeds the top ``k`` similar chunks.

    Args:
        prompt: The user question.
        db: Store exposing ``similarity_search(query, k)``.
        k: Number of chunks to retrieve.

    Returns:
        The instruction text followed by the retrieved chunks, numbered from
        1 and each terminated by a newline.
    """
    header = (
        "Nutze die folgenden Kontext Teile am Ende, um die Frage zu beantworten . "
        + template
        + "Frage: "
        + prompt
        + "Kontext Teile: "
    )
    retrieved_chunks = db.similarity_search(prompt, k)
    numbered = [f"{pos}. {chunk}\n" for pos, chunk in enumerate(retrieved_chunks, start=1)]
    return header + "".join(numbered)
|
|
|
|
|
|
|
|
|
|
|
def generate_prompt_with_history(text, history, max_length=4048):
    """Concatenate the chat history and the new text into one prompt string.

    Args:
        text: The new user input.
        history: Sequence of turns; elements 0 and 1 of each turn are used.
        max_length: Accepted but not used by this function.

    Returns:
        Every turn rendered as "\\n<first>\\n<second>", followed by
        "\\n<text>\\n". The result is also printed to stdout.
    """
    segments = [f"\n{turn[0]}\n{turn[1]}" for turn in history]
    segments.append(f"\n{text}\n")

    # At least the new text is always present, so the result is never empty.
    history_text = "".join(segments)

    print("hist+prompt: ")
    print(history_text)
    return history_text
|
|
|
|
|
|
|
|
|
def generate_prompt_with_history_openai(prompt, history):
    """Convert history plus the new prompt into OpenAI chat messages.

    Args:
        prompt: The new user input.
        history: Iterable of ``(human, assistant)`` pairs.

    Returns:
        A list of ``{"role", "content"}`` dicts: one user/assistant pair per
        history entry, followed by the new user message. The list is also
        printed to stdout.
    """
    messages = []
    for human, assistant in history:
        messages += [
            {"role": "user", "content": human},
            {"role": "assistant", "content": assistant},
        ]
    messages.append({"role": "user", "content": prompt})

    print("openai history und prompt................")
    print(messages)
    return messages
|
|
|
|
|
|
|
def generate_prompt_with_history_hf(prompt, history):
    """Render history plus the new prompt in "<human>/<bot>" transcript form.

    The previous implementation built the transcript but never returned it,
    so callers always received None.

    Args:
        prompt: The new user input.
        history: Sequence of turns; elements 0 and 1 of each turn are the
            human and bot text.

    Returns:
        One string with "\\n<human>:<text>\\n<bot>:<text>" per turn; the final
        turn carries ``prompt`` and an empty bot part.
    """
    history_transformer_format = list(history) + [[prompt, ""]]
    return "".join(
        f"\n<human>:{item[0]}\n<bot>:{item[1]}"
        for item in history_transformer_format
    )
|
|
|
|
|
|
|
def generate_prompt_with_history_langchain(prompt, history):
    """Convert history plus the new prompt into LangChain message objects.

    Args:
        prompt: The new user input.
        history: Iterable of ``(human, ai)`` pairs.

    Returns:
        A list alternating HumanMessage / AIMessage per history entry,
        followed by a HumanMessage holding ``prompt``.
    """
    messages = []
    for human, ai in history:
        messages.extend((HumanMessage(content=human), AIMessage(content=ai)))
    messages.append(HumanMessage(content=prompt))
    return messages
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_image(image_path, prompt, model_image, oai_key):
    """Build headers and payload for a chat request that includes an image.

    The file at ``image_path`` is read, base64-encoded and embedded as a
    ``data:image/jpeg`` URL next to the text ``llm_template + prompt``.

    Args:
        image_path: Path of the image file to read.
        prompt: User text appended to ``llm_template``.
        model_image: Model name placed in the payload.
        oai_key: API key used for the Bearer authorization header.

    Returns:
        A ``(headers, payload)`` tuple of dicts; ``max_tokens`` is 300.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_string = base64.b64encode(raw_bytes).decode('utf-8')

    text_part = {
        "type": "text",
        "text": llm_template + prompt,
    }
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{encoded_string}",
        },
    }

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {oai_key}",
    }
    payload = {
        "model": model_image,
        "messages": [
            {"role": "user", "content": [text_part, image_part]},
        ],
        "max_tokens": 300,
    }
    return headers, payload
|
|
|
|
|
def process_chatverlauf(prompt, model, oai_key):
    """Build headers and payload for a request asking for a two-word headline.

    Args:
        prompt: Text to be titled; only its first 50 characters are sent.
        model: Model name placed in the payload.
        oai_key: API key used for the Bearer authorization header.

    Returns:
        A ``(headers, payload)`` tuple of dicts; ``max_tokens`` is 100.
    """
    # Slicing is a no-op for inputs of 50 characters or fewer.
    shortened = prompt[:50]

    text_part = {
        "type": "text",
        "text": 'Gib folgendem Text eine Überschrift mit maximal 2 Worten' + shortened,
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {oai_key}",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": [text_part]},
        ],
        "max_tokens": 100,
    }
    return headers, payload
|
|
|
def process_chatverlauf_hf(history, llm):
    """Ask ``llm`` for a headline (at most three words) for the chat history.

    Args:
        history: Chat turns passed on to generate_prompt_with_history.
        llm: The language model passed on to llm_chain2.

    Returns:
        The result of llm_chain2.
    """
    headline_request = generate_prompt_with_history(
        "Gib folgendem Text eine Überschrift mit maximal 3 Worten", history
    )
    return llm_chain2(llm, headline_request)
|
|
|
|
|
|
|
def save_and_download(chat_history):
    """Write the chat history to a new ``.txt`` file below ``./temp``.

    The file is not deleted automatically (``delete=False``); the caller is
    responsible for removing it.

    Args:
        chat_history: Text to write.

    Returns:
        The path of the created file.
    """
    # NamedTemporaryFile fails if the target directory is missing.
    os.makedirs("./temp", exist_ok=True)

    # Explicit UTF-8 so umlauts are written correctly regardless of the
    # platform's default encoding.
    with NamedTemporaryFile(
        delete=False, mode="w", suffix=".txt", dir="./temp", encoding="utf-8"
    ) as tmp:
        tmp.write(chat_history)
        temp_file_path = tmp.name
    return temp_file_path
|
|
|
def cleanup(file_path):
    """Remove ``file_path`` if it exists; do nothing otherwise.

    Args:
        file_path: Path of the file to delete.
    """
    if not os.path.exists(file_path):
        return
    os.remove(file_path)
|
|
|
|
|
|
|
|
|
|
|
def markdown_to_html_with_syntax_highlight(md_str):
    """Render Markdown to HTML, highlighting fenced code blocks with Pygments.

    Args:
        md_str: Markdown text that may contain ``` fenced code blocks.

    Returns:
        The HTML produced by ``markdown`` after every fenced block has been
        replaced by highlighted ``<pre><code>`` markup.
    """
    fenced_block = r"```(\w+)?\n([\s\S]+?)\n```"

    def replacer(match):
        """Return highlighted HTML for one fenced code block."""
        code = match.group(2)
        lang = (match.group(1) or "text").strip()

        # No language tag (or an explicit "text"): let Pygments guess.
        if lang == "text":
            lang = guess_lexer(code).name

        try:
            lexer = get_lexer_by_name(lang, stripall=True)
        except ValueError:
            # Unknown lexer name: fall back to Python highlighting.
            lexer = get_lexer_by_name("python", stripall=True)

        highlighted_code = highlight(code, lexer, HtmlFormatter())
        return f'<pre><code class="{lang}">{highlighted_code}</code></pre>'

    with_code = re.sub(fenced_block, replacer, md_str, flags=re.MULTILINE)
    return markdown(with_code)
|
|
|
|
|
def normalize_markdown(md_text: str) -> str:
    """Normalise blank lines around Markdown list items.

    A blank line is inserted before a list that directly follows a non-blank
    line. Blank lines inside a list are dropped unless the next line is not a
    list item.

    Args:
        md_text: The Markdown text.

    Returns:
        The normalised text, joined with "\\n".
    """
    list_item = re.compile(r"^(\d+\.|-|\*|\+)\s")
    source = md_text.split("\n")
    last_index = len(source) - 1

    out = []
    in_list = False
    for idx, raw in enumerate(source):
        stripped = raw.strip()

        if list_item.match(stripped):
            # First item of a list that follows text: separate with a blank line.
            if not in_list and idx > 0 and source[idx - 1].strip() != "":
                out.append("")
            in_list = True
            out.append(raw)
            continue

        if in_list and stripped == "":
            # Keep the blank line only if the list does not continue after it.
            if idx < last_index and not list_item.match(source[idx + 1].strip()):
                out.append(raw)
            continue

        in_list = False
        out.append(raw)

    return "\n".join(out)
|
|
|
|
|
def convert_mdtext(md_text):
    """Convert Markdown text (with fenced code) into HTML.

    Text outside code fences is normalised and rendered with ``markdown``
    when it contains inline code, otherwise with ``mdtex2html``. Fenced code
    goes through markdown_to_html_with_syntax_highlight.

    Args:
        md_text: The Markdown text.

    Returns:
        The concatenated HTML followed by ALREADY_CONVERTED_MARK.
    """
    # NOTE(review): ALREADY_CONVERTED_MARK is not defined in the visible part
    # of this file; confirm it is provided elsewhere.
    fence = re.compile(r"```(.*?)(?:```|$)", re.DOTALL)
    inline_code = re.compile(r"`(.*?)`", re.DOTALL)

    code_blocks = fence.findall(md_text)
    # split() interleaves text and captured code; every second item is text.
    non_code_parts = fence.split(md_text)[::2]

    pieces = []
    for prose, code in zip(non_code_parts, code_blocks + [""]):
        if prose.strip():
            prose = normalize_markdown(prose)
            renderer = markdown if inline_code.search(prose) else mdtex2html.convert
            pieces.append(renderer(prose, extensions=["tables"]))
        if code.strip():
            fenced = f"\n```{code}\n\n```"
            pieces.append(markdown_to_html_with_syntax_highlight(fenced))

    return "".join(pieces) + ALREADY_CONVERTED_MARK
|
|
|
def convert_asis(userinput):
    """Wrap HTML-escaped user input in a whitespace-preserving paragraph.

    Args:
        userinput: Raw text entered by the user.

    Returns:
        The ``<p>`` markup followed by ALREADY_CONVERTED_MARK.
    """
    escaped = html.escape(userinput)
    return f'<p style="white-space:pre-wrap;">{escaped}</p>' + ALREADY_CONVERTED_MARK
|
|
|
def detect_converted_mark(userinput):
    """Return True if ``userinput`` ends with ALREADY_CONVERTED_MARK.

    Args:
        userinput: The text to inspect.
    """
    return userinput.endswith(ALREADY_CONVERTED_MARK)
|
|
|
|
|
|
|
def detect_language(code):
    """Split a leading language line off a code snippet.

    Args:
        code: Snippet whose first line may name a language.

    Returns:
        ``(language, rest)``. If ``code`` starts with a newline or its first
        line (after stripping) is empty, the result is ``("", code)``.
        Otherwise ``language`` is that first line lower-cased and ``rest`` is
        ``code`` with ``len(first_line)`` leading characters removed and
        left-stripped.
    """
    header = "" if code.startswith("\n") else code.strip().split("\n", 1)[0]
    if not header:
        return "", code
    return header.lower(), code[len(header):].lstrip()
|
|
|
def convert_to_markdown(text):
    """Prepare plain text so Markdown rendering keeps its layout.

    Outside fenced code blocks, leading tabs/spaces become HTML entities, a
    leading "#" is escaped, and each line ends with a Markdown hard break
    (two spaces). Lines inside code fences are passed through unchanged.
    "$" is replaced by its HTML entity in the whole text.

    The entities had degraded to their literal characters, which made the
    "$" replacement and the leading-whitespace helper no-ops; they are
    restored here, together with the two-space hard break.

    Args:
        text: The raw text.

    Returns:
        The converted text; every input line is terminated by "\\n".
    """
    text = text.replace("$", "&#36;")

    def replace_leading_tabs_and_spaces(line):
        """Replace the run of leading tabs/spaces with HTML entities."""
        new_line = []
        for char in line:
            if char == "\t":
                new_line.append("&#9;")
            elif char == " ":
                new_line.append("&nbsp;")
            else:
                break
        # len(new_line) equals the number of leading characters consumed.
        return "".join(new_line) + line[len(new_line):]

    converted = []
    in_code_block = False
    for line in text.split("\n"):
        if line.startswith("```"):
            # A fence line toggles code-block mode and is emitted unchanged.
            in_code_block = not in_code_block
            converted.append(f"{line}\n")
        elif in_code_block:
            converted.append(f"{line}\n")
        else:
            line = replace_leading_tabs_and_spaces(line)
            line = re.sub(r"^(#)", r"\\\1", line)
            # Two trailing spaces force a hard line break in Markdown.
            converted.append(f"{line}  \n")

    return "".join(converted)
|
|
|
def add_language_tag(text):
    """Add a guessed language tag to fenced code blocks that have none.

    Args:
        text: Markdown text that may contain ``` fenced code blocks.

    Returns:
        The text after every matched block has been rewritten by the
        replacement callback.
    """
    fence_pattern = re.compile(r"(```)(\w*\n[^`]+```)", re.MULTILINE)

    def guess_language(code_block):
        """Return the lower-cased Pygments lexer name, or "" if none is found."""
        try:
            return guess_lexer(code_block).name.lower()
        except ClassNotFound:
            return ""

    def replacement(match):
        # Group 2 holds the optional tag, the code and the closing fence.
        code_block = match.group(2)
        if not code_block.startswith("\n"):
            # Already tagged: reassemble from the captured parts.
            return match.group(1) + code_block + "```"

        language = guess_language(code_block)
        if language:
            return f"```{language}{code_block}```"
        return f"```\n{code_block}```"

    return fence_pattern.sub(replacement, text)
|
|
|
def delete_last_conversation(chatbot, history):
    """Drop the most recent entry from both ``chatbot`` and ``history``.

    Both lists are modified in place; empty lists are left untouched.

    Args:
        chatbot: List of displayed chat entries.
        history: List of stored history entries.

    Returns:
        ``(chatbot, history, "Delete Done")``.
    """
    for log in (chatbot, history):
        if len(log) > 0:
            log.pop()
    return chatbot, history, "Delete Done"
|
|
|
def reset_state():
    """Return two fresh empty lists and the status text "Reset Done"."""
    cleared_chatbot = []
    cleared_history = []
    return cleared_chatbot, cleared_history, "Reset Done"
|
|
|
def reset_textbox():
    """Return a Gradio update that sets the value to "" plus an empty string."""
    cleared_box = gr.update(value="")
    return cleared_box, ""
|
|
|
def cancel_outputing():
    """Return the status text "Stop Done"."""
    status = "Stop Done"
    return status
|
|
|
|
|
|
|
|
|
def analyze_file(file):
    """Return the part of ``file.name`` after the last dot.

    If the name contains no dot, the whole name is returned.

    Args:
        file: Object exposing a ``name`` attribute (string).
    """
    pieces = file.name.rsplit('.', 1)
    return pieces[-1]
|
|
|
|
|
|
|
def get_filename(file_pfad):
    """Return the part of ``file_pfad`` after the last "/".

    Args:
        file_pfad: A path string using "/" as separator.

    Returns:
        The file name, or a German error text if the path contains no "/".
    """
    _, separator, tail = file_pfad.rpartition('/')
    if not separator:
        return "Ein Fehler im Filenamen ist aufgetreten..."
    return tail
|
|
|
|
|
|
|
|
|
def submit_message(assistant_id, thread, client, user_message):
    """Post a user message to ``thread`` and start a run for the assistant.

    Args:
        assistant_id: Id of the assistant that should run.
        thread: Thread object exposing ``id``.
        client: Client exposing ``beta.threads``.
        user_message: Content of the user message.

    Returns:
        The object returned by ``client.beta.threads.runs.create``.
    """
    threads_api = client.beta.threads
    threads_api.messages.create(
        thread_id=thread.id,
        role="user",
        content=user_message,
    )
    return threads_api.runs.create(
        thread_id=thread.id,
        assistant_id=assistant_id,
    )
|
|
|
def get_response(thread, client, assi_id):
    """List the messages of ``thread`` in ascending order.

    Args:
        thread: Thread object exposing ``id``.
        client: Client exposing ``beta.threads.messages``.
        assi_id: Accepted but not used by this function.

    Returns:
        The object returned by ``client.beta.threads.messages.list``.
    """
    messages_api = client.beta.threads.messages
    return messages_api.list(thread_id=thread.id, order="asc")
|
|
|
def create_thread_and_run(user_input, client, assi_id):
    """Create a new thread, submit ``user_input`` to it and start a run.

    Args:
        user_input: Content of the first user message.
        client: Client exposing ``beta.threads``.
        assi_id: Id of the assistant that should run.

    Returns:
        ``(thread, run)``.
    """
    new_thread = client.beta.threads.create()
    started_run = submit_message(assi_id, new_thread, client, user_input)
    return new_thread, started_run
|
|
|
def pretty_print(messages):
    """Print a heading, one "role: text" line per message, then a blank line.

    Args:
        messages: Iterable of objects exposing ``role`` and
            ``content[0].text.value``.
    """
    print("# Messages")
    for message in messages:
        first_part = message.content[0]
        print(f"{message.role}: {first_part.text.value}")
    print()
|
|
|
|
|
def wait_on_run(run, thread, client):
    """Poll a run until its status is neither "queued" nor "in_progress".

    The run is re-fetched and then the function sleeps 0.5 s per iteration.

    Args:
        run: Run object exposing ``status`` and ``id``.
        thread: Thread object exposing ``id``.
        client: Client exposing ``beta.threads.runs.retrieve``.

    Returns:
        The most recently retrieved run (or ``run`` itself if no polling was
        needed).
    """
    pending_states = ("queued", "in_progress")
    while run.status in pending_states:
        run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
        time.sleep(0.5)
    return run
|
|
|
|
|
|
|
def tavily_search(tavily_client, query):
    """Fetch search context for ``query`` from the given Tavily client.

    Args:
        tavily_client: Client exposing ``get_search_context``.
        query: The search query.

    Returns:
        The value returned by ``get_search_context`` (called with
        ``search_depth="advanced"`` and ``max_tokens=8000``).
    """
    return tavily_client.get_search_context(
        query,
        search_depth="advanced",
        max_tokens=8000,
    )
|
|
|
|
|
|
|
def hugchat_search(chatbot, query):
    """Run a web-search-enabled query on ``chatbot``.

    Args:
        chatbot: Object exposing ``query(text, web_search=True)``.
        query: The search query.

    Returns:
        ``(text, link)`` taken from the query result.
    """
    outcome = chatbot.query(query, web_search=True)
    return outcome.text, outcome.link
|
|
|
|
|
|
|
def openai_assistant_suche(client):
    """Create an assistant that declares a ``tavily_search`` function tool.

    Args:
        client: Client exposing ``beta.assistants.create``.

    Returns:
        The assistant object returned by the client.
    """
    # JSON schema of the tool's single required argument.
    query_schema = {
        "type": "object",
        "properties": {
            "query": {"type": "string", "description": "Die Suchanfrage, die die KI nicht beantworten konnte, hier hinein"},
        },
        "required": ["query"],
    }
    search_tool = {
        "type": "function",
        "function": {
            "name": "tavily_search",
            "description": "Get information on recent events from the web.",
            "parameters": query_schema,
        },
    }
    return client.beta.assistants.create(
        instructions=template,
        model="gpt-4-1106-preview",
        tools=[search_tool],
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_picture(history, prompt):
    """Generate an image with DALL-E 3 and return it as a PIL image.

    Fixes over the previous version:
      * ``Image`` at module level is rebound by the later
        ``reportlab.platypus`` import, which has no ``open``; PIL's Image is
        therefore imported locally under its own name.
      * The download is read from ``response.content`` instead of
        ``response.raw``, which is only usable with ``stream=True``.
      * HTTP errors are raised and the download has a timeout.

    Args:
        history: Accepted but not used by this function.
        prompt: Text description of the image.

    Returns:
        The downloaded image opened with PIL.

    Raises:
        requests.HTTPError: If the image download returns an error status.
    """
    # Local import: avoids the module-level name clash described above.
    from PIL import Image as PILImage

    # NOTE(review): ``OpenAI`` is not imported in the visible part of this
    # file; confirm that ``from openai import OpenAI`` exists elsewhere.
    client = OpenAI()
    response = client.images.generate(
        model="dall-e-3",
        prompt=prompt,
        size="1024x1024",
        quality="standard",
        n=1,
    )
    image_url = response.data[0].url

    download = requests.get(image_url, timeout=60)
    download.raise_for_status()

    # Decode from an in-memory buffer holding the complete body.
    return PILImage.open(io.BytesIO(download.content))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def verarbeite_text_und_aufzaehlungen(text, styles):
    """Convert text into ReportLab flowables, detecting bullet/numbered lines.

    Lines starting (after left-stripping) with "-", "*", "•" or "<digits>."
    become list items; consecutive other lines are joined with spaces into
    one paragraph. Both kinds use ``styles['BodyText']``.

    Args:
        text: The text to convert; lines are separated by "\\n".
        styles: Style mapping that provides a 'BodyText' entry.

    Returns:
        A list of Paragraph and ListFlowable objects in document order.
    """
    aufzaehlungszeichen = ['-', '*', '•']
    nummerierung_regex = r"^\s*\d+\.\s*"
    body_style = styles['BodyText']

    def make_list(items):
        """Wrap collected list items in a bullet ListFlowable."""
        return ListFlowable(items, bulletType='bullet', start='bulletchar', bulletFontName='Helvetica')

    elements = []
    list_items = []
    paragraph_text = []

    for zeile in text.split('\n'):
        stripped = zeile.lstrip()
        is_numbered = re.match(nummerierung_regex, stripped)
        bullet = next((z for z in aufzaehlungszeichen if stripped.startswith(z)), None)

        if bullet is None and not is_numbered:
            # Ordinary text: close a pending list, then collect the line.
            if list_items:
                elements.append(make_list(list_items))
                list_items = []
            paragraph_text.append(zeile)
            continue

        # List line: close a pending paragraph first.
        if paragraph_text:
            elements.append(Paragraph(' '.join(paragraph_text), body_style))
            paragraph_text = []

        # Strip the numbering or the bullet character from the item text.
        if is_numbered:
            cleaned_line = re.sub(nummerierung_regex, '', stripped, count=1).lstrip()
        else:
            cleaned_line = stripped[len(bullet):].lstrip()
        list_items.append(ListItem(Paragraph(cleaned_line, body_style)))

    # Flush whatever is still pending at the end of the text.
    if paragraph_text:
        elements.append(Paragraph(' '.join(paragraph_text), body_style))
    if list_items:
        elements.append(make_list(list_items))

    return elements
|
|
|
|
|
|
|
|
|
def on_each_page(canvas, doc):
    """Draw today's date (YYYY-MM-DD) right-aligned near the top of an A4 page.

    The date is also printed to stdout.

    Args:
        canvas: Canvas to draw on.
        doc: Accepted but not used by this function.
    """
    page_width, page_height = A4
    current_date = datetime.now().strftime("%Y-%m-%d")

    canvas.saveState()
    canvas.setFont('Times-Roman', 10)
    print(current_date)
    # Anchor point: 72 units from the right edge, 28 units below the top.
    canvas.drawRightString(page_width - 72, page_height - 28, current_date)
    canvas.restoreState()
|
|
|
|
|
def erstellePdf(file_path_download, ueberschrift, dic_history):
    """Write the chat history as an A4 PDF to ``file_path_download``.

    The document starts with a title. Each history entry then contributes a
    "Nutzer:" heading with the user text, an "Assistent:" heading with the
    assistant text (converted by verarbeite_text_und_aufzaehlungen) and a
    separator line.

    Args:
        file_path_download: Target path of the PDF.
        ueberschrift: Title text.
        dic_history: Mapping of user text to assistant text.
    """
    styles = getSampleStyleSheet()
    styles.add(ParagraphStyle('NewStyle', fontName='Helvetica', fontSize=11))

    line_style = ParagraphStyle(
        'LineStyle', fontSize=4, leading=6, borderPadding=0,
        spaceBefore=0, spaceAfter=0, textColor='black',
    )
    # Separate stylesheet instance handed to the list/paragraph converter.
    list_style = getSampleStyleSheet()

    # Heading flowables are created once and appended for every entry.
    headline_nutzer = Paragraph('Nutzer:', styles['Heading3'])
    headline_assi = Paragraph('Assistent:', styles['Heading3'])

    elements = [Paragraph(ueberschrift, styles['Title'])]

    for nutzer, assi in dic_history.items():
        elements.append(headline_nutzer)
        elements.append(Paragraph(nutzer, styles['NewStyle']))
        elements.append(Spacer(1, 2*mm))
        elements.append(headline_assi)

        for elem in verarbeite_text_und_aufzaehlungen(assi, list_style):
            # Nested lists are flattened into the story.
            if isinstance(elem, list):
                elements.extend(elem)
            else:
                elements.append(elem)

        # Separator between history entries: space, thin line, space.
        elements.append(Spacer(1, 8*mm))
        elements.append(Paragraph('_' * 100, line_style))
        elements.append(Spacer(1, 8*mm))

    doc = CustomDocTemplate(file_path_download, pagesize=A4)
    # NOTE(review): confirm that the template actually reads ``onPage``.
    doc.onPage = on_each_page
    doc.build(elements)
|
|
|
|
|
|
|
|
|
def hash_input(input_string):
    """Return the SHA-256 hex digest of ``input_string``.

    Args:
        input_string: Text to hash; it is encoded with ``str.encode()``
            defaults before hashing.
    """
    digest = hashlib.sha256()
    digest.update(input_string.encode())
    return digest.hexdigest()
|
|
|
|
|
|
|
|
|
def transfer_input(inputs):
    """Pass the input on, clear the textbox and make a button visible.

    The former ``textbox = reset_textbox()`` call was removed: its result was
    never used.

    Args:
        inputs: The submitted user input.

    Returns:
        ``(inputs, textbox update with value "", button update with
        visible=True)``.
    """
    # NOTE(review): ``gr.Button.update`` depends on the installed Gradio
    # version; confirm it is available in the version in use.
    return (
        inputs,
        gr.update(value=""),
        gr.Button.update(visible=True),
    )
|
|
|
|
|
|
|
|
|
|
|
class State:
    """Carries a boolean ``interrupted`` flag that can be set and cleared."""

    # Class-level default; interrupt()/recover() set the flag on the instance.
    interrupted = False

    def interrupt(self):
        """Set ``interrupted`` to True on this instance."""
        self.interrupted = True

    def recover(self):
        """Set ``interrupted`` to False on this instance."""
        self.interrupted = False


# Module-level instance of State.
shared_state = State()
|
|
|
|
|
|
|
|
|
def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
    """Tell whether ``s`` ends with a stop word or with a prefix of one.

    Args:
        s: The text to check.
        stop_words: Candidate stop words.

    Returns:
        True if ``s`` ends with any stop word, or with any non-empty proper
        prefix of a stop word; otherwise False.
    """

    def ends_with_word_or_prefix(word):
        """Check the full word first, then its prefixes of length 1..len-1."""
        if s.endswith(word):
            return True
        return any(s.endswith(word[:cut]) for cut in range(1, len(word)))

    return any(ends_with_word_or_prefix(word) for word in stop_words)
|
|
|
|
|
|
|
|
|
|
|
|
|
class CustomDocTemplate(SimpleDocTemplate):
    """SimpleDocTemplate that draws the current date at the start of a page."""

    def handle_pageBegin(self):
        """Run the base page-begin handling, then draw today's date."""
        self._handle_pageBegin()

        current_date = datetime.now().strftime("%Y-%m-%d")

        # Save/restore so the font change does not affect later drawing.
        canvas = self.canv
        canvas.saveState()
        canvas.setFont('Helvetica', 10)
        # Right-aligned at x=550, y=800.
        canvas.drawRightString(550, 800, current_date)
        canvas.restoreState()
|
|
|
|
|
|