Spaces:
Sleeping
Sleeping
import logging | |
import os | |
import warnings | |
import gradio as gr | |
import pandas as pd | |
from langchain_core.output_parsers import StrOutputParser | |
from langchain_core.prompts import ChatPromptTemplate | |
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage | |
from langchain_upstage import ChatUpstage | |
from rapidfuzz import process | |
import pymupdf | |
# Install a blanket "ignore" filter: no category or module is given, so it
# applies to every warning raised after this point.
warnings.filterwarnings("ignore")
# Read eagerly with item access, so a missing variable raises KeyError at
# import time. The name is not referenced again in the visible code;
# presumably ChatUpstage picks the key up from the environment itself —
# TODO confirm.
UPSTAGE_API_KEY = os.environ["UPSTAGE_API_KEY"]
# Title and description handed to the text-translation gr.Interface below.
# NOTE(review): the non-ASCII text in these literals looks mis-encoded in
# this copy of the file — verify against the original source.
TITLE = "์ฌ์ด ์ ๋ฌธ์ฉ์ด ๋ฒ์ญ๊ธฐ"
DESCRIPTION = """
์ฐ๋ฆฌ๊ฐ easyword.kr์ ๋ชจ์ ์ฌ์ด ์ ๋ฌธ์ฉ์ด๋ค์ ์ฌ์ฉํ์ฌ, ์๋์ผ๋ก ๋ฌธ์ฅ์ ๋ฒ์ญํด์ค๋๋ค.
๋ฒ์ญ์ ์๋์ผ๋ก ํ๊ธฐ ์ํด ์ ์คํ ์ด์ง์ Solar ๋ํ ์ธ์ด ๋ชจ๋ธ (large language model, LLM)์ ์ฌ์ฉํฉ๋๋ค.
์ฐ๋ฆฌ ๋ฒ์ญ๊ธฐ๋ ์ต๋ํ ์ฌ์ด ์ ๋ฌธ์ฉ์ด ์์น์ ๋ฐ๋ฅด๋ฉฐ, ์๋ฌธ ์ ๋ฌธ์ฉ์ด๋ ๋ฒ์ญ๋ ์ฌ์ด๋ง ๋ค์ ๊ดํธ ์์ ๋ฐ๋ผ๋ถ์ ๋๋ค.
""".strip()
# Multiplier applied to a sentence's whitespace-separated word count to get
# the default match limit in find_jargons().
LIMIT_FACTOR = 2.5
# Value passed as score_cutoff to rapidfuzz's process.extract in
# find_jargons().
SCORE_CUTOFF = 60.0
# Upper bound on the extra LLM calls translate() makes while its output
# still lacks "[" or "]".
MAX_RETRIES = 4
# Append records of level DEBUG and above to a file named after this script
# with ".log" appended.
logging.basicConfig(
    filename=f"{__file__}.log",
    filemode="a",
    format="%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s",
    datefmt="%H:%M:%S",
    level=logging.DEBUG,
)
# The logger is named after the script path rather than the module name.
logger = logging.getLogger(__file__)
# Jargon dictionary, read from the current working directory at import time.
df = pd.read_csv("./dictionary.csv")
# One row per distinct "name"; for every other column the values of the
# grouped rows are stringified and joined with ",".
grouped_df = df.groupby("name").agg(lambda x: ",".join(map(str, x)))
# Drop the "Unnamed: 0" column (presumably an index column written when the
# CSV was saved — TODO confirm). Raises KeyError if the column is absent.
grouped_df = grouped_df.drop(columns=["Unnamed: 0"])
def find_jargons(sentence: str, limit: int | None = None) -> list[str]:
    """Return dictionary entry names that fuzzily match *sentence*.

    Args:
        sentence: Text to match against the index of ``grouped_df``.
        limit: Maximum number of matches to request. When ``None`` it is
            the whitespace-separated word count of *sentence* multiplied by
            ``LIMIT_FACTOR`` and truncated to an int.

    Returns:
        The first element of every result tuple that ``process.extract``
        returns for a score cutoff of ``SCORE_CUTOFF``.
    """
    max_matches = (
        int(len(sentence.split()) * LIMIT_FACTOR) if limit is None else limit
    )
    matches = process.extract(
        sentence,
        grouped_df.index,
        limit=max_matches,
        score_cutoff=SCORE_CUTOFF,
    )
    # Only the leading element of each result tuple is kept.
    return [name for name, *_ in matches]
def recommend_prompt(jargon: str) -> str:
    """Build the prompt fragment pairing *jargon* with its dictionary entry.

    Looks up the row labelled *jargon* in ``grouped_df`` and embeds its
    ``name_trans`` value next to the quoted jargon. A label that is not in
    the index makes the ``.loc`` lookup raise ``KeyError``.
    """
    dictionary_row = grouped_df.loc[jargon]
    suggested_term = dictionary_row.name_trans
    return f"'{jargon}'์ '{suggested_term}'๋ก"
# Chat model client constructed with its default arguments.
llm = ChatUpstage()
# Rebind `llm` to the model piped into StrOutputParser. translate() passes
# the results of llm.invoke() straight into message content and str methods,
# i.e. it treats them as plain strings.
llm = llm | StrOutputParser()
def chainer(messages):
    """Return a pipeline: prompt built from *messages*, then ``llm``, then a string parser.

    ``messages`` is handed unchanged to ``ChatPromptTemplate.from_messages``.
    """
    prompt = ChatPromptTemplate.from_messages(messages)
    pipeline = prompt | llm
    return pipeline | StrOutputParser()
# System prompt: translate() sends it as the SystemMessage that opens every
# conversation. The leading and trailing newlines of the literal are kept
# (no .strip() here).
# NOTE(review): the text looks mis-encoded in this copy of the file — verify
# against the original source before relying on its wording.
SYSTEM_PROMPT = """
๋๋ ์ปดํจํฐ ๊ณผํ ๋ฐ ๊ณตํ ๋ถ์ผ์ ์ ๋ฌธ ์ฉ์ด๋ฅผ ์ฌ์ด ์ฐ๋ฆฌ๋ง๋ก ๋ฒ์ญํด์ฃผ๋ ๋ฒ์ญ๊ฐ์ผ.
์ ๋ฌธ์ฉ์ด์ ์๋ฏธ๋ฅผ ์ ํํ ์ดํดํ๊ณ , ๊ทธ ์๋ฏธ๊ฐ ์ ํํ ์ ๋ฌ๋๋ ์ฌ์ด๋ง์ ์ฐพ์์ผ ํด.
์ง๋ ๊ฒ๋จน๊ฒํ๋ ์ฉ์ด(๋ถํ์ํ ํ๋ฌธํฌ)๋ฅผ ํผํ๊ณ , ๊ฐ๋ฅํ๋ฉด ์ฌ์ด๋ง์ ์ฐพ์์ผ ํด.
์๋ฌธ ์ ๋ฌธ์ฉ์ด๋ ํด๋น ์ฐ๋ฆฌ๋ง ๋ค์์ ๊ดํธ์์ ํญ์ ๋ฐ๋ผ๋ถ์ฌ์ผ ํด.
๊ธฐ์กด์ ๊ถ์์ ์ฝ๋งค์ด์ง ์๊ณ , ๊ธฐ์กด ์ฉ์ด์ฌ์ ์ด๋ ์ด๋ฏธ ๋๋ฆฌํผ์ง ์ฉ์ด์ง๋ง ์ฝ์ง์๋ค๋ฉด, ๋ณด๋ค ์ฌ์ด ์ ๋ฌธ์ฉ์ด๋ฅผ ์ฐพ์์ผ ํด.
์ด๋, ๊ธฐ์กด์ฉ์ด๋ ์๋ฌธ ์ ๋ฌธ์ฉ์ด์ ํจ๊ป ๊ดํธ์์ ๋ฐ๋ผ๋ถ์ฌ์ผ ํด.
์ฌ์ด๋ง์ ์์ ์ฐ๋ฆฌ๋ง์ ๋ปํ์ง ์์. ์ธ๋์ด๋ผ๋ ๋๋ฆฌ ์ฝ๊ฒ ๋ฐ์๋ค์ฌ์ง๋ค๋ฉด ์ฌ์ฉํด.
"""
# Few-shot example for translate(): SAMPLE_SENTENCE is sent as the first
# user request and SAMPLE_TRANSLATION as the assistant's answer to it, so the
# model sees the expected "term[English source term]" output format.
SAMPLE_SENTENCE = "In functional programming, continuation-passing style (CPS) is a style of programming in which control is passed explicitly in the form of a continuation."
# The bracketed English terms must match the wording of SAMPLE_SENTENCE
# exactly; the last one was previously misspelled as "continutaion".
# NOTE(review): the non-ASCII text looks mis-encoded in this copy of the
# file — verify against the original source.
SAMPLE_TRANSLATION = "๊ฐ์ค์ฌ ํ๋ก๊ทธ๋๋ฐ[functional programming]์์, ๋ง์ ํ ์ผ ์ ๋ฌํ๊ธฐ[continuation-passing style, CPS]๋ ์คํํ๋ฆ[control]์ด ์ง์ ๋ง์ ํ ์ผ[continuation]์ ํํ๋ก ์ ๋ฌ๋๋ ํ๋ก๊ทธ๋๋ฐ ์คํ์ผ์ด๋ค."
def translate(sentence: str) -> str:
    """Translate *sentence* through a multi-round conversation with ``llm``.

    Rounds, all appended to one growing message list:
      1. A few-shot example (``SAMPLE_SENTENCE`` / ``SAMPLE_TRANSLATION``)
         followed by the request to translate *sentence*.
      2. A question about which of the terms found by ``find_jargons`` were
         used in that first translation.
      3. A request to translate again using the dictionary suggestions
         produced by ``recommend_prompt`` for those terms.
      4. While the answer lacks "[" or "]", up to ``MAX_RETRIES`` follow-up
         requests; an exception during a follow-up is logged and ends the
         retries with the last answer kept.

    Returns:
        The last translation with "[" and "]" replaced by "(" and ")".
    """
    conversation = [
        SystemMessage(content=SYSTEM_PROMPT),
        HumanMessage(
            content=f"์ ๋ฌธ ์ฉ์ด๋ฅผ ๋ฒ์ญํ ๋๋ ๋ฐ๋์ ์์ด๋ฅผ ๊ดํธ[]์ ๋ฃ์ด์ ๋ฐ๋ผ ๋ถ์ฌ์ผ ํด. ์ด ๋ฌธ์ฅ์ ๋ฒ์ญํด์ค: '{SAMPLE_SENTENCE}'"
        ),
        AIMessage(content=SAMPLE_TRANSLATION),
        HumanMessage(
            content=f"์ ๋ฌธ ์ฉ์ด๋ฅผ ๋ฒ์ญํ ๋๋ ๋ฐ๋์ ์์ด๋ฅผ ๊ดํธ[]์ ๋ฃ์ด์ ๋ฐ๋ผ ๋ถ์ฌ์ผ ํด. ์ด ๋ฌธ์ฅ์ ๋ฒ์ญํด์ค: '{sentence}'"
        ),
    ]

    # Round 1: plain translation.
    first_draft = llm.invoke(conversation)
    logger.info(first_draft)

    # Round 2: ask which dictionary candidates appear in the draft.
    candidates = find_jargons(sentence)
    candidate_list = ", ".join(candidates)
    conversation.append(AIMessage(content=first_draft))
    conversation.append(
        HumanMessage(
            content=f"๋ฐฉ๊ธ ๋ฒ์ญํ ๋ฌธ์ฅ์์ '{candidate_list}' ์ค ์ฌ์ฉํ ์ฉ์ด๊ฐ ์๋ค๋ฉด, ์ด๋ค ์ฉ์ด๋ค๋ก ๋ฒ์ญํ๋์ง ๋งํด์ค. ์ฌ์ฉํ์ง ์์ ์ฉ์ด๋ค์ ๋ฌด์ํด๋ ๋ผ."
        )
    )
    usage_report = llm.invoke(conversation)
    logger.info(usage_report)

    # Round 3: translate again with the dictionary suggestions as guidance.
    recommendations = ", ".join(map(recommend_prompt, candidates))
    conversation.append(AIMessage(content=usage_report))
    conversation.append(
        HumanMessage(
            content=f"์ด๋ฒ์๋ ์ฒ์ ๋ฒ์ญํ๋ ๋ฌธ์ฅ์ '{sentence}'๋ฅผ ๋ค์ ๋ฒ์ญํด์ฃผ๋๋ฐ, ๋ค์ ๋ชฉ๋ก์ ๋์จ ์ฌ์ด ์ ๋ฌธ์ฉ์ด ๋ฒ์ญ ์์๋ฅผ ์ฐธ๊ณ ํด์ ๋ฒ์ญ์ ํด์ค: '{recommendations}' ์ฌ์ฉํ์ง ์์ ์ฉ์ด๋ค์ ๋ฌด์ํด๋ ๋ผ. ์ถ๊ฐ ์ค๋ช ์์ด ๋ฌธ์ฅ๋ง ๋ฒ์ญํด. ์ฌ์ฉ๋ ์์ด๋ฅผ ์ฉ์ด ๋ฐ๋ก ๋ค์ ๊ดํธ []์ ๋ฃ์ด์ ๋ฐ๋ผ ๋ถ์ฌ์ค."
        )
    )
    refined_translation = llm.invoke(conversation)
    logger.info(refined_translation)

    # Round 4: at most MAX_RETRIES follow-ups while either bracket is missing.
    for _ in range(MAX_RETRIES):
        if "[" in refined_translation and "]" in refined_translation:
            break
        conversation.append(AIMessage(content=refined_translation))
        conversation.append(
            HumanMessage(
                content=f"์ ๋ฌธ์ฉ์ด๋ฅผ ๋ฒ์ญํ์ผ๋ฉด ๋ฐ๋์ ์์ด๋ฅผ ๊ดํธ[]์ ๋ฃ์ด์ ๋ฐ๋ผ ๋ถ์ฌ์ผ ํด. '์คํํ๋ฆ[control]'์ฒ๋ผ. ๋ฐฉ๊ธ ๋ฒ์ญํ '{refined_translation}'์์, ์๋ ๋ฌธ์ฅ '{sentence}'์ ์ฌ์ฉ๋ ์์ด๋ฅผ ์ฉ์ด ๋ฐ๋ก ๋ค์ ๊ดํธ []์ ๋ฃ์ด์ ๋ฐ๋ผ ๋ถ์ฌ์ค."
            )
        )
        try:
            refined_translation = llm.invoke(conversation)
        except Exception as e:
            # Best effort: keep the previous answer and stop retrying.
            logger.error(e)
            break
        logger.info(refined_translation)

    # Square brackets are only the intermediate marker; output uses parentheses.
    return refined_translation.replace("[", "(").replace("]", ")")
class PDFFile:
    """Queue uploaded PDF paths and translate their text into ``.txt`` files."""

    # Number of "."-separated pieces sent to translate() per call.
    _SENTENCES_PER_CHUNK = 50

    def __init__(self):
        # Paths queued by upload_file(); emptied when transalte_pdf() finishes.
        self.file_list = []

    def read_pdf(self, file_path: str) -> str:
        """Return the text of every page of the PDF at *file_path*, concatenated."""
        # The context manager closes the document even when extraction raises.
        with pymupdf.open(file_path) as document:
            return "".join(page.get_text() for page in document)

    def remove_line_breaks(self, text: str) -> str:
        """Replace single line breaks inside running text with spaces.

        A newline is kept when it is the first or last character, when it
        follows a period, or when it is adjacent to another newline, so
        paragraph breaks (two or more consecutive newlines) survive intact.

        Args:
            text: Text to clean up.

        Returns:
            A string of the same length with the qualifying newlines
            replaced by spaces.
        """
        chars = list(text)
        # The first and last characters are never touched.
        for i in range(1, len(text) - 1):
            if (
                text[i] == "\n"
                and text[i - 1] not in (".", "\n")
                and text[i + 1] != "\n"
            ):
                chars[i] = " "
        return "".join(chars)

    def upload_file(self, file_path: str) -> list[str]:
        """Queue *file_path* for translation and return the current queue."""
        self.file_list.append(file_path)
        return self.file_list

    def transalte_pdf(
        self,
        remove_line_breaks: bool,
        save_before_translation: bool,
    ) -> "list[str] | str":
        """Translate every queued PDF and write the results beside the sources.

        Args:
            remove_line_breaks: Run the extracted text through
                ``remove_line_breaks`` before translating.
            save_before_translation: Also write the extracted text to
                ``<name>_pre.txt``.

        Returns:
            The list of written file paths, or the message
            ``"No file uploaded yet."`` when nothing is queued.
        """
        if not self.file_list:
            # NOTE(review): this message is returned to a gr.File output in
            # the UI wiring; confirm that component accepts a non-path string.
            return "No file uploaded yet."

        chunk_size = self._SENTENCES_PER_CHUNK
        file_out_list = []
        for file in self.file_list:
            directory = os.path.dirname(file)
            # splitext keeps the name of files that have no extension.
            stem = os.path.splitext(os.path.basename(file))[0]

            pdf_text = self.read_pdf(file)
            if remove_line_breaks:
                pdf_text = self.remove_line_breaks(pdf_text)

            if save_before_translation:
                # os.path.join avoids a root-level path when directory is "".
                pre_path = os.path.join(directory, f"{stem}_pre.txt")
                with open(pre_path, "w", encoding="utf-8") as f:
                    f.write(pdf_text)
                file_out_list.append(pre_path)

            # Translate in chunks of "."-separated pieces so a single request
            # never carries the whole document.
            sentences = pdf_text.split(".")
            translation = "".join(
                translate(".".join(sentences[i : i + chunk_size])) + ". "
                for i in range(0, len(sentences), chunk_size)
            )

            translated_path = os.path.join(directory, f"{stem}_translated.txt")
            # Explicit UTF-8 so non-ASCII output does not depend on the locale.
            with open(translated_path, "w", encoding="utf-8") as f:
                f.write(translation)
            file_out_list.append(translated_path)

        self.file_list = []
        return file_out_list
# Build the two-tab UI at import time; `demo` is what main() launches.
with gr.Blocks() as demo:
    # Tab 1: free-text translation, one textbox in and one textbox out,
    # backed directly by translate().
    with gr.Tab("TEXT"):
        chatbot = gr.Interface(
            fn=translate,
            inputs=gr.Textbox(label="Enter your text"),
            outputs=[gr.Textbox(label="Translation")],
            # Sample inputs offered in the UI; the first one is the same
            # sentence as SAMPLE_SENTENCE.
            # NOTE(review): the non-ASCII characters in two of these
            # examples look mis-encoded in this copy of the file.
            examples=[
                "In functional programming, continuation-passing style (CPS) is a style of programming in which control is passed explicitly in the form of a continuation.",
                "In computer science, abstract interpretation is a theory of sound approximation of the semantics of computer programs, based on monotonic functions over ordered sets, especially lattices.",
                "In computer science, functional programming is a programming paradigm where programs are constructed by applying and composing functions",
                "Lambda calculus (also written as ฮป-calculus) is a formal system in mathematical logic for expressing computation based on function abstraction and application using variable binding and substitution",
                "Operational semantics is a category of formal programming language semantics in which certain desired properties of a program, such as correctness, safety or security, are verified by constructing proofs from logical statements about its execution and procedures, rather than by attaching mathematical meanings to its terms (denotational semantics).",
                "In computing and computer programming, exception handling is the process of responding to the occurrence of exceptions โ anomalous or exceptional conditions requiring special processing โ during the execution of a program.",
                "The term redex, short for reducible expression, refers to subterms that can be reduced by one of the reduction rules.",
            ],
            title=TITLE,
            description=DESCRIPTION,
        )
    # Tab 2: PDF translation. Files are queued first, then translated in one
    # run.
    with gr.Tab("PDF"):
        # NOTE(review): a single PDFFile instance is created here at module
        # level, so its file_list is presumably shared by every user of the
        # app — confirm this is intended.
        pdf_file = PDFFile()
        upload_button = gr.UploadButton(
            label="Upload PDF",
            file_types=[".pdf"],
        )
        upload_file_list_box = gr.File(label="Uploaded Files")
        # On upload, pass the button's value to pdf_file.upload_file and show
        # the returned queue in the file box.
        upload_button.upload(pdf_file.upload_file, upload_button, upload_file_list_box)
        # The two checkboxes map positionally to transalte_pdf's
        # remove_line_breaks and save_before_translation parameters.
        run_translator = gr.Interface(
            fn=pdf_file.transalte_pdf,
            inputs=[
                gr.Checkbox(label="Remove line breaks"),
                gr.Checkbox(label="Save before translation"),
            ],
            outputs=[gr.File(label="Download Translated Files")],
        )
def main():
    """Launch the module-level ``demo`` app with ``share=True``."""
    demo.launch(share=True)


# Launch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()