Spaces:
Runtime error
Runtime error
| # https://qiita.com/nekoniii3/items/5acf764af65212d9f04f | |
| import gradio as gr | |
| import os | |
| from langchain_community.document_loaders import PyMuPDFLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_openai import ChatOpenAI | |
| from langchain_community.vectorstores import Chroma | |
| from langchain.chains import RetrievalQA | |
| # from langchain_openai import OpenAIEmbeddings | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| os.environ["TOKENIZERS_PARALLELISM"] = "false" | |
# NOTE: OPENAI_API_KEY must be provided through the environment (e.g. a Space secret); never commit the key to source control.
# Source PDFs that make up the knowledge base. The same names are compared
# against each retrieved chunk's "source" metadata later in the file.
file_name1 = 'ALV2_ALV3DTU操作マニュアルDTU-V3SET01.pdf'
file_name2 = 'ALV3PCサーバ_ソフトウェア操作マニュアル_画像ファイル名付.pdf'
file_name3 = '美和ロック総合カタログ第31版_前半.pdf'
file_name4 = '美和ロック総合カタログ第31版_後半.pdf'

# One loader per PDF.
loader1, loader2, loader3, loader4 = (
    PyMuPDFLoader(file_name1),
    PyMuPDFLoader(file_name2),
    PyMuPDFLoader(file_name3),
    PyMuPDFLoader(file_name4),
)

# Load every PDF into LangChain documents.
documents1, documents2, documents3, documents4 = (
    loader1.load(),
    loader2.load(),
    loader3.load(),
    loader4.load(),
)

# Split the documents into chunks of at most 1000 characters, without overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts1, texts2, texts3, texts4 = (
    text_splitter.split_documents(docs)
    for docs in (documents1, documents2, documents3, documents4)
)
texts = [*texts1, *texts2, *texts3, *texts4]

# Embed the chunks with a Hugging Face sentence-embedding model and index
# them in Chroma.
# (Earlier alternative: OpenAIEmbeddings(model="text-embedding-ada-002").)
embeddings = HuggingFaceEmbeddings(
    model_name="oshizo/sbert-jsnli-luke-japanese-base-lite"
)
vectordb = Chroma.from_documents(texts, embeddings)

# Retrieval QA chain: the retrieved chunks are "stuffed" into a single prompt,
# and the source documents are returned alongside the answer.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.05)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
)
| import shutil | |
def save_image_filepath(filepath: str):
    """Copy the file at *filepath* into the working directory.

    The copy is always named ``filepath`` plus the original file's extension
    (for example ``./filepath.png``), so a later call with the same extension
    overwrites the earlier copy. The source path is printed first.
    """
    print(filepath)
    # Save the image, keeping only the extension of the original name.
    extension = os.path.splitext(filepath)[1]
    destination = './filepath{}'.format(extension)
    shutil.copy(filepath, destination)
| import boto3 | |
# S3 client used to resolve bucket locations for public image URLs.
# Credentials are intentionally NOT hard-coded: boto3 resolves them through its
# default credential chain (the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
# environment variables, shared config files, or an attached role).
# SECURITY: the access key that used to be embedded here was published with
# the source and must be revoked and rotated.
s3 = boto3.client(
    's3',
    region_name='ap-northeast-1',
)
# Image URL output helper
def get_public_url(bucket, target_object_path):
    """
    Build the URL of an object stored in S3.

    Parameters
    ----------
    bucket: string
        Name of the S3 bucket.
    target_object_path: string
        Key (path) of the object inside the bucket.

    Returns
    ----------
    url: string
        URL of the object on S3.
    """
    from urllib.parse import quote

    bucket_location = s3.get_bucket_location(Bucket=bucket)
    region = bucket_location.get('LocationConstraint')
    # Percent-encode the key so spaces and non-ASCII characters yield a valid
    # URL; "/" is left intact so nested keys keep their path structure.
    encoded_path = quote(target_object_path)
    if not region:
        # S3 reports a null LocationConstraint for buckets in us-east-1;
        # use the global endpoint instead of emitting "s3-None".
        return "https://s3.amazonaws.com/{0}/{1}".format(bucket, encoded_path)
    return "https://s3-{0}.amazonaws.com/{1}/{2}".format(
        region,
        bucket,
        encoded_path)
| import fitz | |
# Open both manuals with PyMuPDF; doc2 is used later to read the note
# annotations attached to the server manual's pages.
doc1, doc2 = (fitz.open(pdf_name) for pdf_name in (file_name1, file_name2))
| import math | |
def _anchor(url, label):
    """Return an HTML link to *url* labelled *label*, with a leading space."""
    # The href is always quoted so URLs containing spaces or '>' stay intact.
    return ' <a href="' + url + '">' + label + '</a>'


with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()

    def user(user_message, history):
        """Answer *user_message* and append links to the retrieved sources.

        Runs the retrieval QA chain, then appends one link per source
        document to the answer text: page images on S3 for the two manuals
        (plus the images named by note annotations for the server manual) and
        the online catalogue viewer for the two catalogue PDFs.

        Returns a tuple of an empty string (clears the textbox) and the chat
        history extended with the ``[user_message, reply]`` pair.
        """
        answer = qa(user_message)
        reply = answer['result']
        for sd in answer["source_documents"]:
            source = str(sd.metadata["source"])
            # metadata["page"] + 1 is used as the human-readable page number,
            # so metadata["page"] itself is treated as the 0-based page index.
            page_index = sd.metadata["page"]
            page = page_index + 1
            page_num = str(page).zfill(3)
            if source == file_name1:
                # URL of the rasterised page image.
                url = get_public_url('page.dtu.manual',
                                     'page' + page_num + '_raster.png')
                reply += _anchor(url, page_num)
            elif source == file_name2:
                # URL of the rasterised page image.
                url = get_public_url('page.server.manual',
                                     'page' + page_num + '_raster.png')
                reply += _anchor(url, page_num)
                # URLs of the images referenced by the page's annotations.
                # Index with the 0-based page index: the 1-based page number
                # would read the following page and overflow on the last one.
                for annotation in doc2[page_index].annots():
                    # The note annotation's text is the S3 object key.
                    key = annotation.info.get('content', '')
                    if not key:
                        # No image name in this annotation; nothing to link.
                        continue
                    url = get_public_url('image.server.manual', key)
                    reply += _anchor(url, key)
            elif source == file_name3:
                # Two PDF pages per viewer page (same value as
                # floor(1 + page / 2)).
                page2 = str(1 + page // 2)
                url = "https://dcs.mediapress-net.com/iportal/cv.do?c=20958580000&pg="+page2+"&v=MIW10001&d=LINK_MIW"
                reply += _anchor(url, page2)
            elif source == file_name4:
                # The second-half PDF is numbered with an offset of 486 pages.
                page2 = str(1 + (486 + page) // 2)
                url = "https://dcs.mediapress-net.com/iportal/cv.do?c=20958580000&pg="+page2+"&v=MIW10001&d=LINK_MIW"
                reply += _anchor(url, page2)
            # Unknown sources simply get no link. Calling exit() here would
            # terminate the whole server from inside a request handler.
        return "", (history or []) + [[user_message, reply]]

    def bot(history):
        """Yield the chat history unchanged (the reply is built in ``user``)."""
        yield history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=True).then(
        bot, chatbot, chatbot
    )

demo.queue()
demo.launch(share=True)