# https://qiita.com/nekoniii3/items/5acf764af65212d9f04f
"""Gradio chatbot that answers questions from four PDF manuals/catalogs.

The PDFs are loaded with PyMuPDF, split into chunks, embedded with a
HuggingFace sentence-embedding model and stored in a Chroma vector store.
Each user question is answered by a LangChain ``RetrievalQA`` chain; links to
the pages (and, for the server manual, to images named in PDF annotations)
that the answer was drawn from are appended to the reply.

Credentials are NOT stored in this file:

* OpenAI: set ``OPENAI_API_KEY`` in the environment.
* AWS: supplied through boto3's default credential chain
  (``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` environment variables,
  shared credentials file, or an IAM role).

SECURITY: earlier revisions of this file embedded API keys in the source.
Any such key must be treated as compromised and rotated.
"""
import math
import os
import shutil
from functools import lru_cache
from urllib.parse import quote

import boto3
import fitz
import gradio as gr
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI

# Silence the HuggingFace tokenizers fork/parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# ---------------------------------------------------------------------------
# Source PDFs
# ---------------------------------------------------------------------------
file_name1 = 'ALV2_ALV3DTU操作マニュアルDTU-V3SET01.pdf'
file_name2 = 'ALV3PCサーバ_ソフトウェア操作マニュアル_画像ファイル名付.pdf'
file_name3 = '美和ロック総合カタログ第31版_前半.pdf'
file_name4 = '美和ロック総合カタログ第31版_後半.pdf'

loader1 = PyMuPDFLoader(file_name1)
loader2 = PyMuPDFLoader(file_name2)
loader3 = PyMuPDFLoader(file_name3)
loader4 = PyMuPDFLoader(file_name4)

documents1 = loader1.load()
documents2 = loader2.load()
documents3 = loader3.load()
documents4 = loader4.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts1 = text_splitter.split_documents(documents1)
texts2 = text_splitter.split_documents(documents2)
texts3 = text_splitter.split_documents(documents3)
texts4 = text_splitter.split_documents(documents4)
texts = texts1 + texts2 + texts3 + texts4

# ---------------------------------------------------------------------------
# Retrieval-QA chain
# ---------------------------------------------------------------------------
# A local HuggingFace model is used for embeddings (OpenAIEmbeddings would be
# the hosted alternative).
embeddings = HuggingFaceEmbeddings(model_name="oshizo/sbert-jsnli-luke-japanese-base-lite")
vectordb = Chroma.from_documents(texts, embeddings)

# ChatOpenAI reads the API key from the OPENAI_API_KEY environment variable.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.05)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
)


def save_image_filepath(filepath: str):
    """Copy *filepath* to ``./filepath<ext>`` in the working directory.

    The destination keeps the source file's extension; the source path is
    printed before copying.
    """
    print(filepath)
    # Save the image under a fixed base name, preserving the extension.
    _, file_extension = os.path.splitext(filepath)
    shutil.copy(filepath, './filepath{}'.format(file_extension))


# No keys are passed here on purpose: boto3 resolves credentials from the
# environment / shared config / IAM role.
s3 = boto3.client('s3', region_name='ap-northeast-1')


@lru_cache(maxsize=None)
def _bucket_region(bucket):
    """Return the bucket's LocationConstraint, looked up once per bucket."""
    return s3.get_bucket_location(Bucket=bucket)['LocationConstraint']


# Image URL output
def get_public_url(bucket, target_object_path):
    """Return the URL of an object stored in S3.

    Parameters
    ----------
    bucket: string
        Name of the S3 bucket.
    target_object_path: string
        Key (path) of the object inside the bucket.

    Returns
    ----------
    url: string
        URL of the object on S3.
    """
    region = _bucket_region(bucket)
    # Percent-encode the key so spaces / non-ASCII characters yield a valid URL.
    key = quote(target_object_path, safe="/")
    if not region:
        # S3 reports a null LocationConstraint for us-east-1 buckets; the
        # original format string would have produced "s3-None".
        return "https://s3.amazonaws.com/{0}/{1}".format(bucket, key)
    return "https://s3-{0}.amazonaws.com/{1}/{2}".format(region, bucket, key)


doc1 = fitz.open(file_name1)
doc2 = fitz.open(file_name2)


def _markdown_link(label, url):
    """Format a Markdown hyperlink for display in the chatbot."""
    return "[{0}]({1})".format(label, url)


def _catalog_link(viewer_page):
    """Return a link to page *viewer_page* (a string) of the online catalog."""
    url = ("https://dcs.mediapress-net.com/iportal/cv.do?c=20958580000&pg="
           + viewer_page + "&v=MIW10001&d=LINK_MIW")
    return _markdown_link(viewer_page, url)


def _reference_links(source_document):
    """Build the list of links for one retrieved source document.

    The link targets depend on which PDF the chunk came from; chunks from an
    unrecognised source yield no links.
    """
    source = str(source_document.metadata["source"])
    # PyMuPDFLoader stores the 0-based page index; the page images and the
    # displayed labels are 1-based and zero-padded to three digits.
    page_index = source_document.metadata["page"]
    page = page_index + 1
    page_num = str(page).zfill(3)
    page_image_key = 'page' + page_num + '_raster.png'

    if source == file_name1:
        # URL of the rendered page image.
        return [_markdown_link(page_num, get_public_url('page.dtu.manual', page_image_key))]

    if source == file_name2:
        # URL of the rendered page image.
        links = [_markdown_link(page_num, get_public_url('page.server.manual', page_image_key))]
        # URLs of the images referenced on this page: each annotation's
        # "content" text is used as the S3 object key.
        # fitz documents are indexed from 0, so use page_index (indexing with
        # the 1-based number read the next page and failed on the last one).
        for annotation in doc2[page_index].annots():
            key = annotation.info.get('content', '')
            if key:
                links.append(_markdown_link(key, get_public_url('image.server.manual', key)))
        return links

    if source == file_name3:
        # Two PDF pages map to one viewer page.
        # NOTE(review): presumably the viewer shows spreads - confirm.
        return [_catalog_link(str(math.floor(1 + page / 2)))]

    if source == file_name4:
        # NOTE(review): 486 is presumably the page count of the first-half
        # PDF, continuing the viewer numbering - confirm.
        return [_catalog_link(str(math.floor(1 + (486 + page) / 2)))]

    # Unknown source: contribute nothing rather than terminating the server.
    return []


with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()

    def user(user_message, history):
        """Answer *user_message* and append the exchange to *history*.

        Returns an empty string (to clear the textbox) and the updated chat
        history. The answer is followed by space-separated links to the
        source pages it was retrieved from.
        """
        response = qa(user_message)
        parts = [response['result']]
        for source_document in response["source_documents"]:
            parts.extend(_reference_links(source_document))
        reply = ' '.join(parts)
        return "", (history or []) + [[user_message, reply]]

    def bot(history):
        """Pass the chat history through unchanged (second step of the chain)."""
        yield history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=True).then(
        bot, chatbot, chatbot
    )

demo.queue()
demo.launch(share=True)