terapyon commited on
Commit
11ed8cf
1 Parent(s): ac65024

Make base QA system

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ venv/*
2
+ __pycache__
README.md CHANGED
@@ -7,7 +7,27 @@ sdk: gradio
7
  sdk_version: 3.35.2
8
  app_file: app.py
9
  pinned: false
10
- license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  sdk_version: 3.35.2
8
  app_file: app.py
9
  pinned: false
10
+ license: other
11
  ---
12
 
13
+ # 概要
14
+
15
+ - PyHack Con https://pyhack.connpass.com/event/282942/ の座談会の内容をQA形式で回答
16
+ - Whisperを使って文字起こししたデータを利用(ただし、文字修正はしていない)
17
+ - 音声データは、https://podcast.terapyon.net/episodes/0084.html の内容
18
+
19
+
20
+ # 技術要件
21
+
22
+ ## Pythonパッケージ
23
+
24
+ ```
25
+ pip install langchain
26
+ pip install openai
27
+ pip install chromadb
28
+ pip install tiktoken
29
+ pip install gradio
30
+ ```
31
+
32
+
33
+
app.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from langchain.chains import RetrievalQA
3
+ from langchain.embeddings import OpenAIEmbeddings
4
+ from langchain.llms import OpenAI
5
+ from langchain.vectorstores import Chroma
6
+
7
+
8
+ PERSIST_DIR_NAME = "podcast-75"
9
+
10
+
11
def get_retrieval_qa() -> RetrievalQA:
    """Build a RetrievalQA chain backed by the persisted Chroma store.

    Opens the vector store saved under ``PERSIST_DIR_NAME`` (using OpenAI
    embeddings to embed queries) and wires its retriever into a
    "stuff"-type RetrievalQA chain driven by an OpenAI LLM.
    """
    vector_store = Chroma(
        persist_directory=PERSIST_DIR_NAME,
        embedding_function=OpenAIEmbeddings(),
    )
    return RetrievalQA.from_chain_type(
        llm=OpenAI(),
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
    )
18
+
19
+
20
def main(query: str):
    """Answer *query* against the indexed podcast transcript.

    Returns the plain-text answer produced by the RetrievalQA chain.
    NOTE(review): the chain (and the underlying Chroma store) appears to
    be rebuilt on every call — confirm whether caching is worthwhile.
    """
    answer = get_retrieval_qa()(query)
    return answer["result"]
24
+
25
+
26
# Gradio UI: one query text box; its contents are passed to main() and the
# returned answer is rendered as plain text. launch() starts the server.
query_box = gr.Textbox(label="query")
pyhack_qa = gr.Interface(fn=main, inputs=[query_box], outputs="text")
pyhack_qa.launch()
constraints.txt ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.1.0
2
+ aiohttp==3.8.4
3
+ aiosignal==1.3.1
4
+ altair==5.0.1
5
+ anyio==3.7.0
6
+ async-timeout==4.0.2
7
+ attrs==23.1.0
8
+ backoff==2.2.1
9
+ certifi==2023.5.7
10
+ charset-normalizer==3.1.0
11
+ chromadb==0.3.26
12
+ click==8.1.3
13
+ clickhouse-connect==0.6.3
14
+ coloredlogs==15.0.1
15
+ contourpy==1.1.0
16
+ cycler==0.11.0
17
+ dataclasses-json==0.5.8
18
+ duckdb==0.8.1
19
+ exceptiongroup==1.1.1
20
+ fastapi==0.98.0
21
+ ffmpy==0.3.0
22
+ filelock==3.12.2
23
+ flatbuffers==23.5.26
24
+ fonttools==4.40.0
25
+ frozenlist==1.3.3
26
+ fsspec==2023.6.0
27
+ gradio==3.35.2
28
+ gradio_client==0.2.7
29
+ greenlet==2.0.2
30
+ h11==0.14.0
31
+ hnswlib==0.7.0
32
+ httpcore==0.17.2
33
+ httptools==0.5.0
34
+ httpx==0.24.1
35
+ huggingface-hub==0.15.1
36
+ humanfriendly==10.0
37
+ idna==3.4
38
+ Jinja2==3.1.2
39
+ jsonschema==4.17.3
40
+ kiwisolver==1.4.4
41
+ langchain==0.0.209
42
+ langchainplus-sdk==0.0.16
43
+ linkify-it-py==2.0.2
44
+ lz4==4.3.2
45
+ markdown-it-py==2.2.0
46
+ MarkupSafe==2.1.3
47
+ marshmallow==3.19.0
48
+ marshmallow-enum==1.5.1
49
+ matplotlib==3.7.1
50
+ mdit-py-plugins==0.3.3
51
+ mdurl==0.1.2
52
+ monotonic==1.6
53
+ mpmath==1.3.0
54
+ multidict==6.0.4
55
+ mypy-extensions==1.0.0
56
+ numexpr==2.8.4
57
+ numpy==1.25.0
58
+ onnxruntime==1.15.1
59
+ openai==0.27.8
60
+ openapi-schema-pydantic==1.2.4
61
+ orjson==3.9.1
62
+ overrides==7.3.1
63
+ packaging==23.1
64
+ pandas==2.0.2
65
+ Pillow==9.5.0
66
+ posthog==3.0.1
67
+ protobuf==4.23.3
68
+ pulsar-client==3.2.0
69
+ pydantic==1.10.9
70
+ pydub==0.25.1
71
+ Pygments==2.15.1
72
+ pyparsing==3.1.0
73
+ pyrsistent==0.19.3
74
+ python-dateutil==2.8.2
75
+ python-dotenv==1.0.0
76
+ python-multipart==0.0.6
77
+ pytz==2023.3
78
+ PyYAML==6.0
79
+ regex==2023.6.3
80
+ requests==2.31.0
81
+ semantic-version==2.10.0
82
+ six==1.16.0
83
+ sniffio==1.3.0
84
+ SQLAlchemy==2.0.16
85
+ starlette==0.27.0
86
+ sympy==1.12
87
+ tenacity==8.2.2
88
+ tiktoken==0.4.0
89
+ tokenizers==0.13.3
90
+ toolz==0.12.0
91
+ tqdm==4.65.0
92
+ typing-inspect==0.9.0
93
+ typing_extensions==4.6.3
94
+ tzdata==2023.3
95
+ uc-micro-py==1.0.2
96
+ urllib3==2.0.3
97
+ uvicorn==0.22.0
98
+ uvloop==0.17.0
99
+ watchfiles==0.19.0
100
+ websockets==11.0.3
101
+ yarl==1.9.2
102
+ zstandard==0.21.0
podcast-75/chroma-collections.parquet ADDED
Binary file (557 Bytes). View file
 
podcast-75/chroma-embeddings.parquet ADDED
Binary file (573 kB). View file
 
podcast-75/index/id_to_uuid_06a2a225-2cc5-4db6-bb03-5b572e891007.pkl ADDED
Binary file (1.42 kB). View file
 
podcast-75/index/index_06a2a225-2cc5-4db6-bb03-5b572e891007.bin ADDED
Binary file (277 kB). View file
 
podcast-75/index/index_metadata_06a2a225-2cc5-4db6-bb03-5b572e891007.pkl ADDED
Binary file (103 Bytes). View file
 
podcast-75/index/uuid_to_id_06a2a225-2cc5-4db6-bb03-5b572e891007.pkl ADDED
Binary file (1.64 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ langchain
2
+ openai
3
+ chromadb
4
+ tiktoken
5
+ gradio
store.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.document_loaders import TextLoader
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain.embeddings import OpenAIEmbeddings
4
+ from langchain.vectorstores import Chroma
5
+
6
+
7
+ CHUNK_SIZE = 500
8
+
9
+
10
def get_documents(filename: str):
    """Load *filename* as a list of LangChain documents via TextLoader."""
    return TextLoader(filename).load()
14
+
15
+
16
def get_text_chunk(docs):
    """Split *docs* into non-overlapping chunks of CHUNK_SIZE characters."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=0,
    )
    return splitter.split_documents(docs)
20
+
21
+
22
def store(texts, dir_name):
    """Embed *texts* with OpenAI embeddings and persist them under *dir_name*."""
    vector_store = Chroma.from_documents(
        texts,
        OpenAIEmbeddings(),
        persist_directory=dir_name,
    )
    vector_store.persist()
26
+
27
+
28
def main(filename: str, dir_name: str):
    """Load the transcript at *filename*, chunk it, and persist embeddings to *dir_name*."""
    chunks = get_text_chunk(get_documents(filename))
    store(chunks, dir_name)
32
+
33
+
34
+ if __name__ == "__main__":
35
+ import sys
36
+ args = sys.argv
37
+ if len(args) != 3:
38
+ print("No args, you need one args for text filename")
39
+ else:
40
+ filename = args[1]
41
+ dir_name = args[2]
42
+ main(filename, dir_name)