3gg committed on
Commit
5c402d8
1 Parent(s): 804c5c7

Use next tier model for better results; parse document from PDF.

Browse files
.gitattributes CHANGED
@@ -1,34 +1 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tflite filter=lfs diff=lfs merge=lfs -text
29
- *.tgz filter=lfs diff=lfs merge=lfs -text
30
- *.wasm filter=lfs diff=lfs merge=lfs -text
31
- *.xz filter=lfs diff=lfs merge=lfs -text
32
- *.zip filter=lfs diff=lfs merge=lfs -text
33
- *.zst filter=lfs diff=lfs merge=lfs -text
34
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.pdf filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,19 +1,29 @@
1
  import gradio as gr
2
  from langchain import HuggingFaceHub
3
  from langchain.chains.question_answering import load_qa_chain
4
- from langchain.document_loaders import TextLoader
5
  from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.llms import HuggingFacePipeline
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain.vectorstores import FAISS
9
 
10
 
 
 
 
 
 
 
 
 
 
 
11
  print("Loading documents")
12
- loader = TextLoader("rdna3.txt")
13
  documents = loader.load()
14
 
15
  print("Creating chunks")
16
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
17
  chunks = splitter.split_documents(documents)
18
 
19
  print("Creating database")
@@ -22,13 +32,13 @@ db = FAISS.from_documents(chunks, embeddings)
22
 
23
  print("Loading model")
24
  llm = HuggingFacePipeline.from_model_id(
25
- model_id="google/flan-t5-base",
26
  task="text2text-generation",
27
- model_kwargs={"temperature": 0, "max_length": 128})
28
  chain = load_qa_chain(llm, chain_type="stuff")
29
 
30
  def ask(question):
31
- answers = db.similarity_search(question, k=4)
32
  result = chain.run(input_documents=answers, question=question)
33
  return result
34
 
 
1
  import gradio as gr
2
  from langchain import HuggingFaceHub
3
  from langchain.chains.question_answering import load_qa_chain
4
+ from langchain.document_loaders import PyMuPDFLoader
5
  from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.llms import HuggingFacePipeline
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain.vectorstores import FAISS
9
 
10
 
11
+ # Number of search results to query from the vector database.
12
+ SIMILARITY_SEARCH_COUNT = 2
13
+
14
+ # Size of each document chunk in number of characters.
15
+ CHUNK_SIZE = 1000
16
+
17
+ # Maximum number of output tokens.
18
+ MODEL_MAX_LENGTH = 300
19
+
20
+
21
  print("Loading documents")
22
+ loader = PyMuPDFLoader("rdna3-shader-instruction-set-architecture-feb-2023_0.pdf")
23
  documents = loader.load()
24
 
25
  print("Creating chunks")
26
+ splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=0)
27
  chunks = splitter.split_documents(documents)
28
 
29
  print("Creating database")
 
32
 
33
  print("Loading model")
34
  llm = HuggingFacePipeline.from_model_id(
35
+ model_id="google/flan-t5-large",
36
  task="text2text-generation",
37
+ model_kwargs={"temperature": 0, "max_length": MODEL_MAX_LENGTH})
38
  chain = load_qa_chain(llm, chain_type="stuff")
39
 
40
  def ask(question):
41
+ answers = db.similarity_search(question, k=SIMILARITY_SEARCH_COUNT)
42
  result = chain.run(input_documents=answers, question=question)
43
  return result
44
 
rdna3-shader-instruction-set-architecture-feb-2023_0.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bff84b0bc818446356e73ba894149b8c810549fa240a9872b46179f412fcd13b
3
+ size 3246429
requirements.txt CHANGED
@@ -2,3 +2,4 @@ langchain
2
  faiss-cpu
3
  sentence_transformers
4
  protobuf==3.20.1
 
 
2
  faiss-cpu
3
  sentence_transformers
4
  protobuf==3.20.1
5
+ pymupdf