Spaces:

yiyii
/

RAG-3

Running

yiyii committed on May 14

Commit

5668875

•

1 Parent(s): f00cb87

eight

Files changed (2) hide show

README.md CHANGED Viewed

	@@ -24,5 +24,6 @@ let user decide the value of chunk_size and top-k
24
25	tell it in the prompt directly how long the story should approximately be.
26
27	-

28

 tell it in the prompt directly how long the story should approximately be.
+14.05.2024
+https://huggingface.co/spaces/cvachet/pdf-chatbot/blob/main/app.py

app.py CHANGED Viewed

@@ -81,12 +81,16 @@ def generate(image, pdfs, temperature=0.9, max_new_tokens=1500, top_p=0.95, repe
     for file in pdfs:
         with open(file.name, "rb") as f:
             state_of_the_union += loader.load(f)
     # split the content into chunks
     text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     # TokenTextSplitter() can ensure the integrity of words
     # each chunk overlaps the previous chunk by chunk_overlap tokens
-    texts = text_splitter.split_text(state_of_the_union)
     print("...........................................")
     # print the first chunk
     print("text[0]: ", texts[0])
@@ -191,7 +195,7 @@ demo = gr.Interface(fn=generate,
             #gr.Video(sources=["webcam"], label="video")
             gr.Image(sources=["upload", "webcam"], label="Upload Image", type="pil"),
-            gr.Files(label="Upload PDFs", type="file", accept=".pdf", multiple=True),
             gr.Slider(
                 label="temperature",

     for file in pdfs:
         with open(file.name, "rb") as f:
             state_of_the_union += loader.load(f)
+        # r: read
+        # b: binary; the file is opened in binary mode, as required for non-text files such as PDFs
     # split the content into chunks
     text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     # TokenTextSplitter() can ensure the integrity of words
     # each chunk overlaps the previous chunk by chunk_overlap tokens
+    #texts = text_splitter.split_text(state_of_the_union)
+    texts = text_splitter.split_documents(state_of_the_union)
     print("...........................................")
     # print the first chunk
     print("text[0]: ", texts[0])
             #gr.Video(sources=["webcam"], label="video")
             gr.Image(sources=["upload", "webcam"], label="Upload Image", type="pil"),
+            gr.Files(file_count="multiple", file_types=["pdf"], interactive=True, label="Upload your PDFs"),
             gr.Slider(
                 label="temperature",