Stefan committed
Commit • bb3407a
Parent(s): 7ce98a0
feat(setup): initial commit
- .gitignore +2 -0
- .vscode/settings.json +3 -0
- Pipfile +33 -0
- Pipfile.lock +0 -0
- embedding.py +48 -0
- main.py +28 -0
- pg.py +41 -0
- processing.py +95 -0
- requirements.txt +97 -0
- vectors.py +38 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+data*/
+.env
.vscode/settings.json
ADDED
@@ -0,0 +1,3 @@
+{
+    "editor.defaultFormatter": "ms-python.black-formatter"
+}
Pipfile
ADDED
@@ -0,0 +1,33 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+numpy = "*"
+pandas = "*"
+torch = "*"
+transformers = "*"
+accelerate = "*"
+sentencepiece = "*"
+protobuf = "==3.20.1"
+aiohttp = "*"
+aiodns = "*"
+brotli = "*"
+python-dotenv = "*"
+openai = "*"
+nest-asyncio = "*"
+tqdm = "*"
+tiktoken = "*"
+instructorembedding = "*"
+markdown = "*"
+sentence-transformers = "*"
+pinecone-client = "*"
+psycopg2 = "*"
+gradio = "*"
+
+[dev-packages]
+ipykernel = "*"
+
+[requires]
+python_version = "3.11"
Pipfile.lock
ADDED
The diff for this file is too large to render.
See raw diff
embedding.py
ADDED
@@ -0,0 +1,48 @@
+from torch import Tensor
+import tiktoken
+from transformers import AutoTokenizer, AutoModel
+
+tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-large-v2")
+model = AutoModel.from_pretrained("intfloat/e5-large-v2")
+
+EMBEDDING_CHAR_LIMIT = 512
+
+
+def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
+    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
+    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+
+
+def strings_to_vectors(strings: list[str]):
+    passage_batch = tokenizer(
+        strings,
+        max_length=EMBEDDING_CHAR_LIMIT,
+        padding=True,
+        truncation=True,
+        return_tensors="pt",
+    )
+    passage_outputs = model(**passage_batch)
+    return average_pool(
+        passage_outputs.last_hidden_state, passage_batch["attention_mask"]
+    )
+
+
+def num_tokens_from_str(string, model="gpt-3.5-turbo"):
+    """Returns the number of tokens used by a string."""
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+    except KeyError:
+        encoding = tiktoken.get_encoding("cl100k_base")
+    if model == "gpt-3.5-turbo":  # note: future models may deviate from this
+        num_tokens = 0
+        num_tokens += (
+            4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
+        )
+        num_tokens += len(encoding.encode(string))
+        num_tokens += 2  # every reply is primed with <im_start>assistant
+        return num_tokens
+    else:
+        raise NotImplementedError(
+            f"""num_tokens_from_str() is not presently implemented for model {model}.
+See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
+        )
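
Note: strings_to_vectors mean-pools the E5 model's final hidden states via average_pool, masking padding tokens so strings of different lengths embed comparably. A minimal usage sketch (the passage text is illustrative; E5 expects a "passage: " or "query: " prefix on each input):

from embedding import strings_to_vectors

vectors = strings_to_vectors(["passage: the heart has four chambers."])
print(vectors.shape)  # torch.Size([1, 1024]) — e5-large-v2 has hidden size 1024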
main.py
ADDED
@@ -0,0 +1,28 @@
+import gradio as gr
+from processing import md_to_passages
+from pg import get_chapters
+from vectors import match_query
+
+
+def find_embedding(query: str):
+    top_res = match_query(query, 3)
+    # print(top_res)
+
+    chapters = get_chapters(list(map(lambda x: x["metadata"]["chapterId"], top_res)))
+
+    output = ""
+
+    for res, chapter in zip(top_res, chapters):
+        passages = md_to_passages(chapter["explanation"])
+        output += f"{res['id']}\t| score: {res['score']:.2f}%\n{passages[res['passage_idx']]}\n\n"
+
+    return output
+
+
+with gr.Blocks() as quesbook_search:
+    question = gr.Text(label="question")
+    answer = gr.Text(label="answer")
+    submit = gr.Button("Submit")
+    submit.click(fn=find_embedding, inputs=question, outputs=answer)
+
+quesbook_search.launch()
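
Note: quesbook_search.launch() runs at module level, so `python main.py` starts the UI immediately. For a quick check without the browser, the handler could be called just before the launch line, e.g. (a hypothetical smoke test; assumes the Pinecone index and Postgres tables are populated):

# hypothetical smoke test, placed above quesbook_search.launch()
print(find_embedding("what causes atrial fibrillation?"))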
pg.py
ADDED
@@ -0,0 +1,41 @@
+import psycopg2
+import os
+
+pg = psycopg2.connect(
+    dbname=os.getenv("POSTGRES_DB"),
+    user=os.getenv("POSTGRES_USER"),
+    password=os.getenv("POSTGRES_PASSWORD"),
+    port=os.getenv("POSTGRES_PORT"),
+    host=os.getenv("POSTGRES_HOST"),
+)
+
+
+def get_chapters(ids: list[int]):
+    cur = pg.cursor()
+    cur.execute(
+        """
+        SELECT
+            ch.id,
+            ch.explanation
+        FROM
+            chapters ch
+        WHERE
+            ch.id = ANY (%s);
+        """,
+        (ids,),
+    )
+    data = cur.fetchall()
+    cur.close()
+
+    chapters = list(map(lambda x: {"id": x[0], "explanation": x[1]}, data))
+
+    ordered_chapters = []
+    for id in ids:
+        chapter = next(
+            (ch for ch in chapters if ch["id"] == id),
+            None,
+        )
+        if chapter:
+            ordered_chapters.append(chapter)
+
+    return ordered_chapters
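
Note: `ch.id = ANY (%s)` lets psycopg2 adapt a Python list to a Postgres array, but the rows come back in arbitrary order; the loop at the end restores the caller's id order so chapters line up with the ranked matches. A usage sketch (the ids are hypothetical):

from pg import get_chapters

for ch in get_chapters([42, 7, 13]):  # hypothetical chapter ids
    print(ch["id"], ch["explanation"][:60])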
processing.py
ADDED
@@ -0,0 +1,95 @@
+from markdown import Markdown
+from io import StringIO
+import re
+from embedding import num_tokens_from_str, EMBEDDING_CHAR_LIMIT
+
+HTMLR = re.compile(r"<.*?>")
+WS = re.compile(r"\s+")
+LIGHTGALLERY = re.compile(r"\[lightgallery.*\]")
+
+
+def unmark_element(element, stream=None):
+    if stream is None:
+        stream = StringIO()
+    if element.text:
+        stream.write(element.text)
+    for sub in element:
+        unmark_element(sub, stream)
+    if element.tail:
+        stream.write(element.tail)
+    return stream.getvalue()
+
+
+# patching Markdown to emit plain text instead of HTML
+Markdown.output_formats["plain"] = unmark_element
+__md = Markdown(output_format="plain", extensions=["tables"])
+__md.stripTopLevelTags = False
+
+
+def unmark(text):
+    return __md.convert(text)
+
+
+def clean_md(text: str) -> list[str]:
+    cleantext = re.sub(HTMLR, "", text)
+    cleantext = re.sub(LIGHTGALLERY, "", cleantext)
+    para = cleantext.split("\n#")
+    para = [unmark(p) for p in para]
+    para = [re.sub(WS, " ", p.lower()) for p in para]
+    return para
+
+
+start_seq_length = num_tokens_from_str("passage: ")
+
+
+def truncate_to_sequences(text: str, max_char=EMBEDDING_CHAR_LIMIT) -> list[str]:
+    sequence_length = num_tokens_from_str(text) // (max_char - start_seq_length) + 1
+    length = len(text)
+    separator = length // sequence_length
+
+    sequences = []
+    base = 0
+    while base < length:
+        count = len(sequences) + 1
+        end = min(separator * count, length)
+        found = False
+
+        if end == length:
+            found = True
+
+        if found is False:
+            # walk backwards looking for a sentence boundary (". " reversed)
+            section = text[base:end]
+            section_rev = section[::-1]
+            for i in range(len(section_rev)):
+                if section_rev[i : i + 2] == " .":
+                    found = True
+                    end -= 1
+                    break
+                end -= 1
+
+        if found is False:
+            # no sentence boundary found; fall back to the nearest word boundary
+            end = separator * count
+            for i in range(len(section_rev)):
+                if section_rev[i] == " ":
+                    found = True
+                    break
+                end -= 1
+
+        if num_tokens_from_str(text[base:end]) > max_char:
+            sub_sequences = truncate_to_sequences(text[base:end])
+            sequences += sub_sequences
+        else:
+            sequences.append(text[base:end])
+
+        base = end  # end is an absolute index into text, so continue from it
+    return sequences
+
+
+def md_to_passages(md: str) -> list[str]:
+    initial_passages = clean_md(md)
+    passages = []
+    for p in initial_passages:
+        sequences = truncate_to_sequences(p)
+        passages += sequences
+
+    return passages
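
Note: md_to_passages strips HTML tags and [lightgallery] shortcodes, splits a chapter on markdown headings, flattens each piece to lower-cased plain text, then cuts anything over EMBEDDING_CHAR_LIMIT tokens at sentence or word boundaries. A minimal sketch (the markdown is illustrative; importing processing also loads the E5 model via embedding):

from processing import md_to_passages

md = "# Anatomy\nThe heart has **four** chambers.\n# Physiology\nInsulin lowers blood glucose."
print(md_to_passages(md))
# -> list of lower-cased plain-text passages, one per heading section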
requirements.txt
ADDED
@@ -0,0 +1,97 @@
+-i https://pypi.org/simple
+accelerate==0.19.0
+aiodns==3.0.0
+aiofiles==23.1.0 ; python_version >= '3.7' and python_version < '4.0'
+aiohttp==3.8.4
+aiosignal==1.3.1 ; python_version >= '3.7'
+altair==5.0.0 ; python_version >= '3.7'
+anyio==3.6.2 ; python_full_version >= '3.6.2'
+async-timeout==4.0.2 ; python_version >= '3.6'
+attrs==23.1.0 ; python_version >= '3.7'
+brotli==1.0.9
+certifi==2023.5.7 ; python_version >= '3.6'
+cffi==1.15.1
+charset-normalizer==3.1.0 ; python_full_version >= '3.7.0'
+click==8.1.3 ; python_version >= '3.7'
+contourpy==1.0.7 ; python_version >= '3.8'
+cycler==0.11.0 ; python_version >= '3.6'
+dnspython==2.3.0 ; python_version >= '3.7' and python_version < '4.0'
+fastapi==0.95.2 ; python_version >= '3.7'
+ffmpy==0.3.0
+filelock==3.12.0 ; python_version >= '3.7'
+fonttools==4.39.4 ; python_version >= '3.8'
+frozenlist==1.3.3 ; python_version >= '3.7'
+fsspec==2023.5.0 ; python_version >= '3.8'
+gradio==3.32.0
+gradio-client==0.2.5 ; python_version >= '3.7'
+h11==0.14.0 ; python_version >= '3.7'
+httpcore==0.17.2 ; python_version >= '3.7'
+httpx==0.24.1 ; python_version >= '3.7'
+huggingface-hub==0.14.1 ; python_full_version >= '3.7.0'
+idna==3.4 ; python_version >= '3.5'
+instructorembedding==1.0.0
+jinja2==3.1.2 ; python_version >= '3.7'
+joblib==1.2.0 ; python_version >= '3.7'
+jsonschema==4.17.3 ; python_version >= '3.7'
+kiwisolver==1.4.4 ; python_version >= '3.7'
+linkify-it-py==2.0.2
+loguru==0.7.0 ; python_version >= '3.5'
+markdown==3.4.3
+markdown-it-py[linkify]==2.2.0 ; python_version >= '3.7'
+markupsafe==2.1.2 ; python_version >= '3.7'
+matplotlib==3.7.1 ; python_version >= '3.8'
+mdit-py-plugins==0.3.3 ; python_version >= '3.7'
+mdurl==0.1.2 ; python_version >= '3.7'
+mpmath==1.3.0
+multidict==6.0.4 ; python_version >= '3.7'
+nest-asyncio==1.5.6
+networkx==3.1 ; python_version >= '3.8'
+nltk==3.8.1 ; python_version >= '3.7'
+numpy==1.24.3
+openai==0.27.7
+orjson==3.8.13 ; python_version >= '3.7'
+packaging==23.1 ; python_version >= '3.7'
+pandas==2.0.1
+pillow==9.5.0 ; python_version >= '3.7'
+pinecone-client==2.2.1
+protobuf==3.20.1
+psutil==5.9.5 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
+psycopg2==2.9.6
+pycares==4.3.0
+pycparser==2.21
+pydantic==1.10.8 ; python_version >= '3.7'
+pydub==0.25.1
+pygments==2.15.1 ; python_version >= '3.7'
+pyparsing==3.0.9 ; python_full_version >= '3.6.8'
+pyrsistent==0.19.3 ; python_version >= '3.7'
+python-dateutil==2.8.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
+python-dotenv==1.0.0
+python-multipart==0.0.6 ; python_version >= '3.7'
+pytz==2023.3
+pyyaml==6.0 ; python_version >= '3.6'
+regex==2023.5.5 ; python_version >= '3.6'
+requests==2.31.0 ; python_version >= '3.7'
+scikit-learn==1.2.2 ; python_version >= '3.8'
+scipy==1.10.1 ; python_version < '3.12' and python_version >= '3.8'
+semantic-version==2.10.0 ; python_version >= '2.7'
+sentence-transformers==2.2.2
+sentencepiece==0.1.99
+six==1.16.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
+sniffio==1.3.0 ; python_version >= '3.7'
+starlette==0.27.0 ; python_version >= '3.7'
+sympy==1.12 ; python_version >= '3.8'
+threadpoolctl==3.1.0 ; python_version >= '3.6'
+tiktoken==0.4.0
+tokenizers==0.13.3
+toolz==0.12.0 ; python_version >= '3.5'
+torch==2.0.1
+torchvision==0.15.2 ; python_version >= '3.8'
+tqdm==4.65.0
+transformers==4.29.2
+typing-extensions==4.6.1 ; python_version >= '3.7'
+tzdata==2023.3 ; python_version >= '2'
+uc-micro-py==1.0.2 ; python_version >= '3.7'
+urllib3==2.0.2 ; python_version >= '3.7'
+uvicorn==0.22.0 ; python_version >= '3.7'
+websockets==11.0.3 ; python_version >= '3.7'
+yarl==1.9.2 ; python_version >= '3.7'
vectors.py
ADDED
@@ -0,0 +1,38 @@
+from embedding import strings_to_vectors
+import pinecone
+import os
+
+PINECONE_API = os.getenv("PINECONE_API")
+
+pinecone.init(api_key=PINECONE_API, environment="us-west4-gcp-free")
+
+vector_index = pinecone.Index("quesmed")
+
+
+def scored_vector_todict(scored_vector):
+    x = {
+        "id": scored_vector["id"],
+        "metadata": {
+            "topicId": int(scored_vector["metadata"]["topicId"]),
+            "chapterId": int(scored_vector["metadata"]["chapterId"]),
+            "conceptId": int(scored_vector["metadata"]["conceptId"]),
+        },
+        "score": scored_vector["score"] * 100,
+        "values": scored_vector["values"],
+    }
+    for k, v in x["metadata"].items():
+        x[k] = int(v)
+    x["passage_idx"] = int(x["id"][-1])
+    return x
+
+
+def match_query(query: str, n_res=3):
+    queries = [f"query: {query.replace('?','').lower()}"]
+    query_embeddings = strings_to_vectors(queries)
+    result = vector_index.query(
+        query_embeddings[0].tolist(),
+        top_k=n_res,
+        include_metadata=True,
+        namespace="quesbook",
+    )
+    return list(map(scored_vector_todict, result["matches"]))
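
Note: match_query mirrors the E5 convention from embedding.py, prefixing the lower-cased question with "query: " before embedding it and searching the quesmed index's quesbook namespace; scores are rescaled to percentages. A usage sketch (assumes PINECONE_API is set and the index is populated):

from vectors import match_query

for res in match_query("what causes atrial fibrillation?"):
    print(res["id"], f"{res['score']:.1f}%", res["chapterId"])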