File size: 3,115 Bytes
28c2a3d bb4d908 28c2a3d bb4d908 28c2a3d ce9539c 28c2a3d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import abc
import os
import pickle

import pandas as pd
class SimilarityAlg(metaclass=abc.ABCMeta):
    """Abstract interface for comparing a query embedding with other embeddings.

    The class cannot be instantiated directly; subclasses must provide
    ``__call__``.
    """

    def __init__(self) -> None:
        """Set up nothing; the base class holds no state."""
        return None

    @abc.abstractmethod
    def __call__(self, query_embedding, embeddings) -> None:
        """Compare ``query_embedding`` against ``embeddings``.

        NOTE(review): the signature is annotated ``-> None`` although
        implementations presumably return similarity scores -- confirm
        against the concrete subclasses.
        """
        return None
class Embedding_Model(metaclass=abc.ABCMeta):
    """Abstract base for text-embedding models with a pickle-backed cache.

    On construction the cache file for the given model name is loaded (an
    empty dict is used when the file does not exist) and immediately written
    back to disk. Subclasses implement ``__call__``.
    """

    def __init__(self, model_name, *, cache_dir="/app/ckpt") -> None:
        """Load or create the embedding cache for ``model_name``.

        Args:
            model_name: Identifier interpolated into the cache file name.
                If it contains ``/`` the cache path includes a
                sub-directory, which is created on demand.
            cache_dir: Directory that holds the cache file. Defaults to
                ``/app/ckpt``, the location that used to be hard-coded.

        Raises:
            OSError: If the cache directory or file cannot be created or
                written.
        """
        embedding_cache_path = f"{cache_dir}/embedding_cache_{model_name}.pkl"
        self.embedding_cache_path = embedding_cache_path

        # Load the cache if it exists; otherwise start with an empty one.
        # NOTE(security): unpickling can execute arbitrary code. Only point
        # this at cache files written by this application.
        try:
            embedding_cache = pd.read_pickle(embedding_cache_path)
        except FileNotFoundError:
            embedding_cache = {}

        # The write below used to fail with FileNotFoundError whenever the
        # parent directory was missing, so make sure it exists first.
        os.makedirs(os.path.dirname(embedding_cache_path), exist_ok=True)

        # Save a copy to disk so the file exists after construction.
        with open(embedding_cache_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)

        self.embedding_cache = embedding_cache
        self.model_name = model_name

    @abc.abstractmethod
    def __call__(self, text) -> None:
        """Compute the embedding of the text.

        NOTE(review): annotated ``-> None`` although implementations
        presumably return the embedding -- confirm against subclasses.
        """
        pass
class AbstractPDFParser(metaclass=abc.ABCMeta):
    """Abstract base for PDF parsers with a pickle-backed database cache.

    On construction the cache file for the given database name is loaded (an
    empty dict is used when the file does not exist) and immediately written
    back to disk. Subclasses must implement ``parse_pdf``, ``_get_metadata``
    and ``get_split_paragraphs``; ``get_paragraphs`` and
    ``_determine_metadata_of_paragraph`` are optional no-op hooks.
    """

    def __init__(self, db_name, *, cache_dir="/app/ckpt") -> None:
        """Load or create the parser cache for ``db_name``.

        Args:
            db_name: Identifier interpolated into the cache file name. If it
                contains ``/`` the cache path includes a sub-directory,
                which is created on demand.
            cache_dir: Directory that holds the cache file. Defaults to
                ``/app/ckpt``, the location that used to be hard-coded.

        Raises:
            OSError: If the cache directory or file cannot be created or
                written.
        """
        db_cache_path = f"{cache_dir}/pdf_parser_{db_name}.pkl"
        self.db_cache_path = db_cache_path

        # Load the cache if it exists; otherwise start with an empty one.
        # NOTE(security): unpickling can execute arbitrary code. Only point
        # this at cache files written by this application.
        try:
            db_cache = pd.read_pickle(db_cache_path)
        except FileNotFoundError:
            db_cache = {}

        # The write below used to fail with FileNotFoundError whenever the
        # parent directory was missing, so make sure it exists first.
        os.makedirs(os.path.dirname(db_cache_path), exist_ok=True)

        # Save a copy to disk so the file exists after construction.
        with open(db_cache_path, "wb") as cache_file:
            pickle.dump(db_cache, cache_file)

        self.db_cache = db_cache
        self.db_name = db_name

    @abc.abstractmethod
    def parse_pdf(self) -> None:
        """Parse the PDF file."""
        pass

    @abc.abstractmethod
    def _get_metadata(self) -> None:
        """Get the metadata of the PDF file."""
        pass

    def get_paragraphs(self) -> None:
        """Get the paragraphs of the PDF file.

        Optional hook: the base implementation does nothing and returns
        ``None``.
        """
        pass

    @abc.abstractmethod
    def get_split_paragraphs(self) -> dict:
        """Get the split paragraphs of the PDF file.

        Returns:
            Dict of metadata and the corresponding list of split paragraphs.
        """
        pass

    def _determine_metadata_of_paragraph(self, paragraph) -> None:
        """Determine the metadata of a paragraph.

        Optional hook: the base implementation does nothing and returns
        ``None``; overriding subclasses are expected to return the
        paragraph's metadata.
        """
        pass

    # Disabled hook, kept for reference:
    # @abc.abstractmethod
    # def _determine_optimal_split_of_pargraphs(self, ) -> None:
    #     """
    #     Determine the optimal split of paragraphs
    #     Return:
    #         split_paragraphs: dict of metadata and corresponding list of split paragraphs
    #     """
    #     pass
class ChatbotEngine(metaclass=abc.ABCMeta):
    """Abstract base class exposing a single abstract ``query`` method."""

    def __init__(self) -> None:
        """Set up nothing; the base class holds no state."""
        return None

    @abc.abstractmethod
    def query(self, user_query):
        """Handle ``user_query``; concrete engines must override this."""
        return None
|