Anavya-TEXTSUMMARIZERADVANCE

Sleeping

App Files Files Community

Gladiator committed on Nov 13, 2021

Commit

b04763d

•

1 Parent(s): 02df788

add summarizer code

Browse files

Files changed (7) hide show

.gitignore +141 -0
src/vanilla_summarizer.py +83 -0
summarizer/bert_parent.py +169 -0
summarizer/cluster_features.py +165 -0
summarizer/coreference_handler.py +36 -0
summarizer/model_processors.py +401 -0
summarizer/sentence_handler.py +73 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,141 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# local stuff
+Docs/

src/vanilla_summarizer.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import torch
+import streamlit as st
+from transformers import BartTokenizer, BartForConditionalGeneration
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+st.title('Text Summarization Demo')
+st.markdown('Using BART and T5 transformer model')
+model = st.selectbox('Select the model', ('BART', 'T5'))
+if model == 'BART':
+    _num_beams = 4
+    _no_repeat_ngram_size = 3
+    _length_penalty = 1
+    _min_length = 12
+    _max_length = 128
+    _early_stopping = True
+else:
+    _num_beams = 4
+    _no_repeat_ngram_size = 3
+    _length_penalty = 2
+    _min_length = 30
+    _max_length = 200
+    _early_stopping = True
+col1, col2, col3 = st.beta_columns(3)
+_num_beams = col1.number_input("num_beams", value=_num_beams)
+_no_repeat_ngram_size = col2.number_input("no_repeat_ngram_size", value=_no_repeat_ngram_size)
+_length_penalty = col3.number_input("length_penalty", value=_length_penalty)
+col1, col2, col3 = st.beta_columns(3)
+_min_length = col1.number_input("min_length", value=_min_length)
+_max_length = col2.number_input("max_length", value=_max_length)
+_early_stopping = col3.number_input("early_stopping", value=_early_stopping)
+text = st.text_area('Text Input')
+def run_model(input_text):
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    if model == "BART":
+        bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
+        bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
+        input_text = str(input_text)
+        input_text = ' '.join(input_text.split())
+        input_tokenized = bart_tokenizer.encode(input_text, return_tensors='pt').to(device)
+        summary_ids = bart_model.generate(input_tokenized,
+                                          num_beams=_num_beams,
+                                          no_repeat_ngram_size=_no_repeat_ngram_size,
+                                          length_penalty=_length_penalty,
+                                          min_length=_min_length,
+                                          max_length=_max_length,
+                                          early_stopping=_early_stopping)
+        output = [bart_tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in
+                  summary_ids]
+        st.write('Summary')
+        st.success(output[0])
+    else:
+        t5_model = T5ForConditionalGeneration.from_pretrained("t5-base")
+        t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")
+        input_text = str(input_text).replace('\n', '')
+        input_text = ' '.join(input_text.split())
+        input_tokenized = t5_tokenizer.encode(input_text, return_tensors="pt").to(device)
+        summary_task = torch.tensor([[21603, 10]]).to(device)
+        input_tokenized = torch.cat([summary_task, input_tokenized], dim=-1).to(device)
+        summary_ids = t5_model.generate(input_tokenized,
+                                        num_beams=_num_beams,
+                                        no_repeat_ngram_size=_no_repeat_ngram_size,
+                                        length_penalty=_length_penalty,
+                                        min_length=_min_length,
+                                        max_length=_max_length,
+                                        early_stopping=_early_stopping)
+        output = [t5_tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in
+                  summary_ids]
+        st.write('Summary')
+        st.success(output[0])
+# Summarize the text area content only when the user presses the button.
+if st.button('Submit'):
+    run_model(text)

summarizer/bert_parent.py ADDED Viewed

	@@ -0,0 +1,169 @@

+from typing import List, Union
+import numpy as np
+import torch
+from numpy import ndarray
+from transformers import (AlbertModel, AlbertTokenizer, BertModel,
+                          BertTokenizer, DistilBertModel, DistilBertTokenizer,
+                          PreTrainedModel, PreTrainedTokenizer, XLMModel,
+                          XLMTokenizer, XLNetModel, XLNetTokenizer)
+class BertParent(object):
+    """
+    Base handler for BERT models.
+    """
+    MODELS = {
+        'bert-base-uncased': (BertModel, BertTokenizer),
+        'bert-large-uncased': (BertModel, BertTokenizer),
+        'xlnet-base-cased': (XLNetModel, XLNetTokenizer),
+        'xlm-mlm-enfr-1024': (XLMModel, XLMTokenizer),
+        'distilbert-base-uncased': (DistilBertModel, DistilBertTokenizer),
+        'albert-base-v1': (AlbertModel, AlbertTokenizer),
+        'albert-large-v1': (AlbertModel, AlbertTokenizer)
+    }
+    def __init__(
+        self,
+        model: str,
+        custom_model: PreTrainedModel = None,
+        custom_tokenizer: PreTrainedTokenizer = None,
+        gpu_id: int = 0,
+    ):
+        """
+        :param model: Model is the string path for the bert weights. If given a keyword, the s3 path will be used.
+        :param custom_model: This is optional if a custom bert model is used.
+        :param custom_tokenizer: Place to use custom tokenizer.
+        """
+        base_model, base_tokenizer = self.MODELS.get(model, (None, None))
+        self.device = torch.device("cpu")
+        if torch.cuda.is_available():
+            assert (
+                isinstance(gpu_id, int) and (0 <= gpu_id and gpu_id < torch.cuda.device_count())
+            ), f"`gpu_id` must be an integer between 0 to {torch.cuda.device_count() - 1}. But got: {gpu_id}"
+            self.device = torch.device(f"cuda:{gpu_id}")
+        if custom_model:
+            self.model = custom_model.to(self.device)
+        else:
+            self.model = base_model.from_pretrained(
+                model, output_hidden_states=True).to(self.device)
+        if custom_tokenizer:
+            self.tokenizer = custom_tokenizer
+        else:
+            self.tokenizer = base_tokenizer.from_pretrained(model)
+        self.model.eval()
+    def tokenize_input(self, text: str) -> torch.tensor:
+        """
+        Tokenizes the text input.
+        :param text: Text to tokenize.
+        :return: Returns a torch tensor.
+        """
+        tokenized_text = self.tokenizer.tokenize(text)
+        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
+        return torch.tensor([indexed_tokens]).to(self.device)
+    def _pooled_handler(self, hidden: torch.Tensor,
+                        reduce_option: str) -> torch.Tensor:
+        """
+        Handles torch tensor.
+        :param hidden: The hidden torch tensor to process.
+        :param reduce_option: The reduce option to use, such as mean, etc.
+        :return: Returns a torch tensor.
+        """
+        if reduce_option == 'max':
+            return hidden.max(dim=1)[0].squeeze()
+        elif reduce_option == 'median':
+            return hidden.median(dim=1)[0].squeeze()
+        return hidden.mean(dim=1).squeeze()
+    def extract_embeddings(
+        self,
+        text: str,
+        hidden: Union[List[int], int] = -2,
+        reduce_option: str = 'mean',
+        hidden_concat: bool = False,
+    ) -> torch.Tensor:
+        """
+        Extracts the embeddings for the given text.
+        :param text: The text to extract embeddings for.
+        :param hidden: The hidden layer(s) to use for a readout handler.
+        :param squeeze: If we should squeeze the outputs (required for some layers).
+        :param reduce_option: How we should reduce the items.
+        :param hidden_concat: Whether or not to concat multiple hidden layers.
+        :return: A torch vector.
+        """
+        tokens_tensor = self.tokenize_input(text)
+        pooled, hidden_states = self.model(tokens_tensor)[-2:]
+        # deprecated temporary keyword functions.
+        if reduce_option == 'concat_last_4':
+            last_4 = [hidden_states[i] for i in (-1, -2, -3, -4)]
+            cat_hidden_states = torch.cat(tuple(last_4), dim=-1)
+            return torch.mean(cat_hidden_states, dim=1).squeeze()
+        elif reduce_option == 'reduce_last_4':
+            last_4 = [hidden_states[i] for i in (-1, -2, -3, -4)]
+            return torch.cat(tuple(last_4), dim=1).mean(axis=1).squeeze()
+        elif type(hidden) == int:
+            hidden_s = hidden_states[hidden]
+            return self._pooled_handler(hidden_s, reduce_option)
+        elif hidden_concat:
+            last_states = [hidden_states[i] for i in hidden]
+            cat_hidden_states = torch.cat(tuple(last_states), dim=-1)
+            return torch.mean(cat_hidden_states, dim=1).squeeze()
+        last_states = [hidden_states[i] for i in hidden]
+        hidden_s = torch.cat(tuple(last_states), dim=1)
+        return self._pooled_handler(hidden_s, reduce_option)
+    def create_matrix(
+        self,
+        content: List[str],
+        hidden: Union[List[int], int] = -2,
+        reduce_option: str = 'mean',
+        hidden_concat: bool = False,
+    ) -> ndarray:
+        """
+        Create matrix from the embeddings.
+        :param content: The list of sentences.
+        :param hidden: Which hidden layer to use.
+        :param reduce_option: The reduce option to run.
+        :param hidden_concat: Whether or not to concat multiple hidden layers.
+        :return: A numpy array matrix of the given content.
+        """
+        return np.asarray([
+            np.squeeze(self.extract_embeddings(
+                t, hidden=hidden, reduce_option=reduce_option, hidden_concat=hidden_concat
+            ).data.cpu().numpy()) for t in content
+        ])
+    def __call__(
+        self,
+        content: List[str],
+        hidden: int = -2,
+        reduce_option: str = 'mean',
+        hidden_concat: bool = False,
+    ) -> ndarray:
+        """
+        Create matrix from the embeddings.
+        :param content: The list of sentences.
+        :param hidden: Which hidden layer to use.
+        :param reduce_option: The reduce option to run.
+        :param hidden_concat: Whether or not to concat multiple hidden layers.
+        :return: A numpy array matrix of the given content.
+        """
+        return self.create_matrix(content, hidden, reduce_option, hidden_concat)

summarizer/cluster_features.py ADDED Viewed

	@@ -0,0 +1,165 @@

+from typing import Dict, List
+import numpy as np
+from numpy import ndarray
+from sklearn.cluster import KMeans
+from sklearn.decomposition import PCA
+from sklearn.mixture import GaussianMixture
+class ClusterFeatures(object):
+    """
+    Basic handling of clustering features.
+    """
+    def __init__(
+        self,
+        features: ndarray,
+        algorithm: str = 'kmeans',
+        pca_k: int = None,
+        random_state: int = 12345,
+    ):
+        """
+        :param features: the embedding matrix created by bert parent.
+        :param algorithm: Which clustering algorithm to use.
+        :param pca_k: If you want the features to be ran through pca, this is the components number.
+        :param random_state: Random state.
+        """
+        if pca_k:
+            self.features = PCA(n_components=pca_k).fit_transform(features)
+        else:
+            self.features = features
+        self.algorithm = algorithm
+        self.pca_k = pca_k
+        self.random_state = random_state
+    def __get_model(self, k: int):
+        """
+        Retrieve clustering model.
+        :param k: amount of clusters.
+        :return: Clustering model.
+        """
+        if self.algorithm == 'gmm':
+            return GaussianMixture(n_components=k, random_state=self.random_state)
+        return KMeans(n_clusters=k, random_state=self.random_state)
+    def __get_centroids(self, model):
+        """
+        Retrieve centroids of model.
+        :param model: Clustering model.
+        :return: Centroids.
+        """
+        if self.algorithm == 'gmm':
+            return model.means_
+        return model.cluster_centers_
+    def __find_closest_args(self, centroids: np.ndarray) -> Dict:
+        """
+        Find the closest arguments to centroid.
+        :param centroids: Centroids to find closest.
+        :return: Closest arguments.
+        """
+        centroid_min = 1e10
+        cur_arg = -1
+        args = {}
+        used_idx = []
+        for j, centroid in enumerate(centroids):
+            for i, feature in enumerate(self.features):
+                value = np.linalg.norm(feature - centroid)
+                if value < centroid_min and i not in used_idx:
+                    cur_arg = i
+                    centroid_min = value
+            used_idx.append(cur_arg)
+            args[j] = cur_arg
+            centroid_min = 1e10
+            cur_arg = -1
+        return args
+    def calculate_elbow(self, k_max: int) -> List[float]:
+        """
+        Calculates elbow up to the provided k_max.
+        :param k_max: K_max to calculate elbow for.
+        :return: The inertias up to k_max.
+        """
+        inertias = []
+        for k in range(1, min(k_max, len(self.features))):
+            model = self.__get_model(k).fit(self.features)
+            inertias.append(model.inertia_)
+        return inertias
+    def calculate_optimal_cluster(self, k_max: int):
+        """
+        Calculates the optimal cluster based on Elbow.
+        :param k_max: The max k to search elbow for.
+        :return: The optimal cluster size.
+        """
+        delta_1 = []
+        delta_2 = []
+        max_strength = 0
+        k = 1
+        inertias = self.calculate_elbow(k_max)
+        for i in range(len(inertias)):
+            delta_1.append(inertias[i] - inertias[i - 1] if i > 0 else 0.0)
+            delta_2.append(delta_1[i] - delta_1[i - 1] if i > 1 else 0.0)
+        for j in range(len(inertias)):
+            strength = 0 if j <= 1 or j == len(inertias) - 1 else delta_2[j + 1] - delta_1[j + 1]
+            if strength > max_strength:
+                max_strength = strength
+                k = j + 1
+        return k
+    def cluster(self, ratio: float = 0.1, num_sentences: int = None) -> List[int]:
+        """
+        Clusters sentences based on the ratio.
+        :param ratio: Ratio to use for clustering.
+        :param num_sentences: Number of sentences. Overrides ratio.
+        :return: Sentences index that qualify for summary.
+        """
+        if num_sentences is not None:
+            if num_sentences == 0:
+                return []
+            k = min(num_sentences, len(self.features))
+        else:
+            k = max(int(len(self.features) * ratio), 1)
+        model = self.__get_model(k).fit(self.features)
+        centroids = self.__get_centroids(model)
+        cluster_args = self.__find_closest_args(centroids)
+        sorted_values = sorted(cluster_args.values())
+        return sorted_values
+    def __call__(self, ratio: float = 0.1, num_sentences: int = None) -> List[int]:
+        """
+        Clusters sentences based on the ratio.
+        :param ratio: Ratio to use for clustering.
+        :param num_sentences: Number of sentences. Overrides ratio.
+        :return: Sentences index that qualify for summary.
+        """
+        return self.cluster(ratio)

summarizer/coreference_handler.py ADDED Viewed

	@@ -0,0 +1,36 @@

+# removed previous import and related functionality since it's just a blank language model,
+#  while neuralcoref requires passing pretrained language model via spacy.load()
+import neuralcoref
+import spacy
+from summarizer.sentence_handler import SentenceHandler
+class CoreferenceHandler(SentenceHandler):
+    def __init__(self, spacy_model: str = 'en_core_web_sm',
+                 greedyness: float = 0.45):
+        """
+        Corefence handler. Only works with spacy < 3.0.
+        :param spacy_model: The spacy model to use as default.
+        :param greedyness: The greedyness factor.
+        """
+        self.nlp = spacy.load(spacy_model)
+        neuralcoref.add_to_pipe(self.nlp, greedyness=greedyness)
+    def process(self, body: str, min_length: int = 40, max_length: int = 600):
+        """
+        Processes the content sentences.
+        :param body: The raw string body to process
+        :param min_length: Minimum length that the sentences must be
+        :param max_length: Max length that the sentences mus fall under
+        :return: Returns a list of sentences.
+        """
+        doc = self.nlp(body)._.coref_resolved
+        doc = self.nlp(doc)
+        return [c.string.strip()
+                for c in doc.sents
+                if max_length > len(c.string.strip()) > min_length]

summarizer/model_processors.py ADDED Viewed

	@@ -0,0 +1,401 @@

+from typing import List, Optional, Tuple, Union
+import numpy as np
+from transformers import (AlbertModel, AlbertTokenizer, BartModel,
+                          BartTokenizer, BertModel, BertTokenizer,
+                          CamembertModel, CamembertTokenizer, CTRLModel,
+                          CTRLTokenizer, DistilBertModel, DistilBertTokenizer,
+                          GPT2Model, GPT2Tokenizer, LongformerModel,
+                          LongformerTokenizer, OpenAIGPTModel,
+                          OpenAIGPTTokenizer, PreTrainedModel,
+                          PreTrainedTokenizer, RobertaModel, RobertaTokenizer,
+                          TransfoXLModel, TransfoXLTokenizer, XLMModel,
+                          XLMTokenizer, XLNetModel, XLNetTokenizer)
+from summarizer.bert_parent import BertParent
+from summarizer.cluster_features import ClusterFeatures
+from summarizer.sentence_handler import SentenceHandler
+class ModelProcessor(object):
+    aggregate_map = {
+        'mean': np.mean,
+        'min': np.min,
+        'median': np.median,
+        'max': np.max,
+    }
+    def __init__(
+        self,
+        model: str = 'bert-large-uncased',
+        custom_model: PreTrainedModel = None,
+        custom_tokenizer: PreTrainedTokenizer = None,
+        hidden: Union[List[int], int] = -2,
+        reduce_option: str = 'mean',
+        sentence_handler: SentenceHandler = SentenceHandler(),
+        random_state: int = 12345,
+        hidden_concat: bool = False,
+        gpu_id: int = 0,
+    ):
+        """
+        This is the parent Bert Summarizer model. New methods should implement this class.
+        :param model: This parameter is associated with the inherit string parameters from the transformers library.
+        :param custom_model: If you have a pre-trained model, you can add the model class here.
+        :param custom_tokenizer: If you have a custom tokenizer, you can add the tokenizer here.
+        :param hidden: This signifies which layer(s) of the BERT model you would like to use as embeddings.
+        :param reduce_option: Given the output of the bert model, this param determines how you want to reduce results.
+        :param sentence_handler: The handler to process sentences. If want to use coreference, instantiate and pass.
+        CoreferenceHandler instance
+        :param random_state: The random state to reproduce summarizations.
+        :param hidden_concat: Whether or not to concat multiple hidden layers.
+        :param gpu_id: GPU device index if CUDA is available.
+        """
+        np.random.seed(random_state)
+        self.model = BertParent(model, custom_model, custom_tokenizer, gpu_id)
+        self.hidden = hidden
+        self.reduce_option = reduce_option
+        self.sentence_handler = sentence_handler
+        self.random_state = random_state
+        self.hidden_concat = hidden_concat
+    def cluster_runner(
+        self,
+        content: List[str],
+        ratio: float = 0.2,
+        algorithm: str = 'kmeans',
+        use_first: bool = True,
+        num_sentences: int = None
+    ) -> Tuple[List[str], np.ndarray]:
+        """
+        Runs the cluster algorithm based on the hidden state. Returns both the embeddings and sentences.
+        :param content: Content list of sentences.
+        :param ratio: The ratio to use for clustering.
+        :param algorithm: Type of algorithm to use for clustering.
+        :param use_first: Return the first sentence in the output (helpful for news stories, etc).
+        :param num_sentences: Number of sentences to use for summarization.
+        :return: A tuple of summarized sentences and embeddings
+        """
+        if num_sentences is not None:
+            num_sentences = num_sentences if use_first else num_sentences
+        hidden = self.model(
+            content, self.hidden, self.reduce_option, hidden_concat=self.hidden_concat)
+        hidden_args = ClusterFeatures(
+            hidden, algorithm, random_state=self.random_state).cluster(ratio, num_sentences)
+        if use_first:
+            if not hidden_args:
+                hidden_args.append(0)
+            elif hidden_args[0] != 0:
+                hidden_args.insert(0, 0)
+        sentences = [content[j] for j in hidden_args]
+        embeddings = np.asarray([hidden[j] for j in hidden_args])
+        return sentences, embeddings
+    def __run_clusters(
+        self,
+        content: List[str],
+        ratio: float = 0.2,
+        algorithm: str = 'kmeans',
+        use_first: bool = True,
+        num_sentences: int = None
+    ) -> List[str]:
+        """
+        Runs clusters and returns sentences.
+        :param content: The content of sentences.
+        :param ratio: Ratio to use for for clustering.
+        :param algorithm: Algorithm selection for clustering.
+        :param use_first: Whether to use first sentence
+        :param num_sentences: Number of sentences. Overrides ratio.
+        :return: summarized sentences
+        """
+        sentences, _ = self.cluster_runner(
+            content, ratio, algorithm, use_first, num_sentences)
+        return sentences
+    def __retrieve_summarized_embeddings(
+        self,
+        content: List[str],
+        ratio: float = 0.2,
+        algorithm: str = 'kmeans',
+        use_first: bool = True,
+        num_sentences: int = None
+    ) -> np.ndarray:
+        """
+        Retrieves embeddings of the summarized sentences.
+        :param content: The content of sentences.
+        :param ratio: Ratio to use for for clustering.
+        :param algorithm: Algorithm selection for clustering.
+        :param use_first: Whether to use first sentence
+        :return: Summarized embeddings
+        """
+        _, embeddings = self.cluster_runner(
+            content, ratio, algorithm, use_first, num_sentences)
+        return embeddings
+    def calculate_elbow(
+        self,
+        body: str,
+        algorithm: str = 'kmeans',
+        min_length: int = 40,
+        max_length: int = 600,
+        k_max: int = None,
+    ) -> List[float]:
+        """
+        Calculates elbow across the clusters.
+        :param body: The input body to summarize.
+        :param algorithm: The algorithm to use for clustering.
+        :param min_length: The min length to use.
+        :param max_length: The max length to use.
+        :param k_max: The maximum number of clusters to search.
+        :return: List of elbow inertia values.
+        """
+        sentences = self.sentence_handler(body, min_length, max_length)
+        if k_max is None:
+            k_max = len(sentences) - 1
+        hidden = self.model(sentences, self.hidden,
+                            self.reduce_option, hidden_concat=self.hidden_concat)
+        elbow = ClusterFeatures(
+            hidden, algorithm, random_state=self.random_state).calculate_elbow(k_max)
+        return elbow
+    def calculate_optimal_k(
+        self,
+        body: str,
+        algorithm: str = 'kmeans',
+        min_length: int = 40,
+        max_length: int = 600,
+        k_max: int = None,
+    ):
+        """
+        Calculates the optimal Elbow K.
+        :param body: The input body to summarize.
+        :param algorithm: The algorithm to use for clustering.
+        :param min_length: The min length to use.
+        :param max_length: The max length to use.
+        :param k_max: The maximum number of clusters to search.
+        :return:
+        """
+        sentences = self.sentence_handler(body, min_length, max_length)
+        if k_max is None:
+            k_max = len(sentences) - 1
+        hidden = self.model(sentences, self.hidden,
+                            self.reduce_option, hidden_concat=self.hidden_concat)
+        optimal_k = ClusterFeatures(
+            hidden, algorithm, random_state=self.random_state).calculate_optimal_cluster(k_max)
+        return optimal_k
+    def run_embeddings(
+        self,
+        body: str,
+        ratio: float = 0.2,
+        min_length: int = 40,
+        max_length: int = 600,
+        use_first: bool = True,
+        algorithm: str = 'kmeans',
+        num_sentences: int = None,
+        aggregate: str = None,
+    ) -> Optional[np.ndarray]:
+        """
+        Preprocesses the sentences, runs the clusters to find the centroids, then combines the embeddings.
+        :param body: The raw string body to process
+        :param ratio: Ratio of sentences to use
+        :param min_length: Minimum length of sentence candidates to utilize for the summary.
+        :param max_length: Maximum length of sentence candidates to utilize for the summary
+        :param use_first: Whether or not to use the first sentence
+        :param algorithm: Which clustering algorithm to use. (kmeans, gmm)
+        :param num_sentences: Number of sentences to use. Overrides ratio.
+        :param aggregate: One of mean, median, max, min. Applied on zero axis
+        :return: A summary embedding
+        """
+        sentences = self.sentence_handler(body, min_length, max_length)
+        if sentences:
+            embeddings = self.__retrieve_summarized_embeddings(
+                sentences, ratio, algorithm, use_first, num_sentences)
+            if aggregate is not None:
+                assert aggregate in [
+                    'mean', 'median', 'max', 'min'], "aggregate must be mean, min, max, or median"
+                embeddings = self.aggregate_map[aggregate](embeddings, axis=0)
+            return embeddings
+        return None
+    def run(
+        self,
+        body: str,
+        ratio: float = 0.2,
+        min_length: int = 40,
+        max_length: int = 600,
+        use_first: bool = True,
+        algorithm: str = 'kmeans',
+        num_sentences: int = None,
+        return_as_list: bool = False
+    ) -> Union[List, str]:
+        """
+        Preprocesses the sentences, runs the clusters to find the centroids, then combines the sentences.
+        :param body: The raw string body to process
+        :param ratio: Ratio of sentences to use
+        :param min_length: Minimum length of sentence candidates to utilize for the summary.
+        :param max_length: Maximum length of sentence candidates to utilize for the summary
+        :param use_first: Whether or not to use the first sentence
+        :param algorithm: Which clustering algorithm to use. (kmeans, gmm)
+        :param num_sentences: Number of sentences to use (overrides ratio).
+        :param return_as_list: Whether or not to return sentences as list.
+        :return: A summary sentence
+        """
+        sentences = self.sentence_handler(body, min_length, max_length)
+        if sentences:
+            sentences = self.__run_clusters(
+                sentences, ratio, algorithm, use_first, num_sentences)
+        if return_as_list:
+            return sentences
+        else:
+            return ' '.join(sentences)
+    def __call__(
+        self,
+        body: str,
+        ratio: float = 0.2,
+        min_length: int = 40,
+        max_length: int = 600,
+        use_first: bool = True,
+        algorithm: str = 'kmeans',
+        num_sentences: int = None,
+        return_as_list: bool = False,
+    ) -> str:
+        """
+        (utility that wraps around the run function)
+        Preprocesses the sentences, runs the clusters to find the centroids, then combines the sentences.
+        :param body: The raw string body to process.
+        :param ratio: Ratio of sentences to use.
+        :param min_length: Minimum length of sentence candidates to utilize for the summary.
+        :param max_length: Maximum length of sentence candidates to utilize for the summary.
+        :param use_first: Whether or not to use the first sentence.
+        :param algorithm: Which clustering algorithm to use. (kmeans, gmm)
+        :param Number of sentences to use (overrides ratio).
+        :param return_as_list: Whether or not to return sentences as list.
+        :return: A summary sentence.
+        """
+        return self.run(
+            body, ratio, min_length, max_length, algorithm=algorithm, use_first=use_first, num_sentences=num_sentences,
+            return_as_list=return_as_list
+        )
+class Summarizer(ModelProcessor):
+    def __init__(
+        self,
+        model: str = 'bert-large-uncased',
+        custom_model: PreTrainedModel = None,
+        custom_tokenizer: PreTrainedTokenizer = None,
+        hidden: Union[List[int], int] = -2,
+        reduce_option: str = 'mean',
+        sentence_handler: SentenceHandler = SentenceHandler(),
+        random_state: int = 12345,
+        hidden_concat: bool = False,
+        gpu_id: int = 0,
+    ):
+        """
+        This is the main Bert Summarizer class.
+        :param model: This parameter is associated with the inherit string parameters from the transformers library.
+        :param custom_model: If you have a pre-trained model, you can add the model class here.
+        :param custom_tokenizer: If you have a custom tokenizer, you can add the tokenizer here.
+        :param hidden: This signifies which layer of the BERT model you would like to use as embeddings.
+        :param reduce_option: Given the output of the bert model, this param determines how you want to reduce results.
+        :param greedyness: associated with the neuralcoref library. Determines how greedy coref should be.
+        :param language: Which language to use for training.
+        :param random_state: The random state to reproduce summarizations.
+        :param hidden_concat: Whether or not to concat multiple hidden layers.
+        :param gpu_id: GPU device index if CUDA is available.
+        """
+        super(Summarizer, self).__init__(
+            model, custom_model, custom_tokenizer, hidden, reduce_option, sentence_handler, random_state, hidden_concat, gpu_id
+        )
+class TransformerSummarizer(ModelProcessor):
+    """
+    Newer style that has keywords for models and tokenizers, but allows the user to change the type.
+    """
+    MODEL_DICT = {
+        'Bert': (BertModel, BertTokenizer),
+        'OpenAIGPT': (OpenAIGPTModel, OpenAIGPTTokenizer),
+        'GPT2': (GPT2Model, GPT2Tokenizer),
+        'CTRL': (CTRLModel, CTRLTokenizer),
+        'TransfoXL': (TransfoXLModel, TransfoXLTokenizer),
+        'XLNet': (XLNetModel, XLNetTokenizer),
+        'XLM': (XLMModel, XLMTokenizer),
+        'DistilBert': (DistilBertModel, DistilBertTokenizer),
+    }
+    def __init__(
+        self,
+        transformer_type: str = 'Bert',
+        transformer_model_key: str = 'bert-base-uncased',
+        transformer_tokenizer_key: str = None,
+        hidden: Union[List[int], int] = -2,
+        reduce_option: str = 'mean',
+        sentence_handler: SentenceHandler = SentenceHandler(),
+        random_state: int = 12345,
+        hidden_concat: bool = False,
+        gpu_id: int = 0,
+    ):
+        """
+        :param transformer_type: The Transformer type, such as Bert, GPT2, DistilBert, etc.
+        :param transformer_model_key: The transformer model key. This is the directory for the model.
+        :param transformer_tokenizer_key: The transformer tokenizer key. This is the tokenizer directory.
+        :param hidden: The hidden output layers to use for the summarization.
+        :param reduce_option: The reduce option, such as mean, max, min, median, etc.
+        :param sentence_handler: The sentence handler class to process the raw text.
+        :param random_state: The random state to use.
+        :param hidden_concat: Deprecated hidden concat option.
+        :param gpu_id: GPU device index if CUDA is available.
+        """
+        try:
+            self.MODEL_DICT['Roberta'] = (RobertaModel, RobertaTokenizer)
+            self.MODEL_DICT['Albert'] = (AlbertModel, AlbertTokenizer)
+            self.MODEL_DICT['Camembert'] = (CamembertModel, CamembertTokenizer)
+            self.MODEL_DICT['Bart'] = (BartModel, BartTokenizer)
+            self.MODEL_DICT['Longformer'] = (LongformerModel, LongformerTokenizer)
+        except Exception:
+            pass  # older transformer version
+        model_clz, tokenizer_clz = self.MODEL_DICT[transformer_type]
+        model = model_clz.from_pretrained(
+            transformer_model_key, output_hidden_states=True)
+        tokenizer = tokenizer_clz.from_pretrained(
+            transformer_tokenizer_key if transformer_tokenizer_key is not None else transformer_model_key
+        )
+        super().__init__(
+            None, model, tokenizer, hidden, reduce_option, sentence_handler, random_state, hidden_concat, gpu_id
+        )

summarizer/sentence_handler.py ADDED Viewed

	@@ -0,0 +1,73 @@

+from typing import List
+from spacy.lang.en import English
+class SentenceHandler(object):
+    def __init__(self, language=English):
+        """
+        Base Sentence Handler with Spacy support.
+        :param language: Determines the language to use with spacy.
+        """
+        self.nlp = language()
+        try:
+            # Supports spacy 2.0
+            self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
+            self.is_spacy_3 = False
+        except Exception:
+            # Supports spacy 3.0
+            self.nlp.add_pipe("sentencizer")
+            self.is_spacy_3 = True
+    def sentence_processor(self, doc,
+                           min_length: int = 40,
+                           max_length: int = 600) -> List[str]:
+        """
+        Processes a given spacy document and turns them into sentences.
+        :param doc: The document to use from spacy.
+        :param min_length: The minimum length a sentence should be to be considered.
+        :param max_length: The maximum length a sentence should be to be considered.
+        :return: Sentences.
+        """
+        to_return = []
+        for c in doc.sents:
+            if max_length > len(c.text.strip()) > min_length:
+                if self.is_spacy_3:
+                    to_return.append(c.text.strip())
+                else:
+                    to_return.append(c.string.strip())
+        return to_return
+    def process(self, body: str,
+                min_length: int = 40,
+                max_length: int = 600) -> List[str]:
+        """
+        Processes the content sentences.
+        :param body: The raw string body to process
+        :param min_length: Minimum length that the sentences must be
+        :param max_length: Max length that the sentences mus fall under
+        :return: Returns a list of sentences.
+        """
+        doc = self.nlp(body)
+        return self.sentence_processor(doc, min_length, max_length)
+    def __call__(self, body: str,
+                 min_length: int = 40,
+                 max_length: int = 600) -> List[str]:
+        """
+        Processes the content sentences.
+        :param body: The raw string body to process
+        :param min_length: Minimum length that the sentences must be
+        :param max_length: Max length that the sentences mus fall under
+        :return: Returns a list of sentences.
+        """
+        return self.process(body, min_length, max_length)