import arxiv
import pandas as pd
import numpy as np
import cleaning as clean
from sklearn.base import TransformerMixin, BaseEstimator


class Fetch(BaseEstimator, TransformerMixin):
    def fit(self):
        return self

    def transform(self, X, y=None):
        return query_to_df(id_list=X)


class ArXivData:
    """A class for storing the metadata of a collection of arXiv papers."""

    def __init__(self) -> None:
        self._returned_metadata = None
        self.metadata = None
        self.arxiv_subjects = None
        self.doc_strings = "title and abstract"
        self.embeddings = None

    def load_from_feather(self, path_to_dataset):
        """Loads metadata from a saved feather file.

        Args:
            path_to_dataset: path to the feather file containing the dataset.
        """
        self._returned_metadata = pd.read_feather(path_to_dataset)
        self.metadata = self._returned_metadata
        self.arxiv_subjects = clean.OHE_arxiv_subjects(self.metadata)

    def load_from_query(self, query, max_results, offset=0):
        """Loads instance with data returned from an ArXiv API query.

        Args:
            query: query string used to call the API
            max_results: maximum number of results from the API call to return
            offset: number of results to skip over initially. Defaults to 0.
        """

        self._returned_metadata = query_to_df(
            query=query, max_results=max_results, offset=offset
        )
        self.metadata = clean.split_categories(self._returned_metadata)
        self.arxiv_subjects = clean.OHE_arxiv_subjects(self.metadata)

    def load_from_id_list(self, id_list):
        self._returned_metadata = query_to_df(id_list=id_list, max_results=len(id_list))
        self.metadata = clean.split_categories(self._returned_metadata)
        self.arxiv_subjects = clean.OHE_arxiv_subjects(self.metadata)

    def save_as_feather(self, path_to_dataset):
        """Saves a dataset as a feather file.

        Args:
            path_to_dataset: directory to save the dataset

        Raises:
            Exception: Raises exception if there is no data to be saved.
        """

        if self.metadata.empty:
            raise Exception(
                "No data stored. Run load_from_query or load_from_feather to retrieve data."
            )
        self.metadata.to_feather(path_to_dataset)


def query_to_df(query=None, id_list=None, max_results=10, offset=0):
    """Returns the results of an arxiv API query in a pandas dataframe.

    Args:
        query: string defining an arxiv query formatted according to
        https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction

        max_results: positive integer specifying the maximum number of results returned.

        id_list: A list of arxiv ids as strings to retrieve

    Returns:
        pandas dataframe with one column for indivial piece of metadata of a returned result.
        To see a list of these columns and their descriptions, see the documentation for the Results class of the arxiv package here:
        http://lukasschwab.me/arxiv.py/index.html#Result

        The 'links' column is dropped and the authors column is a list of each author's name as a string.
        The categories column is also a list of all tags appearing.
    """
    client = arxiv.Client(page_size=2000, num_retries=10)

    if id_list:
        max_results = len(id_list)
        search = arxiv.Search(
            id_list=id_list,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.LastUpdatedDate,
        )

    else:
        if not query:
            raise Exception(
                "You must pass either a query string or a list of arxiv IDs"
            )

        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.LastUpdatedDate,
        )

    columns = ["title", "abstract", "authors", "categories", "id"]

    results = client.results(search, offset=offset)

    metadata_generator = (
        (
            result.title,
            result.summary,
            [author.name for author in result.authors],
            result.categories,
            result.entry_id.split("/")[-1],
        )
        for result in results
    )

    returned_metadata = pd.DataFrame(metadata_generator, columns=columns)
    return returned_metadata


# def format_query(author="", title="", cat="", abstract=""):
#     """Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
#     leave the corresponding argument blank.

#     e.g. format_query(cat='math.AP') will return the string used to pull all articles with the subject tag 'PDEs'.

#     Args:
#         author: string to search for in the author field.
#         title: string to search for in the title field.
#         cat: A valid arxiv subject tag. See the full list of these at:
#         https://arxiv.org/category_taxonomy
#         abstract: string to search for in the abstract field.

#     Returns:
#         properly formatted query string to return all results simultaneously matching all specified fields.
#     """

#     tags = [f"au:{author}", f"ti:{title}", f"cat:{cat}", f"abs:{abstract}"]
#     query = " AND ".join([tag for tag in tags if not tag.endswith(":")])
#     return query