{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pre-requisites\n",
    "\n",
    "- Python 3.5+\n",
    "- Python packages: \n",
    "    - `pip install bs4 pandas mmh3`\n",
    "- [Indic NLP Library](https://github.com/anoopkunchukuttan/indic_nlp_library)\n",
    "- [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Initialize the Indic NLP Library\n",
    "\n",
    "Run the cell below to initialize the Indic NLP Library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The path to the local git repo for Indic NLP Library\n",
    "INDIC_NLP_LIB_HOME=\"/disk1/src/indic_nlp_library\"\n",
    "\n",
    "# The path to the local git repo for Indic NLP Resources\n",
    "INDIC_NLP_RESOURCES=\"/disk1/src/indic_nlp_resources\"\n",
    "\n",
    "import sys\n",
    "sys.path.append('{}/src'.format(INDIC_NLP_LIB_HOME))\n",
    "\n",
    "from indicnlp import common\n",
    "common.set_resources_path(INDIC_NLP_RESOURCES)\n",
    "\n",
    "from indicnlp import loader\n",
    "loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import os\n",
    "import string\n",
    "import indicnlp\n",
    "from indicnlp.tokenize import indic_tokenize\n",
    "from indicnlp.normalize import indic_normalize\n",
    "from indicnlp.transliterate import unicode_transliterate\n",
    "from indicnlp.tokenize import sentence_tokenize\n",
    "import re\n",
    "import collections\n",
    "import random\n",
    "import mmh3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Common Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess_sent(text,lang,normalizer):\n",
    "    \"\"\"\n",
    "        Pre-process text (normalization and tokenization)\n",
    "        \n",
    "        text: text string to preprocess\n",
    "        lang: language code (2-letter ISO code)\n",
    "        normalizer: normalizer object for language\n",
    "        \n",
    "        returns the processed text string\n",
    "    \"\"\"\n",
    "    return ' '.join(indic_tokenize.trivial_tokenize(normalizer.normalize(text.replace('\\n',' ')),lang)) \n",
    "\n",
    "def sent_split(text,lang):\n",
    "    \"\"\"\n",
    "        Sentence splitter\n",
    "        \n",
    "        text: text to sentence split \n",
    "        lang: language\n",
    "        \n",
    "        returns list of sentences \n",
    "    \"\"\"\n",
    "    return  sentence_tokenize.sentence_split(text,lang)\n",
    "\n",
    "def extract_all_content(indir,lang,\n",
    "                        article_extract_fn,\n",
    "                        preprocess_fn=preprocess_sent,\n",
    "                        narticles=-1,\n",
    "                        start_artid=0):\n",
    "    \"\"\"\n",
    "    This method reads all files from the input directory, extracts text content from each file,\n",
    "    and pre-processes the text. This method is a generator. \n",
    "    For each sentence, the method yields a tuple of the format: \n",
    "    \n",
    "    (artid, fname, paraid, sentid, processed_text)\n",
    "    \n",
    "    indir: path to input directoryo containing files to be parsed \n",
    "    \n",
    "    lang: language to the files in the input directory\n",
    "    \n",
    "    article_extract_fn: the function to extract text content from each file. \n",
    "        Signature of the function: get_article_contents(fname,lang,encoding) \n",
    "          `fname` is name of the file, `lang` is langcode, \n",
    "          `encoding` is text-encoding (default=utf-8). \n",
    "        The function yields a tuple (paraid, sentid, extracted_text) \n",
    "        for each  sentence.\n",
    "        \n",
    "    preprocess_fn: pre-processing function to apply to the extracted text. \n",
    "        The function takes a string as input and returns processed string as output.\n",
    "        \n",
    "    narticles: extract and process the first `narticles` from input directory. \n",
    "        if narticles=-1 (default), all files are extracted\n",
    "        \n",
    "    start_artid: the start of the article id to assign to extracted articles (default=0)\n",
    "    \n",
    "    \"\"\"\n",
    "\n",
    "    fnames = os.listdir(indir)\n",
    "    if narticles>0:\n",
    "        fnames=fnames[:narticles]\n",
    "    nsent=0\n",
    "\n",
    "    normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
    "    normalizer=normalizer_factory.get_normalizer(lang)\n",
    "             \n",
    "    print('Number of articles: {}'.format(len(fnames)))\n",
    "    for artid, fname in enumerate(fnames,start_artid):\n",
    "#         print(fname)\n",
    "        if artid%100 == 0:\n",
    "            print('({}|{})'.format(artid,nsent),end=' ... ')\n",
    "        \n",
    "        try:\n",
    "            fpath=os.sep.join([indir,fname])\n",
    "            for paraid, sentid, sent in article_extract_fn(fpath,lang):\n",
    "                nsent+=1\n",
    "                yield( ( artid, fname, paraid, sentid, preprocess_fn(sent,lang,normalizer) ) )\n",
    "        except:\n",
    "            print('Cannot parse {}'.format(fname))\n",
    "                \n",
    "def write_corpus(corpus_iterator,content_fname,article_mapping_fname,delimiter=' ||| ', encoding='utf-8'):\n",
    "    \"\"\"\n",
    "    Writes the extracted corpus to a file. The extracted data is organized in terms of articles, paragraphs \n",
    "    and sentences. The following is the format of the output file: \n",
    "        - one line per sentence\n",
    "        - format of line: article_id, para_id, sent_id, sentence\n",
    "    In addition to the content file mention, a metadata file which maps the article id to the filename is also written. \n",
    "    \n",
    "    corpus_iterator: iterator over the corpus, yielding tuple (artid, fname, paraid, sentid, processed_text). \n",
    "        The function `extract_all_content` yields a generator in this format. \n",
    "    content_fname: output content file to write the extracted data to in the format mentioned above\n",
    "    article_mapping_fname: output metadata file to write article id to filename mapping.\n",
    "    delimiter=' ||| ': delimiter for the content file. The default delimiter is the same \n",
    "                        as used in the Moses phrase table\n",
    "    encoding: text encoding default - 'utf-8'\n",
    "    \n",
    "    \"\"\"\n",
    "    \n",
    "    artid_name_mapping={}\n",
    "    with open(content_fname,'w',encoding=encoding) as contentfile:\n",
    "        for artid, fname, paraid, sentid, text in corpus_iterator:\n",
    "            contentfile.write(delimiter.join([str(artid), str(paraid), str(sentid), text]) + '\\n')\n",
    "            artid_name_mapping[artid]=fname\n",
    "\n",
    "    with open(article_mapping_fname,'w',encoding=encoding) as artmappingfile:\n",
    "        for artid, name in sorted(artid_name_mapping.items(),key=lambda x: x[0]):\n",
    "            artmappingfile.write('{} {} {}\\n'.format(artid,delimiter,name))\n",
    "\n",
    "def convert_txt_to_csv_format(infname, outfname, encoding='utf-8'):\n",
    "    \"\"\"\n",
    "    convert txt file to csv format. This method is used when the text file is directly available.\n",
    "    The input file has one sentence per line. Assumed to be preprocessed (tokenized, normalized)\n",
    "    \n",
    "    \"\"\"\n",
    "    with open(infname,'r',encoding=encoding) as infile, \\\n",
    "         open(outfname,'w',encoding=encoding) as outfile: \n",
    "        for i, line in enumerate(infile):\n",
    "            outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,line.strip()))\n",
    "            \n",
    "def preprocess_convert_txt_to_csv_format(infname, outfname, lang, encoding='utf-8'):\n",
    "    \"\"\"\n",
    "    Convert raw text file to csv format\n",
    "    \"\"\"\n",
    "    \n",
    "    normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
    "    normalizer=normalizer_factory.get_normalizer(lang)\n",
    "    \n",
    "    with open(infname,'r',encoding=encoding) as infile, \\\n",
    "         open(outfname,'w',encoding=encoding) as outfile: \n",
    "        i=0\n",
    "        for line in infile:\n",
    "            sents = sent_split(line.strip(),lang)\n",
    "            for sent in sents:\n",
    "                outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,\n",
    "                                                    preprocess_sent(sent.strip(), lang, normalizer)) )\n",
    "                i=i+1\n",
    "\n",
    "def print_txt(infnames, outfname, encoding='utf-8'):\n",
    "    \"\"\"\n",
    "    Extract only the text from the content csv file. The output file has one sentence per file.\n",
    "    \"\"\"\n",
    "    with open(outfname,'w',encoding=encoding) as outfile: \n",
    "        for infname in filter(lambda x: os.path.isfile(x),infnames):\n",
    "            with open(infname,'r',encoding=encoding) as infile:\n",
    "                for i, line in enumerate(infile):\n",
    "                    fields=line.strip().split('|||')\n",
    "                    if len(fields) >=4:\n",
    "                        outfile.write('{}\\n'.format(fields[3].strip()))\n",
    "                        \n",
    "# def dedup_and_print_txt(infnames, outfname, encoding='utf-8'):\n",
    "    \n",
    "#     total=0\n",
    "#     unique=0\n",
    "#     hash_codes=set()\n",
    "    \n",
    "#     with open(outfname,'w',encoding=encoding) as outfile: \n",
    "#         for infname in filter(lambda x: os.path.isfile(x),infnames):\n",
    "#             with open(infname,'r',encoding=encoding) as infile:\n",
    "#                 for i, line in enumerate(infile):\n",
    "#                     fields=line.strip().split('|||')\n",
    "#                     if len(fields) >=4:\n",
    "#                         sent=fields[3].strip()\n",
    "#                         total+=1\n",
    "#                         hs=hash(sent)\n",
    "#                         if hs not in hash_codes:\n",
    "#                             outfile.write('{}\\n'.format(sent))\n",
    "#                             hash_codes.add(hs)\n",
    "#                             unique+=1\n",
    "    \n",
    "#     print('Total: {}'.format(total))\n",
    "#     print('Unique: {}'.format(unique))\n",
    "\n",
    "def dedup_shuffle_and_print_txt(infnames, outfname, max_buf_size=100000,encoding='utf-8'):\n",
    "    \"\"\"\n",
    "    The method creates a sentence level corpora from multiple content csv files.\n",
    "    All sentences are extracted, they are de-duplicated using murmurhash and shuffled\n",
    "    before writing the entire corpus to the output file. The output file has one sentence per line.\n",
    "\n",
    "    \"\"\"\n",
    "    \n",
    "    total=0\n",
    "    unique=0\n",
    "    hash_codes=set()\n",
    "    sent_buffer=[]\n",
    "    \n",
    "    with open(outfname,'w',encoding=encoding) as outfile: \n",
    "        for infname in filter(lambda x: os.path.isfile(x),infnames):\n",
    "            print('Processing: {}'.format(infname))\n",
    "            with open(infname,'r',encoding=encoding) as infile:\n",
    "                for i, line in enumerate(infile):\n",
    "                    fields=line.strip().split('|||')\n",
    "                    if len(fields) >=4:\n",
    "                        sent=fields[3].strip()\n",
    "                        total+=1\n",
    "#                         hs=hash(sent)\n",
    "                        hs=mmh3.hash128(sent)\n",
    "                        if hs not in hash_codes:\n",
    "#                             outfile.write('{}\\n'.format(sent))\n",
    "                            sent_buffer.append(sent)\n",
    "                            hash_codes.add(hs)\n",
    "                            unique+=1\n",
    "                    if len(sent_buffer)>=max_buf_size:\n",
    "                        random.shuffle(sent_buffer)\n",
    "                        for sent in sent_buffer: \n",
    "                            outfile.write('{}\\n'.format(sent))\n",
    "                        sent_buffer.clear()\n",
    "                \n",
    "        if len(sent_buffer)>0:\n",
    "            random.shuffle(sent_buffer)\n",
    "            for sent in sent_buffer: \n",
    "                outfile.write('{}\\n'.format(sent))\n",
    "            sent_buffer.clear()                    \n",
    "                        \n",
    "    print('Total: {}'.format(total))\n",
    "    print('Unique: {}'.format(unique))\n",
    "\n",
    "def extract_wikiextractor_file(infname, outfname, lang, \n",
    "                               encoding='utf-8', delimiter=' ||| ', preprocess_fn=preprocess_sent):\n",
    "    \"\"\"\n",
    "    Extract text content into a content csv file from wikipedia article page. \n",
    "    The wikipedia article page is the output from `wikiextractor` [https://github.com/attardi/wikiextractor] \n",
    "    \n",
    "    \"\"\"\n",
    "    normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
    "    normalizer=normalizer_factory.get_normalizer(lang)\n",
    "    \n",
    "    with open(infname,'r',encoding=encoding) as infile, \\\n",
    "         open(outfname,'w',encoding=encoding) as outfile: \n",
    "        artid=-1\n",
    "        paraid=0\n",
    "        for line in infile:\n",
    "            if line.find('<doc')==0:\n",
    "                artid+=1\n",
    "                paraid=0\n",
    "                continue\n",
    "            if line.find('</doc')==0:\n",
    "                continue\n",
    "            if len(line.strip())>0:\n",
    "                for sentid, sent in enumerate(sent_split(line.strip(),lang)):\n",
    "                    sent=sent.strip()\n",
    "                    if sent!='':\n",
    "                        sent = preprocess_fn(sent,lang,normalizer)\n",
    "                        outfile.write(delimiter.join([str(artid), str(paraid), str(sentid), sent]) + '\\n')\n",
    "                paraid+=1\n",
    "\n",
    "                \n",
    "def extract_leipzig_corpus(infname,outfname,lang,encoding='utf-8'):\n",
    "    \"\"\"\n",
    "    Extractor for files form the Leipzig corpus\n",
    "    [http://wortschatz.uni-leipzig.de/en/download/]\n",
    "    \n",
    "    \"\"\"\n",
    "    normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
    "    normalizer=normalizer_factory.get_normalizer(lang)    \n",
    "\n",
    "    with open(infname,'r',encoding=encoding) as infile, \\\n",
    "         open(outfname,'w',encoding=encoding) as outfile: \n",
    "        for i, line in enumerate(infile):\n",
    "            outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,preprocess_sent(line,lang,normalizer)))                \n",
    "                \n",
    "def dataset_stats(fname):\n",
    "    \"\"\"\n",
    "    Extracts dataset statistics from the final extracted file. This input file contains\n",
    "    one sentence per line. The sentences are tokenized.\n",
    "    \"\"\"\n",
    "\n",
    "    all_puncs=set(string.punctuation+'\\u0964\\u0965')\n",
    "    \n",
    "    sent_count=0\n",
    "    token_cnt=0\n",
    "    true_token_cnt=0\n",
    "    tokens=set()\n",
    "    \n",
    "    with open(fname,'r',encoding='utf-8') as infile:\n",
    "        for line in infile:\n",
    "            sent_count+=1\n",
    "            a=line.strip().split(' ')\n",
    "            token_cnt+=len(a)\n",
    "            b=list(filter(lambda x: x not in all_puncs,a))\n",
    "            true_token_cnt+=len(b)\n",
    "            tokens.update(b)\n",
    "    \n",
    "    print('== Stats ==')\n",
    "    print('Sent count: {}'.format(sent_count))\n",
    "    print('Token count: {}'.format(token_cnt))\n",
    "    print('True Token count: {}'.format(true_token_cnt))\n",
    "    print('Unique Token count: {}'.format(len(tokens)))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Marathi"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Wikipedia"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Wikipedia extraction commands using wikiextractor\n",
    "\n",
    "```\n",
    "### This uses WikiExtractor (https://github.com/attardi/wikiextractor)\n",
    "\n",
    "x=/disk1/crawl_project/ta/wikipedia\n",
    "mkdir $x\n",
    "cd $x\n",
    "wget https://dumps.wikimedia.org/tawiki/20190501/tawiki-20190501-pages-articles-multistream.xml.bz2\n",
    "cd /disk1/src/wikiextractor\n",
    "python3 WikiExtractor.py -cb 250k -o $x/extracted $x/tawiki-20190501-pages-articles-multistream.xml.bz2\n",
    "cd -\n",
    "find extracted -name '*bz2' -exec bunzip2 -c {} \\; > text.xml\n",
    "rm text.xml\n",
    "rm tawiki-20190501-pages-articles-multistream.xml.bz2\n",
    "rm -rf extracted\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "mrwiki-20190401-pages-articles-multistream.xml.bz2\n",
    "\n",
    "INFO: Finished 1-process extraction of 53715 articles in 123.6s (434.7 art/s)\n",
    "\n",
    "INFO: total of page: 102025, total of articl page: 53715; total of used articl page: 53715"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Post-processing output generated by wikiextractor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## tex.xml is extracted as shown in commanfs above\n",
    "extract_wikiextractor_file('text.xml',\n",
    "                           'content_fname1.csv',\n",
    "                           'mr')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Loksatta"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Extractor function for Marathi Loksatta page**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_article_contents_mr_loksatta(fname,lang,encoding='utf-8'):\n",
    "    with open(fname,'r',encoding=encoding) as infile: \n",
    "        soup = BeautifulSoup(infile)\n",
    "        for elem in soup.find_all('div'):\n",
    "            if 'itemprop' in elem.attrs and 'articleBody' in elem['itemprop']:\n",
    "                filtered_paras=list(filter(lambda x: x.name=='p' and len(x.attrs)==0,elem.children))\n",
    "                paraid=0\n",
    "                for blockid, block in enumerate(filtered_paras):\n",
    "#                     print('Para: {}'.format(blockid))\n",
    "#                     print(list(block.strings))\n",
    "                    text=' '.join(block.strings)\n",
    "                    if blockid==0 and text.find(':')>=0 and text.find(':')<20:\n",
    "                        text=':'.join(text.split(':')[1:])\n",
    "                    for para_text in text.split('\\n'): \n",
    "                        for sentid, sent in enumerate(sent_split(para_text,lang)):\n",
    "                            sent=sent.strip()\n",
    "                            if sent!='':\n",
    "    #                             print('{}: {}'.format(sentid, sent))\n",
    "                                yield((paraid,sentid,sent))\n",
    "    #                             yield((paraid,sentid,preprocess_sent(sent,'ml',normalizer)))\n",
    "    #                     print()   \n",
    "                        paraid+=1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Extracting data from crawled HTML files**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lang='mr'\n",
    "posts_dir='directory_containing_crawled_html_pages'\n",
    "content_fname='content_fname2.csv'\n",
    "article_mapping_fname='article_mapping_fname'\n",
    "get_article_contents=get_article_contents_mr_loksatta\n",
    "narticles=-1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "write_corpus(\n",
    "             extract_all_content(posts_dir, lang, article_extract_fn=get_article_contents,narticles=narticles),\n",
    "             content_fname,\n",
    "             article_mapping_fname\n",
    "            )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Aggregating all crawled data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### aggregating, de-duplicating and shuffling all the data \n",
    "dedup_shuffle_and_print_txt([ 'content_fname1.csv', 'content_fname2.csv'  ],  'output_fname.txt' )\n",
    "### extract dataset statistics\n",
    "dataset_stats('output_fname.txt')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {
    "height": "703px",
    "width": "326px"
   },
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}