## Preliminary operations

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# install dependencies
! pip install farm-haystack[faiss-gpu]

Collecting farm-haystack[faiss-gpu]
  Downloading farm_haystack-1.4.0-py3-none-any.whl (524 kB)
[K     |████████████████████████████████| 524 kB 6.8 MB/s 
Collecting elastic-apm
  Downloading elastic_apm-6.9.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (374 kB)
[K     |████████████████████████████████| 374 kB 44.4 MB/s 
[?25hCollecting rapidfuzz
  Downloading rapidfuzz-2.0.11-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 45.7 MB/s 
[?25hCollecting mmh3
  Downloading mmh3-3.0.0-cp37-cp37m-manylinux2010_x86_64.whl (50 kB)
[K     |████████████████████████████████| 50 kB 6.7 MB/s 
Collecting mlflow
  Downloading mlflow-1.25.1-py3-none-any.whl (16.8 MB)
[K     |████████████████████████████████| 16.8 MB 720 kB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.0 MB/s 
Collecting langdetect
  D

## Load data

In [3]:
import glob
import json

In [4]:
DATA_DIRECTORY = '/content/drive/MyDrive/Colab Notebooks/wklp/data'

docs=[]

for json_file in glob.glob(f'{DATA_DIRECTORY}/*.json'):
    with open(json_file, 'r') as fin:
        json_content=json.load(fin)
        
    doc={'content': json_content['text'],
        'meta': {'name': json_content['name'],
                 'url': json_content['url']}}
    docs.append(doc)

In [5]:
len(docs)

1087

In [6]:
docs[5]

{'content': "Pete Lindstrom\nPete Lindstrom was a citizen of Twin Peaks, Washington who was killed in the Blizzard of 1889.\nHis death was witnessed by Knut Zimmerman, who reported that wind had plunged a candle from the Annual Candlelighting and Christmas Tree Ceremony into the back of Lindstrom's head, killing him.",
 'meta': {'name': 'Pete_Lindstrom',
  'url': 'https://twinpeaks.fandom.com/wiki/Pete_Lindstrom'}}

## Define document store ([FAISS](https://github.com/facebookresearch/faiss)) and write documents



In [8]:
from haystack.document_stores import FAISSDocumentStore

# the document store settings are those compatible with Embedding Retriever
document_store = FAISSDocumentStore(
    similarity="dot_product",
    embedding_dim=768)

INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/
ERROR - root -  Failed to import 'magic' (from 'python-magic' and 'python-magic-bin' on Windows). FileTypeClassifier will not perform mimetype detection on extensionless files. Please make sure the necessary OS libraries are installed if you need this functionality.
INFO - haystack.telemetry -  Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. More information at https://haystack.deepset.ai/guides/telemetry


In [9]:
# preprocess documents, splitting by chunks of 200 words

from haystack.nodes import PreProcessor

processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=200,
   split_respect_sentence_boundary=True,
    split_overlap=0,
    language ='en'
)
preprocessed_docs = processor.process(docs)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


100%|██████████| 1087/1087 [00:01<00:00, 980.44docs/s]


In [11]:
print(preprocessed_docs[5])


<Document: id=3f6b71a59e1226326e53871d05393810, content='Pete Lindstrom
Pete Lindstrom was a citizen of Twin Peaks, Washington who was killed in the Blizzard ...'>


In [12]:
len(preprocessed_docs)

2825

In [81]:
# write documents
document_store.write_documents(preprocessed_docs)


Writing Documents:   0%|          | 0/2825 [00:00<?, ?it/s]

## Define retriever (Embedding Retriever) and generate document embeddings


In [82]:
from haystack.nodes import EmbeddingRetriever

retriever = EmbeddingRetriever(
    document_store=document_store,
   embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
   model_format="sentence_transformers"
)
document_store.update_embeddings(retriever)

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.nodes.retriever.dense -  Init retriever using embeddings of model sentence-transformers/multi-qa-mpnet-base-dot-v1
INFO - haystack.document_stores.faiss -  Updating embeddings for 2811 docs...


Updating Embedding:   0%|          | 0/2811 [00:00<?, ? docs/s]

Batches:   0%|          | 0/88 [00:00<?, ?it/s]

## Save and export index


In [102]:
import shutil
import glob

In [100]:
document_store.save("my_faiss_index.faiss")

In [None]:
OUT_DIR = '/content/drive/MyDrive/Colab Notebooks/wklp/'

In [105]:
for f in glob.glob('*faiss*.*')+glob.glob('faiss*.*'):
  print(f)
  shutil.copy(f, OUT_DIR)

my_faiss_index.faiss
my_faiss_index.json
faiss_document_store.db
faiss_document_store.db


## Define reader

In [52]:
from haystack.nodes import FARMReader


In [88]:
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2-distilled", use_gpu=True)

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find deepset/roberta-base-squad2-distilled locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded deepset/roberta-base-squad2-distilled
INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.infer -  Got ya 2 parallel workers to do inference ...
INFO - haystack.modeling.infer -   0     0  
INFO - haystack.modeling.infer -  /w\   /w\ 
INFO - haystack.modeling.infer -  /'\   / \ 


## Define and try pipeline (retriever + reader)

In [89]:
from haystack.pipelines import ExtractiveQAPipeline


In [90]:
pipe = ExtractiveQAPipeline(reader, retriever)


In [91]:
import time
from haystack.utils import print_answers

In [99]:
start_time=time.time()

prediction = pipe.run(
    query="Where is Twin Peaks", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
)

end_time=time.time()

print()
print(end_time - start_time)
print_answers(prediction, details="medium")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  5.75 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  5.42 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.92 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 12.85 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 13.09 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  6.00 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 12.42 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 19.19 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 20.71 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 13.32 Batches/s]


2.474968910217285

Query: Where is Twin Peaks
Answers:
[   {   'answer': 'Washington',
        'context': 'Highway J\n'
                   'Highway J was a highway that ran through Twin Peaks, '
                   'Washington. Notable buildings\n'
                   "Gentleman Jim's\n"
                   "Horne's Department Store\n"
                   'Pine View Motel ',
        'score': 0.9937074482440948},
    {   'answer': 'Washington',
        'context': 'Chapel-in-the-Woods\n'
                   'Chapel-in-the-Woods was a chapel in Twin Peaks, '
                   'Washington. Hank Jennings and Norma Jennings as well as Ed '
                   'Hurley and Nadine Hurle',
        'score': 0.9566615521907806},
    {   'answer': 'northeastern Washington State',
        'context': 'eriff Harry S. Truman\n'
                   'Twin Peaks was a small logging town in northeastern '
                   'Washington State, five miles south of the Canadian border '
                   'and twe




In [71]:
print_answers(prediction, details="medium")



Query: Who killed Laura Palmer?
Answers:
[   {   'answer': 'Leland',
        'context': '" he remembered the name Laura had whispered into his ear '
                   'in his dream.\n'
                   ' Leland was taken back to the station and while under '
                   'control of BOB, he confe',
        'score': 0.8553578555583954},
    {   'answer': 'Benjamin Horne',
        'context': 'urdering Maddy just before she intended to go home.\n'
                   ' Two days later, Benjamin Horne had been arrested by the '
                   "sheriff's department, with Sheriff Truman",
        'score': 0.7564241290092468},
    {   'answer': 'Sarah',
        'context': "Laura's murder\n"
                   ' Sarah stood in her kitchen the next morning, February 24, '
                   '1989.\n'
                   ' She impatiently called for her daughter to wake up, but '
                   'received no a',
        'score': 0.2567792162299156},
    {   'answer': 'Sarah',
     

## Question generation (to be refined)

In [None]:
from haystack.pipelines import QuestionGenerationPipeline
from haystack.nodes import QuestionGenerator
from haystack.utils import launch_es, print_questions

In [None]:
question_generator = QuestionGenerator()
question_generation_pipeline = QuestionGenerationPipeline(question_generator)


INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1


In [None]:
for idx, document in enumerate(document_store):
        print(f"\n * Generating questions for document {idx}: {document.content[:5]}...\n")
        result = question_generation_pipeline.run(documents=[document])
        print_questions(result)
        if idx==10: break


 * Generating questions for document 0: Zen a...



  next_indices = next_tokens // vocab_size



Generated questions:
 -  Who wrote the book Zen and the Art of Motorcycle Maintenance?
 -  Where was a copy of the book kept?
 -  What year was Zen and the Art of Motorcycle Maintenance published?
 -  Who wrote the book?

 * Generating questions for document 1: Not t...


Generated questions:
 -  What is the fourteenth episode of Twin Peaks?
 -  When did Part 14 air?
 -  Where is Cole in Buckhorn?
 -  What is Lucy Brennan's name?
 -  Who recognizes Cole's voice and connects him to Frank?
 -  Who explains his brother's whereabouts and tells him about the pages from Laura Palmer's diary?
 -  Who was arrested for murdering a doppelgänger of herself?
 -  What was the name of the first Blue Rose case?
 -  Who arrested Lois Duffy?
 -  What does Tammy deduce a blue rose is?
 -  What does Cole ask if Cooper mentioned Major Garland Briggs the last time they saw each other?
 -  What was the last time they saw each other?
 -  What did Diane recognize as belonging to her half-sister?
 -  Who is m