DrDominikDellermann and nickmuchi committed commit 5fb0891 • 0 parent(s)

Duplicate from nickmuchi/Earnings-Call-Analysis-Whisperer

Co-authored-by: Nicholas Muchinguri <nickmuchi@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,31 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
01_🏠_Home.py ADDED
@@ -0,0 +1,72 @@
+ import whisper
+ import os
+ import pandas as pd
+ import plotly_express as px
+ import nltk
+ import plotly.graph_objects as go
+ from optimum.onnxruntime import ORTModelForSequenceClassification
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
+ import streamlit as st
+ import en_core_web_lg
+
+ nltk.download('punkt')
+
+ from nltk import sent_tokenize
+
+ auth_token = os.environ.get("auth_token")
+
+ st.sidebar.header("Home")
+
+ asr_model_options = ['tiny.en', 'base.en', 'small.en']
+
+ asr_model_name = st.sidebar.selectbox("Whisper Model Options", options=asr_model_options, key="sbox")
+
+ st.markdown("## Earnings Call Analysis Whisperer")
+
+ twitter_link = """
+ [![](https://img.shields.io/twitter/follow/nickmuchi?label=@nickmuchi&style=social)](https://twitter.com/nickmuchi)
+ """
+
+ st.markdown(twitter_link)
+
+ st.markdown(
+     """
+     This app assists finance analysts with transcribing and analysing Earnings Calls by carrying out the following tasks:
+     - Transcribing earnings calls using OpenAI's Whisper API; a 1hr call under 25MB takes roughly 3 minutes to transcribe.
+     - Analysing the sentiment of the transcribed text using the quantized version of [FinBert-Tone](https://huggingface.co/nickmuchi/quantized-optimum-finbert-tone).
+     - Summarising the call with the [philschmid/flan-t5-base-samsum](https://huggingface.co/philschmid/flan-t5-base-samsum) model, with entity extraction.
+     - Question-Answering search engine powered by LangChain and [Sentence Transformers](https://huggingface.co/sentence-transformers/all-mpnet-base-v2).
+     - Knowledge-graph generation using the [Babelscape/rebel-large](https://huggingface.co/Babelscape/rebel-large) model.
+
+     **👇 Enter a YouTube Earnings Call URL below and navigate to the sidebar tabs**
+
+     """
+ )
+
+ if 'sbox' not in st.session_state:
+     st.session_state.sbox = asr_model_name
+
+ if "earnings_passages" not in st.session_state:
+     st.session_state["earnings_passages"] = ''
+
+ if "sen_df" not in st.session_state:
+     st.session_state['sen_df'] = ''
+
+ url_input = st.text_input(
+     label="Enter YouTube URL, example below is McDonalds Earnings Call Q1 2023",
+     value="https://www.youtube.com/watch?v=4p6o5kkZYyA")
+
+ if 'url' not in st.session_state:
+     st.session_state['url'] = ""
+
+ st.session_state['url'] = url_input
+
+ st.markdown(
+     "<h3 style='text-align: center; color: red;'>OR</h3>",
+     unsafe_allow_html=True
+ )
+
+ upload_wav = st.file_uploader("Upload a .wav/.mp3/.mp4 audio file", key="upload", type=['.wav', '.mp3', '.mp4'])
+
+ st.markdown("![visitors](https://visitor-badge.glitch.me/badge?page_id=nickmuchi.earnings-call-whisperer)")
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Earnings Call Analysis Whisperer
+ emoji: 📞
+ colorFrom: blue
+ colorTo: gray
+ sdk: streamlit
+ sdk_version: 1.19.0
+ app_file: 01_🏠_Home.py
+ pinned: false
+ duplicated_from: nickmuchi/Earnings-Call-Analysis-Whisperer
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
download.wav ADDED
Binary file (36 kB)
 
functions.py ADDED
@@ -0,0 +1,952 @@
+ import whisper
+ import os
+ import random
+ import openai
+ import yt_dlp
+ from pytube import YouTube, extract
+ import pandas as pd
+ import plotly_express as px
+ import nltk
+ import plotly.graph_objects as go
+ from optimum.onnxruntime import ORTModelForSequenceClassification
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification, AutoModelForSeq2SeqLM
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
+ import streamlit as st
+ import en_core_web_lg
+ import validators
+ import re
+ import itertools
+ import numpy as np
+ from bs4 import BeautifulSoup
+ import base64, time
+ from annotated_text import annotated_text
+ import pickle, math
+ import wikipedia
+ from pyvis.network import Network
+ import torch
+ from pydub import AudioSegment
+ from newspaper import Article, ArticleException  # used by the knowledge-graph helpers below; missing from the original imports
+ from GoogleNews import GoogleNews  # used by get_news_links below; missing from the original imports
+ from langchain.docstore.document import Document
+ from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.chat_models import ChatOpenAI
+ from langchain.callbacks import StdOutCallbackHandler
+ from langchain.chains import ConversationalRetrievalChain, QAGenerationChain, LLMChain
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains.question_answering import load_qa_chain
+ from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT
+
+ from langchain.prompts.chat import (
+     ChatPromptTemplate,
+     SystemMessagePromptTemplate,
+     AIMessagePromptTemplate,
+     HumanMessagePromptTemplate,
+ )
+ from langchain.schema import (
+     AIMessage,
+     HumanMessage,
+     SystemMessage
+ )
+
+ from langchain.prompts import PromptTemplate
+
+ nltk.download('punkt')
+
+
+ from nltk import sent_tokenize
+
+ OPEN_AI_KEY = os.environ.get('OPEN_AI_KEY')
+ time_str = time.strftime("%d%m%Y-%H%M%S")
+ HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
+ margin-bottom: 2.5rem">{}</div> """
+
+ memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True, output_key='answer')
+
+
+ # Stuff chain-type prompt template
+
+ @st.cache_data
+ def load_prompt():
+
+     system_template = """Use only the following pieces of earnings context to answer the user's question accurately.
+     Do not use any information not provided in the earnings context and remember you are to speak like a finance expert.
+     If you don't know the answer, just say 'There is no relevant answer in the given earnings call transcript',
+     don't try to make up an answer.
+
+     ALWAYS return a "SOURCES" part in your answer.
+     The "SOURCES" part should be a reference to the source of the document from which you got your answer.
+
+     Remember, do not reference any information not given in the context.
+
+     If the answer is not available in the given context just say 'There is no relevant answer in the given earnings call transcript'.
+
+     Follow the below format when answering:
+
+     Question: {question}
+     SOURCES: [xyz]
+
+     Begin!
+     ----------------
+     {context}"""
+
+     messages = [
+         SystemMessagePromptTemplate.from_template(system_template),
+         HumanMessagePromptTemplate.from_template("{question}")
+     ]
+     prompt = ChatPromptTemplate.from_messages(messages)
+
+     return prompt
+
+ ###################### Functions #######################################################################################
+
+ # @st.cache_data
+ # def get_yt_audio(url):
+ #     temp_audio_file = os.path.join('output', 'audio')
+
+ #     ydl_opts = {
+ #         'format': 'bestaudio/best',
+ #         'postprocessors': [{
+ #             'key': 'FFmpegExtractAudio',
+ #             'preferredcodec': 'mp3',
+ #             'preferredquality': '192',
+ #         }],
+ #         'outtmpl': temp_audio_file,
+ #         'quiet': True,
+ #     }
+
+ #     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+
+ #         info = ydl.extract_info(url, download=False)
+ #         title = info.get('title', None)
+ #         ydl.download([url])
+
+ #     #with open(temp_audio_file+'.mp3', 'rb') as file:
+ #     audio_file = os.path.join('output', 'audio.mp3')
+
+ #     return audio_file, title
+
+ # Load all required models and cache them
+ @st.cache_resource
+ def load_models():
+
+     '''Load and cache all the models to be used'''
+     q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
+     ner_model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
+     kg_model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")
+     kg_tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
+     q_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
+     ner_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
+     emb_tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-xl')
+     sent_pipe = pipeline("text-classification", model=q_model, tokenizer=q_tokenizer)
+     sum_pipe = pipeline("summarization", model="philschmid/flan-t5-base-samsum", clean_up_tokenization_spaces=True)
+     ner_pipe = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, grouped_entities=True)
+     cross_encoder = CrossEncoder('cross-encoder/mmarco-mMiniLMv2-L12-H384-v1')  # alternative: cross-encoder/ms-marco-MiniLM-L-12-v2
+     sbert = SentenceTransformer('all-MiniLM-L6-v2')
+
+     return sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer, sbert
+
+ @st.cache_resource
+ def get_spacy():
+     nlp = en_core_web_lg.load()
+     return nlp
+
+ nlp = get_spacy()
+
+ sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer, sbert = load_models()
+
+ @st.cache_data
+ def get_yt_audio(url):
+
+     '''Get YT video from the given URL link'''
+     yt = YouTube(url)
+
+     title = yt.title
+
+     # Get the first available progressive mp4 stream and download it
+     audio_stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
+
+     return audio_stream, title
+
+ @st.cache_data
+ def load_whisper_api(audio):
+
+     '''Transcribe YT audio to text using the OpenAI Whisper API'''
+     with open(audio, "rb") as file:
+         transcript = openai.Audio.translate("whisper-1", file)
+
+     return transcript
+
+ @st.cache_data
+ def load_asr_model(model_name):
+
+     '''Load the open-source Whisper model for cases where the API is not working'''
+     model = whisper.load_model(model_name)
+
+     return model
+
+ @st.cache_data
+ def inference(link, _upload, _asr_model):
+     '''Convert a YouTube video or audio upload to text'''
+
+     title = "Transcribed Earnings Audio"  # fallback title in case the download fails before a title is set
+
+     try:
+
+         if validators.url(link):
+
+             st.info("`Downloading YT audio...`")
+
+             audio_file, title = get_yt_audio(link)
+
+             print(f'audio_file:{audio_file}')
+
+             st.session_state['audio'] = audio_file
+
+             print(f"audio_file_session_state:{st.session_state['audio']}")
+
+             # Get the size of the audio file in MB
+             audio_size = round(os.path.getsize(st.session_state['audio'])/(1024*1024), 1)
+
+             # The Whisper API accepts files up to 25MB; chunk anything larger
+             if audio_size <= 25:
+
+                 st.info("`Transcribing YT audio...`")
+
+                 # Use the Whisper API
+                 results = load_whisper_api(st.session_state['audio'])['text']
+
+             else:
+
+                 st.warning('File size larger than 25MB, applying chunking and transcription', icon="⚠️")
+
+                 song = AudioSegment.from_file(st.session_state['audio'], format='mp4')
+
+                 # PyDub handles time in milliseconds
+                 twenty_minutes = 20 * 60 * 1000
+
+                 chunks = song[::twenty_minutes]
+
+                 transcriptions = []
+
+                 video_id = extract.video_id(link)
+                 for i, chunk in enumerate(chunks):
+                     chunk.export(f'output/chunk_{i}_{video_id}.mp4', format='mp4')
+                     transcriptions.append(load_whisper_api(f'output/chunk_{i}_{video_id}.mp4')['text'])
+
+                 results = ','.join(transcriptions)
+
+             st.info("`YT Video transcription process complete...`")
+
+             return results, title
+
+         elif _upload:
+
+             # Get the size of the audio file in MB
+             audio_size = round(os.path.getsize(_upload)/(1024*1024), 1)
+
+             # The Whisper API accepts files up to 25MB; chunk anything larger
+             if audio_size <= 25:
+
+                 st.info("`Transcribing uploaded audio...`")
+
+                 # Use the Whisper API
+                 results = load_whisper_api(_upload)['text']
+
+             else:
+
+                 st.write('File size larger than 25MB, applying chunking and transcription')
+
+                 song = AudioSegment.from_file(_upload)
+
+                 # PyDub handles time in milliseconds
+                 twenty_minutes = 20 * 60 * 1000
+
+                 chunks = song[::twenty_minutes]
+
+                 transcriptions = []
+
+                 st.info("`Transcribing uploaded audio...`")
+
+                 for i, chunk in enumerate(chunks):
+                     chunk.export(f'output/chunk_{i}.mp4', format='mp4')
+                     transcriptions.append(load_whisper_api(f'output/chunk_{i}.mp4')['text'])
+
+                 results = ','.join(transcriptions)
+
+             st.info("`Uploaded audio transcription process complete...`")
+
+             return results, "Transcribed Earnings Audio"
+
+     except Exception as e:
+
+         st.error(f'''Whisper API Error: {e},
+                  using the open-source Whisper model instead, which might take longer than expected''', icon="🚨")
+
+         results = _asr_model.transcribe(st.session_state['audio'], task='transcribe', language='en')
+
+         return results['text'], title
+
+ @st.cache_data
+ def clean_text(text):
+     '''Clean all text after inference'''
+
+     text = text.encode("ascii", "ignore").decode()  # strip non-ascii characters
+     text = re.sub(r"https*\S+", " ", text)  # urls
+     text = re.sub(r"@\S+", " ", text)  # mentions
+     text = re.sub(r"#\S+", " ", text)  # hashtags
+     text = re.sub(r"\s{2,}", " ", text)  # extra spaces
+
+     return text
+
+ @st.cache_data
+ def chunk_long_text(text, threshold, window_size=3, stride=2):
+     '''Preprocess text and chunk it for sentiment analysis'''
+
+     # Convert cleaned text into sentences
+     sentences = sent_tokenize(text)
+     out = []
+
+     # Limit the length of each sentence to a threshold
+     for chunk in sentences:
+         if len(chunk.split()) < threshold:
+             out.append(chunk)
+         else:
+             words = chunk.split()
+             num = int(len(words)/threshold)
+             for i in range(0, num*threshold+1, threshold):
+                 out.append(' '.join(words[i:threshold+i]))
+
+     passages = []
+
+     # Combine sentences into windows of size window_size
+     for paragraph in [out]:
+         for start_idx in range(0, len(paragraph), stride):
+             end_idx = min(start_idx+window_size, len(paragraph))
+             passages.append(" ".join(paragraph[start_idx:end_idx]))
+
+     return passages
+
+ @st.cache_data
+ def sentiment_pipe(earnings_text):
+     '''Determine the sentiment of the text'''
+
+     earnings_sentences = chunk_long_text(earnings_text, 150, 1, 1)
+     earnings_sentiment = sent_pipe(earnings_sentences)
+
+     return earnings_sentiment, earnings_sentences
+
+ @st.cache_data
+ def chunk_and_preprocess_text(text, model_name='philschmid/flan-t5-base-samsum'):
+
+     '''Chunk and preprocess text for summarization'''
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     sentences = sent_tokenize(text)
+
+     # initialize
+     length = 0
+     chunk = ""
+     chunks = []
+
+     for sentence in sentences:
+         combined_length = len(tokenizer.tokenize(sentence)) + length  # add the sentence's token count to the length counter
+
+         if combined_length <= tokenizer.max_len_single_sentence:  # if it doesn't exceed the model's limit
+             chunk += sentence + " "  # add the sentence to the chunk
+             length = combined_length  # update the length counter
+         else:
+             chunks.append(chunk)  # save the chunk
+             # start a new chunk with the overflow sentence
+             chunk = sentence + " "
+             length = len(tokenizer.tokenize(sentence))
+
+     # save the final chunk (the original version dropped it when the last sentence overflowed)
+     if chunk:
+         chunks.append(chunk)
+
+     return chunks
+
+ @st.cache_data
+ def summarize_text(text_to_summarize, max_len, min_len):
+     '''Summarize text with the HF summarization pipeline'''
+
+     summarized_text = sum_pipe(text_to_summarize,
+                                max_length=max_len,
+                                min_length=min_len,
+                                do_sample=False,
+                                early_stopping=True,
+                                num_beams=4)
+     summarized_text = ' '.join([summ['summary_text'] for summ in summarized_text])
+
+     return summarized_text
+
+ @st.cache_data
+ def get_all_entities_per_sentence(text):
+     doc = nlp(''.join(text))
+
+     sentences = list(doc.sents)
+
+     entities_all_sentences = []
+     for sentence in sentences:
+         entities_this_sentence = []
+
+         # spaCy entities
+         for entity in sentence.ents:
+             entities_this_sentence.append(str(entity))
+
+         # XLM-RoBERTa entities
+         entities_xlm = [entity["word"] for entity in ner_pipe(str(sentence))]
+         for entity in entities_xlm:
+             entities_this_sentence.append(str(entity))
+
+         entities_all_sentences.append(entities_this_sentence)
+
+     return entities_all_sentences
+
+ @st.cache_data
+ def get_all_entities(text):
+     all_entities_per_sentence = get_all_entities_per_sentence(text)
+     return list(itertools.chain.from_iterable(all_entities_per_sentence))
+
+ @st.cache_data
+ def get_and_compare_entities(article_content, summary_output):
+
+     all_entities_per_sentence = get_all_entities_per_sentence(article_content)
+     entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))
+
+     all_entities_per_sentence = get_all_entities_per_sentence(summary_output)
+     entities_summary = list(itertools.chain.from_iterable(all_entities_per_sentence))
+
+     matched_entities = []
+     unmatched_entities = []
+     for entity in entities_summary:
+         if any(entity.lower() in substring_entity.lower() for substring_entity in entities_article):
+             matched_entities.append(entity)
+         elif any(
+                 np.inner(sbert.encode(entity, show_progress_bar=False),
+                          sbert.encode(art_entity, show_progress_bar=False)) > 0.9 for
+                 art_entity in entities_article):
+             matched_entities.append(entity)
+         else:
+             unmatched_entities.append(entity)
+
+     matched_entities = list(dict.fromkeys(matched_entities))
+     unmatched_entities = list(dict.fromkeys(unmatched_entities))
+
+     matched_entities_to_remove = []
+     unmatched_entities_to_remove = []
+
+     for entity in matched_entities:
+         for substring_entity in matched_entities:
+             if entity != substring_entity and entity.lower() in substring_entity.lower():
+                 matched_entities_to_remove.append(entity)
+
+     for entity in unmatched_entities:
+         for substring_entity in unmatched_entities:
+             if entity != substring_entity and entity.lower() in substring_entity.lower():
+                 unmatched_entities_to_remove.append(entity)
+
+     matched_entities_to_remove = list(dict.fromkeys(matched_entities_to_remove))
+     unmatched_entities_to_remove = list(dict.fromkeys(unmatched_entities_to_remove))
+
+     for entity in matched_entities_to_remove:
+         matched_entities.remove(entity)
+     for entity in unmatched_entities_to_remove:
+         unmatched_entities.remove(entity)
+
+     return matched_entities, unmatched_entities
+
+ @st.cache_data
+ def highlight_entities(article_content, summary_output):
+
+     markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
+     markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
+     markdown_end = "</mark>"
+
+     matched_entities, unmatched_entities = get_and_compare_entities(article_content, summary_output)
+
+     for entity in matched_entities:
+         summary_output = re.sub(f'({entity})(?![^rgb\(]*\))', markdown_start_green + entity + markdown_end, summary_output)
+
+     for entity in unmatched_entities:
+         summary_output = re.sub(f'({entity})(?![^rgb\(]*\))', markdown_start_red + entity + markdown_end, summary_output)
+
+     soup = BeautifulSoup(summary_output, features="html.parser")
+
+     return HTML_WRAPPER.format(soup)
+
+ def summary_downloader(raw_text):
+     '''Download the generated summary'''
+
+     b64 = base64.b64encode(raw_text.encode()).decode()
+     new_filename = "new_text_file_{}_.txt".format(time_str)
+     st.markdown("#### Download Summary as a File")
+     href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
+     st.markdown(href, unsafe_allow_html=True)
+
+ @st.cache_data
+ def generate_eval(raw_text, N, chunk):
+
+     # Generate N questions, each drawn from a chunk of `chunk` characters of the context
+     # IN: text, N questions, chunk size to draw each question from
+     # OUT: eval set as a JSON list
+
+     # raw_text = ','.join(raw_text)
+
+     update = st.empty()
+     ques_update = st.empty()
+     update.info("`Generating sample questions ...`")
+     n = len(raw_text)
+     starting_indices = [random.randint(0, n-chunk) for _ in range(N)]
+     sub_sequences = [raw_text[i:i+chunk] for i in starting_indices]
+     chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0))
+     eval_set = []
+
+     for i, b in enumerate(sub_sequences):
+         try:
+             qa = chain.run(b)
+             eval_set.append(qa)
+             ques_update.info(f"Creating Question: {i+1}")
+
+         except Exception as e:
+             print(e)
+             st.warning(f'Error in generating Question: {i+1}...', icon="⚠️")
+             continue
+
+     eval_set_full = list(itertools.chain.from_iterable(eval_set))
+
+     update.empty()
+     ques_update.empty()
+
+     return eval_set_full
+
+ @st.cache_resource
+ def gen_embeddings(embedding_model):
+
+     '''Generate embeddings for the given model'''
+
+     if 'hkunlp' in embedding_model:
+
+         embeddings = HuggingFaceInstructEmbeddings(model_name=embedding_model,
+                                                    query_instruction='Represent the Financial question for retrieving supporting paragraphs: ',
+                                                    embed_instruction='Represent the Financial paragraph for retrieval: ')
+
+     else:
+
+         embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
+
+     return embeddings
+
+ @st.cache_data
+ def process_corpus(corpus, title, embedding_model, chunk_size=1000, overlap=50):
+
+     '''Process text for semantic search'''
+
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
+
+     texts = text_splitter.split_text(corpus)
+
+     embeddings = gen_embeddings(embedding_model)
+
+     vectorstore = FAISS.from_texts(texts, embeddings, metadatas=[{"source": i} for i in range(len(texts))])
+
+     return vectorstore
+
+ def embed_text(query, _docsearch):
+
+     '''Embed text and generate semantic search scores'''
+
+     # llm = OpenAI(temperature=0)
+     chat_llm = ChatOpenAI(streaming=True,
+                           model_name='gpt-4',
+                           callbacks=[StdOutCallbackHandler()],
+                           verbose=True,
+                           temperature=0
+                           )
+
+     # chain = RetrievalQA.from_chain_type(llm=chat_llm, chain_type="stuff",
+     #                                     retriever=_docsearch.as_retriever(),
+     #                                     return_source_documents=True)
+
+     question_generator = LLMChain(llm=chat_llm, prompt=CONDENSE_QUESTION_PROMPT)
+     doc_chain = load_qa_chain(llm=chat_llm, chain_type="stuff", prompt=load_prompt())
+     chain = ConversationalRetrievalChain(retriever=_docsearch.as_retriever(search_kwargs={"k": 3}),  # fixed typo: was search_kwags
+                                          question_generator=question_generator,
+                                          combine_docs_chain=doc_chain,
+                                          memory=memory,
+                                          return_source_documents=True,
+                                          get_chat_history=lambda h: h)
+
+     answer = chain({"question": query})
+
+     return answer
+
+ @st.cache_data
+ def gen_sentiment(text):
+     '''Generate the sentiment of the given text'''
+     return sent_pipe(text)[0]['label']
+
+ @st.cache_data
+ def gen_annotated_text(df):
+     '''Generate annotated text'''
+
+     tag_list = []
+     for row in df.itertuples():
+         label = row[2]
+         text = row[1]
+         if label == 'Positive':
+             tag_list.append((text, label, '#8fce00'))
+         elif label == 'Negative':
+             tag_list.append((text, label, '#f44336'))
+         else:
+             tag_list.append((text, label, '#000000'))
+
+     return tag_list
+
+
+ def display_df_as_table(model, top_k, score='score'):
+     '''Display the df with text and scores as a table (expects a module-level `passages` list)'''
+
+     df = pd.DataFrame([(hit[score], passages[hit['corpus_id']]) for hit in model[0:top_k]], columns=['Score', 'Text'])
+     df['Score'] = round(df['Score'], 2)
+
+     return df
+
+
+ def make_spans(text, results):
+     # NOTE: relies on a `sent_tokenizer` callable that is not defined in this file
+     results_list = []
+     for i in range(len(results)):
+         results_list.append(results[i]['label'])
+     facts_spans = []
+     facts_spans = list(zip(sent_tokenizer(text), results_list))
+     return facts_spans
+
+ ## Fiscal sentiment by sentence
+ def fin_ext(text):
+     # NOTE: relies on a `remote_clx` classifier that is not defined in this file
+     results = remote_clx(sent_tokenizer(text))
+     return make_spans(text, results)
+
+ ## Knowledge Graphs code
+
+ @st.cache_data
+ def extract_relations_from_model_output(text):
+     relations = []
+     relation, subject, object_ = '', '', ''
+     text = text.strip()
+     current = 'x'
+     text_replaced = text.replace("<s>", "").replace("<pad>", "").replace("</s>", "")
+     # REBEL emits triplets as: <triplet> subject <subj> object <obj> relation
+     for token in text_replaced.split():
+         if token == "<triplet>":
+             current = 't'
+             if relation != '':
+                 relations.append({
+                     'head': subject.strip(),
+                     'type': relation.strip(),
+                     'tail': object_.strip()
+                 })
+                 relation = ''
+             subject = ''
+         elif token == "<subj>":
+             current = 's'
+             if relation != '':
+                 relations.append({
+                     'head': subject.strip(),
+                     'type': relation.strip(),
+                     'tail': object_.strip()
+                 })
+             object_ = ''
+         elif token == "<obj>":
+             current = 'o'
+             relation = ''
+         else:
+             if current == 't':
+                 subject += ' ' + token
+             elif current == 's':
+                 object_ += ' ' + token
+             elif current == 'o':
+                 relation += ' ' + token
+     if subject != '' and relation != '' and object_ != '':
+         relations.append({
+             'head': subject.strip(),
+             'type': relation.strip(),
+             'tail': object_.strip()
+         })
+     return relations
+
+ def from_text_to_kb(text, model, tokenizer, article_url, span_length=128, article_title=None,
+                     article_publish_date=None, verbose=False):
+     # tokenize the whole text
+     inputs = tokenizer([text], return_tensors="pt")
+
+     # compute span boundaries
+     num_tokens = len(inputs["input_ids"][0])
+     if verbose:
+         print(f"Input has {num_tokens} tokens")
+     num_spans = math.ceil(num_tokens / span_length)
+     if verbose:
+         print(f"Input has {num_spans} spans")
+     overlap = math.ceil((num_spans * span_length - num_tokens) /
+                         max(num_spans - 1, 1))
+     spans_boundaries = []
+     start = 0
+     for i in range(num_spans):
+         spans_boundaries.append([start + span_length * i,
+                                  start + span_length * (i + 1)])
+         start -= overlap
+     if verbose:
+         print(f"Span boundaries are {spans_boundaries}")
+
+     # transform input with spans
+     tensor_ids = [inputs["input_ids"][0][boundary[0]:boundary[1]]
+                   for boundary in spans_boundaries]
+     tensor_masks = [inputs["attention_mask"][0][boundary[0]:boundary[1]]
+                     for boundary in spans_boundaries]
+     inputs = {
+         "input_ids": torch.stack(tensor_ids),
+         "attention_mask": torch.stack(tensor_masks)
+     }
+
+     # generate relations
+     num_return_sequences = 3
+     gen_kwargs = {
+         "max_length": 256,
+         "length_penalty": 0,
+         "num_beams": 3,
+         "num_return_sequences": num_return_sequences
+     }
+     generated_tokens = model.generate(
+         **inputs,
+         **gen_kwargs,
+     )
+
+     # decode relations
+     decoded_preds = tokenizer.batch_decode(generated_tokens,
+                                            skip_special_tokens=False)
+
+     # create the knowledge base
+     kb = KB()
+     i = 0
+     for sentence_pred in decoded_preds:
+         current_span_index = i // num_return_sequences
+         relations = extract_relations_from_model_output(sentence_pred)
+         for relation in relations:
+             relation["meta"] = {
+                 article_url: {
+                     "spans": [spans_boundaries[current_span_index]]
+                 }
+             }
+             kb.add_relation(relation, article_title, article_publish_date)
+         i += 1
+
+     return kb
+
+ def get_article(url):
+     article = Article(url)
+     article.download()
+     article.parse()
+     return article
+
+ def from_url_to_kb(url, model, tokenizer):
+     article = get_article(url)
+     config = {
+         "article_title": article.title,
+         "article_publish_date": article.publish_date
+     }
+     kb = from_text_to_kb(article.text, model, tokenizer, article.url, **config)
+     return kb
+
+ def get_news_links(query, lang="en", region="US", pages=1):
+     googlenews = GoogleNews(lang=lang, region=region)
+     googlenews.search(query)
+     all_urls = []
+     for page in range(pages):
+         googlenews.get_page(page)
+         all_urls += googlenews.get_links()
+     return list(set(all_urls))
+
+ def from_urls_to_kb(urls, model, tokenizer, verbose=False):
+     kb = KB()
+     if verbose:
+         print(f"{len(urls)} links to visit")
+     for url in urls:
+         if verbose:
+             print(f"Visiting {url}...")
+         try:
+             kb_url = from_url_to_kb(url, model, tokenizer)
+             kb.merge_with_kb(kb_url)
+         except ArticleException:
+             if verbose:
+                 print(f"  Couldn't download article at url {url}")
+     return kb
+
+ def save_network_html(kb, filename="network.html"):
+     # create network
+     net = Network(directed=True, width="700px", height="700px")
+
+     # nodes
+     color_entity = "#00FF00"
+     for e in kb.entities:
+         net.add_node(e, shape="circle", color=color_entity)
+
+     # edges
+     for r in kb.relations:
+         net.add_edge(r["head"], r["tail"],
+                      title=r["type"], label=r["type"])
+
+     # save network
+     net.repulsion(
+         node_distance=200,
+         central_gravity=0.2,
+         spring_length=200,
+         spring_strength=0.05,
+         damping=0.09
+     )
+     net.set_edge_smooth('dynamic')
+     net.show(filename)
+
+ def save_kb(kb, filename):
+     with open(filename, "wb") as f:
+         pickle.dump(kb, f)
+
+ class CustomUnpickler(pickle.Unpickler):
+     def find_class(self, module, name):
+         if name == 'KB':
+             return KB
+         return super().find_class(module, name)
+
+ def load_kb(filename):
+     res = None
+     with open(filename, "rb") as f:
+         res = CustomUnpickler(f).load()
+     return res
+
+ class KB():
+     def __init__(self):
+         self.entities = {}  # { entity_title: {...} }
+         self.relations = []  # [ head: entity_title, type: ..., tail: entity_title,
+                              #   meta: { article_url: { spans: [...] } } ]
+         self.sources = {}  # { article_url: {...} }
+
+     def merge_with_kb(self, kb2):
+         for r in kb2.relations:
+             article_url = list(r["meta"].keys())[0]
+             source_data = kb2.sources[article_url]
+             self.add_relation(r, source_data["article_title"],
+                               source_data["article_publish_date"])
+
+     def are_relations_equal(self, r1, r2):
+         return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])
+
+     def exists_relation(self, r1):
+         return any(self.are_relations_equal(r1, r2) for r2 in self.relations)
+
+     def merge_relations(self, r2):
+         r1 = [r for r in self.relations
+               if self.are_relations_equal(r2, r)][0]
+
+         # if different article
+         article_url = list(r2["meta"].keys())[0]
+         if article_url not in r1["meta"]:
+             r1["meta"][article_url] = r2["meta"][article_url]
+
+         # if existing article
+         else:
+             spans_to_add = [span for span in r2["meta"][article_url]["spans"]
+                             if span not in r1["meta"][article_url]["spans"]]
+             r1["meta"][article_url]["spans"] += spans_to_add
+
+     def get_wikipedia_data(self, candidate_entity):
+         try:
+             page = wikipedia.page(candidate_entity, auto_suggest=False)
+             entity_data = {
+                 "title": page.title,
+                 "url": page.url,
+                 "summary": page.summary
+             }
+             return entity_data
+         except Exception:
+             return None
+
+     def add_entity(self, e):
+         self.entities[e["title"]] = {k: v for k, v in e.items() if k != "title"}
+
+     def add_relation(self, r, article_title, article_publish_date):
+         # check on wikipedia
+         candidate_entities = [r["head"], r["tail"]]
+         entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]
+
+         # if one entity does not exist, stop
+         if any(ent is None for ent in entities):
+             return
+
+         # manage new entities
+         for e in entities:
+             self.add_entity(e)
+
+         # rename relation entities with their wikipedia titles
+         r["head"] = entities[0]["title"]
+         r["tail"] = entities[1]["title"]
+
+         # add source if not in kb
+         article_url = list(r["meta"].keys())[0]
+         if article_url not in self.sources:
+             self.sources[article_url] = {
+                 "article_title": article_title,
+                 "article_publish_date": article_publish_date
+             }
+
+         # manage new relation
+         if not self.exists_relation(r):
+             self.relations.append(r)
+         else:
+             self.merge_relations(r)
+
+     def get_textual_representation(self):
+         res = ""
+         res += "### Entities\n"
+         for e in self.entities.items():
+             # shorten the summary
+             e_temp = (e[0], {k: (v[:100] + "..." if k == "summary" else v) for k, v in e[1].items()})
+             res += f"- {e_temp}\n"
+         res += "\n"
+         res += "### Relations\n"
+         for r in self.relations:
+             res += f"- {r}\n"
+         res += "\n"
+         res += "### Sources\n"
+         for s in self.sources.items():
+             res += f"- {s}\n"
+         return res
+
+ # NOTE: this second definition overrides the save_network_html above; it only adds a background colour
+ def save_network_html(kb, filename="network.html"):
+     # create network
+     net = Network(directed=True, width="700px", height="700px", bgcolor="#eeeeee")
+
+     # nodes
+     color_entity = "#00FF00"
+     for e in kb.entities:
+         net.add_node(e, shape="circle", color=color_entity)
+
+     # edges
+     for r in kb.relations:
+         net.add_edge(r["head"], r["tail"],
+                      title=r["type"], label=r["type"])
+
+     # save network
+     net.repulsion(
+         node_distance=200,
+         central_gravity=0.2,
+         spring_length=200,
+         spring_strength=0.05,
+         damping=0.09
+     )
+     net.set_edge_smooth('dynamic')
+     net.show(filename)
output/audio.txt ADDED
File without changes
pages/1_Earnings_Sentiment_Analysis_📈_.py ADDED
@@ -0,0 +1,134 @@
+ import streamlit as st
+ import pandas as pd
+ import plotly_express as px
+ import plotly.graph_objects as go
+ from functions import *
+ import validators
+ import textwrap
+
+ #st.set_page_config(page_title="Earnings Sentiment Analysis", page_icon="📈")
+ st.sidebar.header("Sentiment Analysis")
+ st.markdown("## Earnings Sentiment Analysis with FinBert-Tone")
+
+ # load the whisper model
+ asr_model = load_asr_model(st.session_state.sbox)
+
+ if "url" not in st.session_state:
+     st.session_state.url = ''
+
+ if "title" not in st.session_state:
+     st.session_state.title = ''
+
+ try:
+
+     if st.session_state['url'] is not None or st.session_state['upload'] is not None:
+
+         results, title = inference(st.session_state.url, st.session_state.upload, asr_model)
+
+         print(f'results, page1: {results}')
+
+         st.subheader(title)
+
+         earnings_passages = clean_text(results)
+
+         st.session_state['earnings_passages'] = earnings_passages
+
+         st.session_state['title'] = title
+
+         earnings_sentiment, earnings_sentences = sentiment_pipe(earnings_passages)
+
+         with st.expander("See Transcribed Earnings Text"):
+             st.write(f"Number of Sentences: {len(earnings_sentences)}")
+
+             st.write(st.session_state['earnings_passages'])
+
+         ## Save to a dataframe for ease of visualization
+         sen_df = pd.DataFrame(earnings_sentiment)
+         sen_df['text'] = earnings_sentences
+         grouped = pd.DataFrame(sen_df['label'].value_counts()).reset_index()
+         grouped.columns = ['sentiment', 'count']
+
+         st.session_state['sen_df'] = sen_df
+
+         # Display the number of positive, negative and neutral sentiments
+         fig = px.bar(grouped, x='sentiment', y='count', color='sentiment',
+                      color_discrete_map={"Negative": "firebrick", "Neutral": "navajowhite", "Positive": "darkgreen"},
+                      title='Earnings Sentiment')
+
+         fig.update_layout(
+             showlegend=False,
+             autosize=True,
+             margin=dict(
+                 l=25,
+                 r=25,
+                 b=25,
+                 t=50,
+                 pad=2
+             )
+         )
+
+         st.plotly_chart(fig)
+
+         ## Display the sentiment score (assumes all three sentiment classes are present)
+         pos_perc = grouped[grouped['sentiment']=='Positive']['count'].iloc[0]*100/sen_df.shape[0]
+         neg_perc = grouped[grouped['sentiment']=='Negative']['count'].iloc[0]*100/sen_df.shape[0]
+         neu_perc = grouped[grouped['sentiment']=='Neutral']['count'].iloc[0]*100/sen_df.shape[0]
+
+         sentiment_score = neu_perc+pos_perc-neg_perc
+
+         fig_1 = go.Figure()
+
+         fig_1.add_trace(go.Indicator(
+             mode="delta",
+             value=sentiment_score,
+             domain={'row': 1, 'column': 1}))
+
+         fig_1.update_layout(
+             template={'data': {'indicator': [{
+                 'title': {'text': "Sentiment Score"},
+                 'mode': "number+delta+gauge",
+                 'delta': {'reference': 50}}]
+             }},
+             autosize=False,
+             width=250,
+             height=250,
+             margin=dict(
+                 l=5,
+                 r=5,
+                 b=5,
+                 pad=2
+             )
+         )
+
+         with st.sidebar:
+
+             st.plotly_chart(fig_1)
+
+         hd = sen_df.text.apply(lambda txt: '<br>'.join(textwrap.wrap(txt, width=70)))
+         ## Display negative sentence locations
+         fig = px.scatter(sen_df, y='label', color='label', size='score', hover_data=[hd],
+                          color_discrete_map={"Negative": "firebrick", "Neutral": "navajowhite", "Positive": "darkgreen"},
+                          title='Sentiment Score Distribution')
+
+         fig.update_layout(
+             showlegend=False,
+             autosize=True,
+             width=800,
+             height=500,
+             margin=dict(
+                 b=5,
+                 t=50,
+                 pad=4
+             )
+         )
+
+         st.plotly_chart(fig)
+
+     else:
+
+         st.write("No YouTube URL or file upload detected")
+
+ except (AttributeError, TypeError):
+
+     st.write("No YouTube URL or file upload detected")
pages/2_Earnings_Summarization_📖_.py ADDED
@@ -0,0 +1,51 @@
+ import streamlit as st
+ from functions import *
+
+ # st.set_page_config(page_title="Earnings Summarization", page_icon="📖")
+ st.sidebar.header("Earnings Summarization")
+ st.markdown("## Earnings Summarization with Flan-T5-Base-Samsum")
+
+ max_len = st.slider("Maximum length of the summarized text", min_value=70, max_value=200, step=10, value=100)
+ min_len = st.slider("Minimum length of the summarized text", min_value=20, max_value=200, step=10)
+
+ st.markdown("####")
+
+ st.subheader("Summarized Earnings Call with matched Entities")
+
+ if "earnings_passages" not in st.session_state:
+     st.session_state["earnings_passages"] = ''
+
+ if st.session_state['earnings_passages']:
+
+     with st.spinner("Summarizing and matching entities, this takes a few seconds..."):
+
+         try:
+             text_to_summarize = chunk_and_preprocess_text(st.session_state['earnings_passages'])
+             print(text_to_summarize)
+             summarized_text = summarize_text(text_to_summarize, max_len=max_len, min_len=min_len)
+
+         except IndexError:
+             # retry once; the summarization pipeline occasionally fails on edge-case chunks
+             text_to_summarize = chunk_and_preprocess_text(st.session_state['earnings_passages'])
+             summarized_text = summarize_text(text_to_summarize, max_len=max_len, min_len=min_len)
+
+         entity_match_html = highlight_entities(text_to_summarize, summarized_text)
+         st.markdown("####")
+
+         with st.expander(label='Summarized Earnings Call', expanded=True):
+             st.write(entity_match_html, unsafe_allow_html=True)
+
+         st.markdown("####")
+
+         summary_downloader(summarized_text)
+
+ else:
+     st.write("No text to summarize detected, please ensure you have entered the YouTube URL on the Sentiment Analysis page")
pages/3_Earnings_Semantic_Search_🔎_.py ADDED
@@ -0,0 +1,148 @@
+ import streamlit as st
+ from functions import *
+ from langchain.chains import QAGenerationChain
+ import itertools
+
+
+ st.set_page_config(page_title="Earnings Question/Answering", page_icon="🔎")
+
+ st.sidebar.header("Semantic Search")
+
+ st.markdown("## Earnings Semantic Search with LangChain, OpenAI & SBert")
+
+ st.markdown(
+     """
+     <style>
+
+     #MainMenu {visibility: hidden;
+     }
+     footer {visibility: hidden;
+     }
+     .css-card {
+         border-radius: 0px;
+         padding: 30px 10px 10px 10px;
+         background-color: black;
+         box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+         margin-bottom: 10px;
+         font-family: "IBM Plex Sans", sans-serif;
+     }
+
+     .card-tag {
+         border-radius: 0px;
+         padding: 1px 5px 1px 5px;
+         margin-bottom: 10px;
+         position: absolute;
+         left: 0px;
+         top: 0px;
+         font-size: 0.6rem;
+         font-family: "IBM Plex Sans", sans-serif;
+         color: white;
+         background-color: green;
+     }
+
+     .css-zt5igj {left: 0;
+     }
+
+     span.css-10trblm {margin-left: 0;
+     }
+
+     div.css-1kyxreq {margin-top: -40px;
+     }
+
+     </style>
+     """,
+     unsafe_allow_html=True,
+ )
+
+ bi_enc_dict = {'mpnet-base-v2': "all-mpnet-base-v2",
+                'instructor-base': 'hkunlp/instructor-base'}
+
+ search_input = st.text_input(
+     label='Enter Your Search Query', value="What key challenges did the business face?", key='search')
+
+ # use a distinct key so this selectbox doesn't clobber the Whisper model choice stored under 'sbox' on the Home page
+ sbert_model_name = st.sidebar.selectbox("Embedding Model", options=list(bi_enc_dict.keys()), key='embed_sbox')
+
+ st.sidebar.markdown('Earnings QnA Generator')
+
+ chunk_size = 1000
+ overlap_size = 50
+
+ try:
+
+     if search_input:
+
+         if "sen_df" in st.session_state and "earnings_passages" in st.session_state:
+
+             ## Pull the stored dataframe and transcript for ease of visualization
+             sen_df = st.session_state['sen_df']
+
+             title = st.session_state['title']
+
+             earnings_text = st.session_state['earnings_passages']
+
+             print(f'earnings_to_be_embedded:{earnings_text}')
+
+             st.session_state.eval_set = generate_eval(
+                 earnings_text, 10, 3000)
+
+             # Display the question-answer pairs in the sidebar with smaller text
+             for i, qa_pair in enumerate(st.session_state.eval_set):
+                 st.sidebar.markdown(
+                     f"""
+                     <div class="css-card">
+                     <span class="card-tag">Question {i + 1}</span>
+                         <p style="font-size: 12px;">{qa_pair['question']}</p>
+                         <p style="font-size: 12px;">{qa_pair['answer']}</p>
+                     </div>
+                     """,
+                     unsafe_allow_html=True,
+                 )
+
+             embedding_model = bi_enc_dict[sbert_model_name]
+
+             with st.spinner(
+                 text=f"Loading {embedding_model} embedding model and Generating Response..."
+             ):
+
+                 docsearch = process_corpus(earnings_text, title, embedding_model)
+
+                 result = embed_text(search_input, docsearch)
+
+             references = [doc.page_content for doc in result['source_documents']]
+
+             answer = result['answer']
+
+             sentiment_label = gen_sentiment(answer)
+
+             ##### Semantic Search #####
+
+             df = pd.DataFrame.from_dict({'Text': [answer], 'Sentiment': [sentiment_label]})
+
+             text_annotations = gen_annotated_text(df)[0]
+
+             with st.expander(label='Query Result', expanded=True):
+                 annotated_text(text_annotations)
+
+             with st.expander(label='References from Corpus used to Generate Result'):
+                 for ref in references:
+                     st.write(ref)
+
+         else:
+
+             st.write('Please ensure you have entered the YouTube URL or uploaded the Earnings Call file')
+
+     else:
+
+         st.write('Please ensure you have entered the YouTube URL or uploaded the Earnings Call file')
+
+ except RuntimeError:
+
+     st.write('Please ensure you have entered the YouTube URL or uploaded the Earnings Call file')
pages/4_Earnings_Knowledge_Graph_📈_.py ADDED
@@ -0,0 +1,30 @@
+ import streamlit as st
+ from pyvis.network import Network
+ from functions import *
+ import streamlit.components.v1 as components
+ import pickle, math
+
+ st.set_page_config(page_title="Earnings Knowledge Graph", page_icon="📈")
+ st.sidebar.header("Knowledge Graph")
+ st.markdown("## Earnings Knowledge Graph")
+
+ filename = "earnings_network.html"
+
+ if "earnings_passages" in st.session_state:
+
+     with st.spinner(text='Loading Babelscape/rebel-large, which can take a few minutes to generate the graph..'):
+
+         st.session_state.kb_text = from_text_to_kb(st.session_state['earnings_passages'], kg_model, kg_tokenizer, "", verbose=True)
+         save_network_html(st.session_state.kb_text, filename=filename)
+         st.session_state.kb_chart = filename
+
+     with st.container():
+         st.subheader("Generated Knowledge Graph")
+         st.markdown("*You can interact with the graph and zoom.*")
+         html_source_code = open(st.session_state.kb_chart, 'r', encoding='utf-8').read()
+         components.html(html_source_code, width=700, height=700)
+         # render the KB's textual summary rather than the object's repr
+         st.markdown(st.session_state.kb_text.get_textual_representation())
+
+ else:
+
+     st.write('No earnings text detected, please regenerate from the Home page..')
requirements.txt ADDED
@@ -0,0 +1,24 @@
+ torch
+ git+https://github.com/openai/whisper.git
+ sentence-transformers
+ transformers
+ InstructorEmbedding
+ optimum[onnxruntime]
+ yt-dlp
+ pydub
+ validators
+ nltk==3.7
+ plotly
+ plotly-express
+ spacy
+ spacy_streamlit
+ st-annotated-text
+ en_core_web_lg @ https://huggingface.co/spacy/en_core_web_lg/resolve/main/en_core_web_lg-any-py3-none-any.whl
+ bs4==0.0.1
+ wikipedia
+ pyvis
+ newspaper3k
+ GoogleNews
+ langchain==0.0.225
+ openai
+ faiss-cpu
+ altair<5
+ git+https://github.com/oncename/pytube.git
sentence-transformers/.DS_Store ADDED
Binary file (8.2 kB)
 
sentence-transformers/NOTICE.txt ADDED
@@ -0,0 +1,5 @@
+ -------------------------------------------------------------------------------
+ Copyright 2019
+ Ubiquitous Knowledge Processing (UKP) Lab
+ Technische Universität Darmstadt
+ -------------------------------------------------------------------------------
sentence-transformers/README.md ADDED
@@ -0,0 +1,182 @@
+ <!--- BADGES: START --->
+ [![GitHub - License](https://img.shields.io/github/license/UKPLab/sentence-transformers?logo=github&style=flat&color=green)][#github-license]
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/sentence-transformers?logo=pypi&style=flat&color=blue)][#pypi-package]
+ [![PyPI - Package Version](https://img.shields.io/pypi/v/sentence-transformers?logo=pypi&style=flat&color=orange)][#pypi-package]
+ [![Conda - Platform](https://img.shields.io/conda/pn/conda-forge/sentence-transformers?logo=anaconda&style=flat)][#conda-forge-package]
+ [![Conda (channel only)](https://img.shields.io/conda/vn/conda-forge/sentence-transformers?logo=anaconda&style=flat&color=orange)][#conda-forge-package]
+ [![Docs - GitHub.io](https://img.shields.io/static/v1?logo=github&style=flat&color=pink&label=docs&message=sentence-transformers)][#docs-package]
+ <!---
+ [![PyPI - Downloads](https://img.shields.io/pypi/dm/sentence-transformers?logo=pypi&style=flat&color=green)][#pypi-package]
+ [![Conda](https://img.shields.io/conda/dn/conda-forge/sentence-transformers?logo=anaconda)][#conda-forge-package]
+ --->
+
+ [#github-license]: https://github.com/UKPLab/sentence-transformers/blob/master/LICENSE
+ [#pypi-package]: https://pypi.org/project/sentence-transformers/
+ [#conda-forge-package]: https://anaconda.org/conda-forge/sentence-transformers
+ [#docs-package]: https://www.sbert.net/
+ <!--- BADGES: END --->
+
+ # Sentence Transformers: Multilingual Sentence, Paragraph, and Image Embeddings using BERT & Co.
+
+ This framework provides an easy method to compute dense vector representations for **sentences**, **paragraphs**, and **images**. The models are based on transformer networks like BERT / RoBERTa / XLM-RoBERTa etc. and achieve state-of-the-art performance in various tasks. Text is embedded in a vector space such that similar text is close and can efficiently be found using cosine similarity.
+
+ We provide an increasing number of **[state-of-the-art pretrained models](https://www.sbert.net/docs/pretrained_models.html)** for more than 100 languages, fine-tuned for various use-cases.
+
+ Further, this framework allows an easy **[fine-tuning of custom embeddings models](https://www.sbert.net/docs/training/overview.html)** to achieve maximal performance on your specific task.
+
+ For the **full documentation**, see **[www.SBERT.net](https://www.sbert.net)**.
+
+ The following publications are integrated in this framework:
+
+ - [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084) (EMNLP 2019)
+ - [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813) (EMNLP 2020)
+ - [Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks](https://arxiv.org/abs/2010.08240) (NAACL 2021)
+ - [The Curse of Dense Low-Dimensional Information Retrieval for Large Index Sizes](https://arxiv.org/abs/2012.14210) (arXiv 2020)
+ - [TSDAE: Using Transformer-based Sequential Denoising Auto-Encoder for Unsupervised Sentence Embedding Learning](https://arxiv.org/abs/2104.06979) (arXiv 2021)
+ - [BEIR: A Heterogenous Benchmark for Zero-shot Evaluation of Information Retrieval Models](https://arxiv.org/abs/2104.08663) (arXiv 2021)
+
+ ## Installation
+
+ We recommend **Python 3.6** or higher, **[PyTorch 1.6.0](https://pytorch.org/get-started/locally/)** or higher and **[transformers v4.6.0](https://github.com/huggingface/transformers)** or higher. The code does **not** work with Python 2.7.
+
+ **Install with pip**
+
+ Install *sentence-transformers* with `pip`:
+
+ ```
+ pip install -U sentence-transformers
+ ```
+
+ **Install with conda**
+
+ You can install *sentence-transformers* with `conda`:
+
+ ```
+ conda install -c conda-forge sentence-transformers
+ ```
+
+ **Install from sources**
+
+ Alternatively, you can also clone the latest version from the [repository](https://github.com/UKPLab/sentence-transformers) and install it directly from the source code:
+
+ ````
+ pip install -e .
+ ````
+
+ **PyTorch with CUDA**
+
+ If you want to use a GPU / CUDA, you must install PyTorch with the matching CUDA version. See
+ [PyTorch - Get Started](https://pytorch.org/get-started/locally/) for further details on how to install PyTorch.
+
71
+ ## Getting Started
72
+
73
+ See [Quickstart](https://www.sbert.net/docs/quickstart.html) in our documentation.
74
+
75
+ [This example](https://github.com/UKPLab/sentence-transformers/tree/master/examples/applications/computing-embeddings/computing_embeddings.py) shows you how to use an already trained Sentence Transformer model to embed sentences for another task.
76
+
77
+ First download a pretrained model.
78
+
79
+ ````python
80
+ from sentence_transformers import SentenceTransformer
81
+ model = SentenceTransformer('all-MiniLM-L6-v2')
82
+ ````
83
+
84
+ Then provide some sentences to the model.
85
+
86
+ ````python
87
+ sentences = ['This framework generates embeddings for each input sentence',
88
+ 'Sentences are passed as a list of strings.',
89
+ 'The quick brown fox jumps over the lazy dog.']
90
+ sentence_embeddings = model.encode(sentences)
91
+ ````
92
+
93
+ And that's it: we now have a list of numpy arrays with the embeddings.
94
+
95
+ ````python
96
+ for sentence, embedding in zip(sentences, sentence_embeddings):
97
+ print("Sentence:", sentence)
98
+ print("Embedding:", embedding)
99
+ print("")
100
+ ````
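+
+ Because similar texts end up close together in the vector space, the embeddings can be compared with cosine similarity. Here is a minimal sketch using the library's `util.cos_sim` helper (the sentence pair is illustrative):
+
+ ````python
+ from sentence_transformers import SentenceTransformer, util
+
+ model = SentenceTransformer('all-MiniLM-L6-v2')
+ emb1 = model.encode('This framework generates embeddings for each input sentence')
+ emb2 = model.encode('Each input sentence is mapped to a dense vector')
+ # cos_sim returns a 1x1 tensor for a single pair of embeddings
+ print("Cosine similarity:", util.cos_sim(emb1, emb2).item())
+ ````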
101
+
102
+ ## Pre-Trained Models
103
+
104
+ We provide a large list of [Pretrained Models](https://www.sbert.net/docs/pretrained_models.html) for more than 100 languages. Some models are general purpose models, while others produce embeddings for specific use cases. Pre-trained models can be loaded by just passing the model name: `SentenceTransformer('model_name')`.
105
+
106
+ [Β» Full list of pretrained models](https://www.sbert.net/docs/pretrained_models.html)
107
+
108
+ ## Training
109
+
110
+ This framework allows you to fine-tune your own sentence embedding methods, so that you get task-specific sentence embeddings. You have various options to choose from to obtain sentence embeddings well suited to your specific task.
111
+
112
+ See [Training Overview](https://www.sbert.net/docs/training/overview.html) for an introduction to training your own embedding models. We provide [various examples](https://github.com/UKPLab/sentence-transformers/tree/master/examples/training) of how to train models on various datasets; a minimal fine-tuning sketch follows the list below.
113
+
114
+ Some highlights are:
115
+ - Support for various transformer networks including BERT, RoBERTa, XLM-R, DistilBERT, Electra, BART, ...
116
+ - Multilingual and multi-task learning
117
+ - Evaluation during training to find the optimal model
118
+ - [10+ loss functions](https://www.sbert.net/docs/package_reference/losses.html) that let you tune models specifically for semantic search, paraphrase mining, semantic similarity comparison, and clustering, including triplet and contrastive losses.
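+
+ As a minimal, illustrative sketch (the sentence pairs and similarity labels below are made up), fine-tuning with one of the built-in losses looks like this:
+
+ ````python
+ from torch.utils.data import DataLoader
+ from sentence_transformers import SentenceTransformer, InputExample, losses
+
+ model = SentenceTransformer('all-MiniLM-L6-v2')
+ # each InputExample pairs two texts with a target cosine similarity in [0, 1]
+ train_examples = [InputExample(texts=['First sentence', 'Second sentence'], label=0.8),
+ InputExample(texts=['Another pair', 'Unrelated text'], label=0.3)]
+ train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
+ train_loss = losses.CosineSimilarityLoss(model)
+ model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10)
+ ````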
119
+
120
+ ## Performance
121
+
122
+ Our models are evaluated extensively on 15+ datasets, including challenging domains such as tweets, Reddit posts, and emails. They achieve by far the **best performance** of all available sentence embedding methods. Further, we provide several **smaller models** that are **optimized for speed**.
123
+
124
+ [Β» Full list of pretrained models](https://www.sbert.net/docs/pretrained_models.html)
125
+
126
+ ## Application Examples
127
+
128
+ You can use this framework for:
129
+
130
+ - [Computing Sentence Embeddings](https://www.sbert.net/examples/applications/computing-embeddings/README.html)
131
+ - [Semantic Textual Similarity](https://www.sbert.net/docs/usage/semantic_textual_similarity.html)
132
+ - [Clustering](https://www.sbert.net/examples/applications/clustering/README.html)
133
+ - [Paraphrase Mining](https://www.sbert.net/examples/applications/paraphrase-mining/README.html)
134
+ - [Translated Sentence Mining](https://www.sbert.net/examples/applications/parallel-sentence-mining/README.html)
135
+ - [Semantic Search](https://www.sbert.net/examples/applications/semantic-search/README.html)
136
+ - [Retrieve & Re-Rank](https://www.sbert.net/examples/applications/retrieve_rerank/README.html)
137
+ - [Text Summarization](https://www.sbert.net/examples/applications/text-summarization/README.html)
138
+ - [Multilingual Image Search, Clustering & Duplicate Detection](https://www.sbert.net/examples/applications/image-search/README.html)
139
+
140
+ and many more use-cases.
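+
+ For instance, semantic search over a small corpus can be sketched with the library's `util.semantic_search` helper (the corpus and query below are illustrative):
+
+ ````python
+ from sentence_transformers import SentenceTransformer, util
+
+ model = SentenceTransformer('all-MiniLM-L6-v2')
+ corpus = ['Revenue grew 12% year over year.', 'The CEO discussed supply chain risks.']
+ corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
+ query_embedding = model.encode('How did sales develop?', convert_to_tensor=True)
+ # returns, per query, a ranked list of {'corpus_id': ..., 'score': ...} dicts
+ hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=1)
+ print(corpus[hits[0][0]['corpus_id']], hits[0][0]['score'])
+ ````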
141
+
142
+ For all examples, see [examples/applications](https://github.com/UKPLab/sentence-transformers/tree/master/examples/applications).
143
+
144
+ ## Citing & Authors
145
+
146
+ If you find this repository helpful, feel free to cite our publication [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084):
147
+
148
+ ```bibtex
149
+ @inproceedings{reimers-2019-sentence-bert,
150
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
151
+ author = "Reimers, Nils and Gurevych, Iryna",
152
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
153
+ month = "11",
154
+ year = "2019",
155
+ publisher = "Association for Computational Linguistics",
156
+ url = "https://arxiv.org/abs/1908.10084",
157
+ }
158
+ ```
159
+
160
+ If you use one of the multilingual models, feel free to cite our publication [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813):
161
+
162
+ ```bibtex
163
+ @inproceedings{reimers-2020-multilingual-sentence-bert,
164
+ title = "Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation",
165
+ author = "Reimers, Nils and Gurevych, Iryna",
166
+ booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing",
167
+ month = "11",
168
+ year = "2020",
169
+ publisher = "Association for Computational Linguistics",
170
+ url = "https://arxiv.org/abs/2004.09813",
171
+ }
172
+ ```
173
+
174
+ Please have a look at [Publications](https://www.sbert.net/docs/publications.html) for our different publications that are integrated into SentenceTransformers.
175
+
176
+ Contact person: [Nils Reimers](https://www.nils-reimers.de), [info@nils-reimers.de](mailto:info@nils-reimers.de)
177
+
178
+ https://www.ukp.tu-darmstadt.de/
179
+
180
+ Don't hesitate to send us an e-mail or report an issue, if something is broken (and it shouldn't be) or if you have further questions.
181
+
182
+ > This repository contains experimental software and is published for the sole purpose of giving additional background details on the respective publication.
sentence-transformers/eval_beir.py ADDED
@@ -0,0 +1,89 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import sys
8
+ import argparse
9
+ import torch
10
+ import logging
11
+ import json
12
+ import numpy as np
13
+ import os
14
+
15
+ import src.slurm
16
+ import src.contriever
17
+ import src.beir_utils
18
+ import src.utils
19
+ import src.dist_utils
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ def main(args):
26
+
27
+ src.slurm.init_distributed_mode(args)
28
+ src.slurm.init_signal_handler()
29
+
30
+ os.makedirs(args.output_dir, exist_ok=True)
31
+
32
+ logger = src.utils.init_logger(args)
33
+
34
+ model, tokenizer, _ = src.contriever.load_retriever(args.model_name_or_path)
35
+ model = model.cuda()
36
+ model.eval()
37
+ query_encoder = model
38
+ doc_encoder = model
39
+
40
+ logger.info("Start indexing")
41
+
42
+ metrics = src.beir_utils.evaluate_model(
43
+ query_encoder=query_encoder,
44
+ doc_encoder=doc_encoder,
45
+ tokenizer=tokenizer,
46
+ dataset=args.dataset,
47
+ batch_size=args.per_gpu_batch_size,
48
+ norm_query=args.norm_query,
49
+ norm_doc=args.norm_doc,
50
+ is_main=src.dist_utils.is_main(),
51
+ split="dev" if args.dataset == "msmarco" else "test",
52
+ score_function=args.score_function,
53
+ beir_dir=args.beir_dir,
54
+ save_results_path=args.save_results_path,
55
+ lower_case=args.lower_case,
56
+ normalize_text=args.normalize_text,
57
+ )
58
+
59
+ if src.dist_utils.is_main():
60
+ for key, value in metrics.items():
61
+ logger.info(f"{args.dataset} : {key}: {value:.1f}")
62
+
63
+
64
+ if __name__ == "__main__":
65
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
66
+
67
+ parser.add_argument("--dataset", type=str, help="Evaluation dataset from the BEIR benchmark")
68
+ parser.add_argument("--beir_dir", type=str, default="./", help="Directory to save and load beir datasets")
69
+ parser.add_argument("--text_maxlength", type=int, default=512, help="Maximum text length")
70
+
71
+ parser.add_argument("--per_gpu_batch_size", default=128, type=int, help="Batch size per GPU/CPU for indexing.")
72
+ parser.add_argument("--output_dir", type=str, default="./my_experiment", help="Output directory")
73
+ parser.add_argument("--model_name_or_path", type=str, help="Model name or path")
74
+ parser.add_argument(
75
+ "--score_function", type=str, default="dot", help="Metric used to compute similarity between two embeddings"
76
+ )
77
+ parser.add_argument("--norm_query", action="store_true", help="Normalize query representation")
78
+ parser.add_argument("--norm_doc", action="store_true", help="Normalize document representation")
79
+ parser.add_argument("--lower_case", action="store_true", help="lowercase query and document text")
80
+ parser.add_argument(
81
+ "--normalize_text", action="store_true", help="Apply function to normalize some common characters"
82
+ )
83
+ parser.add_argument("--save_results_path", type=str, default=None, help="Path to save result object")
84
+
85
+ parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
86
+ parser.add_argument("--main_port", type=int, default=-1, help="Main port (for multi-node SLURM jobs)")
87
+
88
+ args, _ = parser.parse_known_args()
89
+ main(args)
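+
+ # Example invocation (model id and paths are hypothetical; adjust to your setup):
+ # python eval_beir.py --model_name_or_path facebook/contriever --dataset scifact --beir_dir ./beir_data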
sentence-transformers/evaluate_retrieved_passages.py ADDED
@@ -0,0 +1,66 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import json
9
+ import logging
10
+ import glob
11
+
12
+ import numpy as np
13
+ import torch
14
+
15
+ import src.utils
16
+
17
+ from src.evaluation import calculate_matches
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ def validate(data, workers_num):
22
+ match_stats = calculate_matches(data, workers_num)
23
+ top_k_hits = match_stats.top_k_hits
24
+
25
+ #logger.info('Validation results: top k documents hits %s', top_k_hits)
26
+ top_k_hits = [v / len(data) for v in top_k_hits]
27
+ #logger.info('Validation results: top k documents hits accuracy %s', top_k_hits)
28
+ return top_k_hits
29
+
30
+
31
+ def main(opt):
32
+ logger = src.utils.init_logger(opt, stdout_only=True)
33
+ datapaths = glob.glob(args.data)
34
+ r20, r100 = [], []
35
+ for path in datapaths:
36
+ data = []
37
+ with open(path, 'r') as fin:
38
+ for line in fin:
39
+ data.append(json.loads(line))
40
+ #data = json.load(fin)
41
+ answers = [ex['answers'] for ex in data]
42
+ top_k_hits = validate(data, args.validation_workers)
43
+ message = f"Evaluate results from {path}:"
44
+ for k in [5, 10, 20, 100]:
45
+ if k <= len(top_k_hits):
46
+ recall = 100 * top_k_hits[k-1]
47
+ if k == 20:
48
+ r20.append(f"{recall:.1f}")
49
+ if k == 100:
50
+ r100.append(f"{recall:.1f}")
51
+ message += f' R@{k}: {recall:.1f}'
52
+ logger.info(message)
53
+ print(datapaths)
54
+ print('\t'.join(r20))
55
+ print('\t'.join(r100))
56
+
57
+
58
+ if __name__ == '__main__':
59
+ parser = argparse.ArgumentParser()
60
+
61
+ parser.add_argument('--data', required=True, type=str, default=None)
62
+ parser.add_argument('--validation_workers', type=int, default=16,
63
+ help="Number of parallel processes to validate results")
64
+
65
+ args = parser.parse_args()
66
+ main(args)
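+
+ # Example invocation (path is hypothetical; adjust to your setup; --data accepts a glob):
+ # python evaluate_retrieved_passages.py --data "./retrieved/*.jsonl" --validation_workers 16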
sentence-transformers/finetuning.py ADDED
@@ -0,0 +1,249 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+
3
+ import pdb
4
+ import os
5
+ import time
6
+ import sys
7
+ import torch
8
+ from torch.utils.tensorboard import SummaryWriter
9
+ import logging
10
+ import json
11
+ import numpy as np
12
+ import torch.distributed as dist
13
+ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
14
+
15
+ from src.options import Options
16
+ from src import data, beir_utils, slurm, dist_utils, utils, contriever, finetuning_data, inbatch
17
+
18
+ import train
19
+
20
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ def finetuning(opt, model, optimizer, scheduler, tokenizer, step):
26
+
27
+ run_stats = utils.WeightedAvgStats()
28
+
29
+ tb_logger = utils.init_tb_logger(opt.output_dir)
30
+
31
+ if hasattr(model, "module"):
32
+ eval_model = model.module
33
+ else:
34
+ eval_model = model
35
+ eval_model = eval_model.get_encoder()
36
+
37
+ train_dataset = finetuning_data.Dataset(
38
+ datapaths=opt.train_data,
39
+ negative_ctxs=opt.negative_ctxs,
40
+ negative_hard_ratio=opt.negative_hard_ratio,
41
+ negative_hard_min_idx=opt.negative_hard_min_idx,
42
+ normalize=opt.eval_normalize_text,
43
+ global_rank=dist_utils.get_rank(),
44
+ world_size=dist_utils.get_world_size(),
45
+ maxload=opt.maxload,
46
+ training=True,
47
+ )
48
+ collator = finetuning_data.Collator(tokenizer, passage_maxlength=opt.chunk_length)
49
+ train_sampler = RandomSampler(train_dataset)
50
+ train_dataloader = DataLoader(
51
+ train_dataset,
52
+ sampler=train_sampler,
53
+ batch_size=opt.per_gpu_batch_size,
54
+ drop_last=True,
55
+ num_workers=opt.num_workers,
56
+ collate_fn=collator,
57
+ )
58
+
59
+ train.eval_model(opt, eval_model, None, tokenizer, tb_logger, step)
60
+ evaluate(opt, eval_model, tokenizer, tb_logger, step)
61
+
62
+ epoch = 1
63
+
64
+ model.train()
65
+ prev_ids, prev_mask = None, None
66
+ while step < opt.total_steps:
67
+ logger.info(f"Start epoch {epoch}, number of batches: {len(train_dataloader)}")
68
+ for i, batch in enumerate(train_dataloader):
69
+ batch = {key: value.cuda() if isinstance(value, torch.Tensor) else value for key, value in batch.items()}
70
+ step += 1
71
+
72
+ train_loss, iter_stats = model(**batch, stats_prefix="train")
73
+ train_loss.backward()
74
+
75
+ if opt.optim == "sam" or opt.optim == "asam":
76
+ optimizer.first_step(zero_grad=True)
77
+
78
+ sam_loss, _ = model(**batch, stats_prefix="train/sam_opt")
79
+ sam_loss.backward()
80
+ optimizer.second_step(zero_grad=True)
81
+ else:
82
+ optimizer.step()
83
+ scheduler.step()
84
+ optimizer.zero_grad()
85
+
86
+ run_stats.update(iter_stats)
87
+
88
+ if step % opt.log_freq == 0:
89
+ log = f"{step} / {opt.total_steps}"
90
+ for k, v in sorted(run_stats.average_stats.items()):
91
+ log += f" | {k}: {v:.3f}"
92
+ if tb_logger:
93
+ tb_logger.add_scalar(k, v, step)
94
+ log += f" | lr: {scheduler.get_last_lr()[0]:0.3g}"
95
+ log += f" | Memory: {torch.cuda.max_memory_allocated()//1e9} GiB"
96
+
97
+ logger.info(log)
98
+ run_stats.reset()
99
+
100
+ if step % opt.eval_freq == 0:
101
+
102
+ train.eval_model(opt, eval_model, None, tokenizer, tb_logger, step)
103
+ evaluate(opt, eval_model, tokenizer, tb_logger, step)
104
+
105
+ if step % opt.save_freq == 0 and dist_utils.get_rank() == 0:
106
+ utils.save(
107
+ eval_model,
108
+ optimizer,
109
+ scheduler,
110
+ step,
111
+ opt,
112
+ opt.output_dir,
113
+ f"step-{step}",
114
+ )
115
+ model.train()
116
+
117
+ if step >= opt.total_steps:
118
+ break
119
+
120
+ epoch += 1
121
+
122
+
123
+ def evaluate(opt, model, tokenizer, tb_logger, step):
124
+ dataset = finetuning_data.Dataset(
125
+ datapaths=opt.eval_data,
126
+ normalize=opt.eval_normalize_text,
127
+ global_rank=dist_utils.get_rank(),
128
+ world_size=dist_utils.get_world_size(),
129
+ maxload=opt.maxload,
130
+ training=False,
131
+ )
132
+ collator = finetuning_data.Collator(tokenizer, passage_maxlength=opt.chunk_length)
133
+ sampler = SequentialSampler(dataset)
134
+ dataloader = DataLoader(
135
+ dataset,
136
+ sampler=sampler,
137
+ batch_size=opt.per_gpu_batch_size,
138
+ drop_last=False,
139
+ num_workers=opt.num_workers,
140
+ collate_fn=collator,
141
+ )
142
+
143
+ model.eval()
144
+ if hasattr(model, "module"):
145
+ model = model.module
146
+ correct_samples, total_samples, total_step = 0, 0, 0
147
+ all_q, all_g, all_n = [], [], []
148
+ with torch.no_grad():
149
+ for i, batch in enumerate(dataloader):
150
+ batch = {key: value.cuda() if isinstance(value, torch.Tensor) else value for key, value in batch.items()}
151
+
152
+ all_tokens = torch.cat([batch["g_tokens"], batch["n_tokens"]], dim=0)
153
+ all_mask = torch.cat([batch["g_mask"], batch["n_mask"]], dim=0)
154
+
155
+ q_emb = model(input_ids=batch["q_tokens"], attention_mask=batch["q_mask"], normalize=opt.norm_query)
156
+ all_emb = model(input_ids=all_tokens, attention_mask=all_mask, normalize=opt.norm_doc)
157
+
158
+ g_emb, n_emb = torch.split(all_emb, [len(batch["g_tokens"]), len(batch["n_tokens"])])
159
+
160
+ all_q.append(q_emb)
161
+ all_g.append(g_emb)
162
+ all_n.append(n_emb)
163
+
164
+ all_q = torch.cat(all_q, dim=0)
165
+ all_g = torch.cat(all_g, dim=0)
166
+ all_n = torch.cat(all_n, dim=0)
167
+
168
+ labels = torch.arange(0, len(all_q), device=all_q.device, dtype=torch.long)
169
+
170
+ all_sizes = dist_utils.get_varsize(all_g)
171
+ all_g = dist_utils.varsize_gather_nograd(all_g)
172
+ all_n = dist_utils.varsize_gather_nograd(all_n)
173
+ labels = labels + sum(all_sizes[: dist_utils.get_rank()])
174
+
175
+ scores_pos = torch.einsum("id, jd->ij", all_q, all_g)
176
+ scores_neg = torch.einsum("id, jd->ij", all_q, all_n)
177
+ scores = torch.cat([scores_pos, scores_neg], dim=-1)
178
+
179
+ argmax_idx = torch.argmax(scores, dim=1)
180
+ sorted_scores, indices = torch.sort(scores, descending=True)
181
+ isrelevant = indices == labels[:, None]
182
+ rs = [r.cpu().numpy().nonzero()[0] for r in isrelevant]
183
+ mrr = np.mean([1.0 / (r[0] + 1) if r.size else 0.0 for r in rs])
184
+
185
+ acc = (argmax_idx == labels).sum() / all_q.size(0)
186
+ acc, total = dist_utils.weighted_average(acc, all_q.size(0))
187
+ mrr, _ = dist_utils.weighted_average(mrr, all_q.size(0))
188
+ acc = 100 * acc
189
+
190
+ message = []
191
+ if dist_utils.is_main():
192
+ message = [f"eval acc: {acc:.2f}%", f"eval mrr: {mrr:.3f}"]
193
+ logger.info(" | ".join(message))
194
+ if tb_logger is not None:
195
+ tb_logger.add_scalar("eval_acc", acc, step)
196
+ tb_logger.add_scalar("mrr", mrr, step)
197
+
198
+
199
+ def main():
200
+ logger.info("Start")
201
+
202
+ options = Options()
203
+ opt = options.parse()
204
+
205
+ torch.manual_seed(opt.seed)
206
+ slurm.init_distributed_mode(opt)
207
+ slurm.init_signal_handler()
208
+
209
+ directory_exists = os.path.isdir(opt.output_dir)
210
+ if dist.is_initialized():
211
+ dist.barrier()
212
+ os.makedirs(opt.output_dir, exist_ok=True)
213
+ if not directory_exists and dist_utils.is_main():
214
+ options.print_options(opt)
215
+ if dist.is_initialized():
216
+ dist.barrier()
217
+ utils.init_logger(opt)
218
+
219
+ step = 0
220
+
221
+ retriever, tokenizer, retriever_model_id = contriever.load_retriever(opt.model_path, opt.pooling, opt.random_init)
222
+ opt.retriever_model_id = retriever_model_id
223
+ model = inbatch.InBatch(opt, retriever, tokenizer)
224
+
225
+ model = model.cuda()
226
+
227
+ optimizer, scheduler = utils.set_optim(opt, model)
228
+ # if dist_utils.is_main():
229
+ # utils.save(model, optimizer, scheduler, global_step, 0., opt, opt.output_dir, f"step-{0}")
230
+ logger.info(utils.get_parameters(model))
231
+
232
+ for name, module in model.named_modules():
233
+ if isinstance(module, torch.nn.Dropout):
234
+ module.p = opt.dropout
235
+
236
+ if torch.distributed.is_initialized():
237
+ model = torch.nn.parallel.DistributedDataParallel(
238
+ model,
239
+ device_ids=[opt.local_rank],
240
+ output_device=opt.local_rank,
241
+ find_unused_parameters=False,
242
+ )
243
+
244
+ logger.info("Start training")
245
+ finetuning(opt, model, optimizer, scheduler, tokenizer, step)
246
+
247
+
248
+ if __name__ == "__main__":
249
+ main()
sentence-transformers/generate_passage_embeddings.py ADDED
@@ -0,0 +1,124 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os
8
+
9
+ import argparse
10
+ import csv
11
+ import logging
12
+ import pickle
13
+
14
+ import numpy as np
15
+ import torch
16
+
17
+ import transformers
18
+
19
+ import src.slurm
20
+ import src.contriever
21
+ import src.utils
22
+ import src.data
23
+ import src.normalize_text
24
+
25
+
26
+ def embed_passages(args, passages, model, tokenizer):
27
+ total = 0
28
+ allids, allembeddings = [], []
29
+ batch_ids, batch_text = [], []
30
+ with torch.no_grad():
31
+ for k, p in enumerate(passages):
32
+ batch_ids.append(p["id"])
33
+ if args.no_title or "title" not in p:
34
+ text = p["text"]
35
+ else:
36
+ text = p["title"] + " " + p["text"]
37
+ if args.lowercase:
38
+ text = text.lower()
39
+ if args.normalize_text:
40
+ text = src.normalize_text.normalize(text)
41
+ batch_text.append(text)
42
+
43
+ if len(batch_text) == args.per_gpu_batch_size or k == len(passages) - 1:
44
+
45
+ encoded_batch = tokenizer.batch_encode_plus(
46
+ batch_text,
47
+ return_tensors="pt",
48
+ max_length=args.passage_maxlength,
49
+ padding=True,
50
+ truncation=True,
51
+ )
52
+
53
+ encoded_batch = {k: v.cuda() for k, v in encoded_batch.items()}
54
+ embeddings = model(**encoded_batch)
55
+
56
+ embeddings = embeddings.cpu()
57
+ total += len(batch_ids)
58
+ allids.extend(batch_ids)
59
+ allembeddings.append(embeddings)
60
+
61
+ batch_text = []
62
+ batch_ids = []
63
+ if k % 100000 == 0 and k > 0:
64
+ print(f"Encoded passages {total}")
65
+
66
+ allembeddings = torch.cat(allembeddings, dim=0).numpy()
67
+ return allids, allembeddings
68
+
69
+
70
+ def main(args):
71
+ model, tokenizer, _ = src.contriever.load_retriever(args.model_name_or_path)
72
+ print(f"Model loaded from {args.model_name_or_path}.", flush=True)
73
+ model.eval()
74
+ model = model.cuda()
75
+ if not args.no_fp16:
76
+ model = model.half()
77
+
78
+ passages = src.data.load_passages(args.passages)
79
+
80
+ shard_size = len(passages) // args.num_shards
81
+ start_idx = args.shard_id * shard_size
82
+ end_idx = start_idx + shard_size
83
+ if args.shard_id == args.num_shards - 1:
84
+ end_idx = len(passages)
85
+
86
+ passages = passages[start_idx:end_idx]
87
+ print(f"Embedding generation for {len(passages)} passages from idx {start_idx} to {end_idx}.")
88
+
89
+ allids, allembeddings = embed_passages(args, passages, model, tokenizer)
90
+
91
+ save_file = os.path.join(args.output_dir, args.prefix + f"_{args.shard_id:02d}")
92
+ os.makedirs(args.output_dir, exist_ok=True)
93
+ print(f"Saving {len(allids)} passage embeddings to {save_file}.")
94
+ with open(save_file, mode="wb") as f:
95
+ pickle.dump((allids, allembeddings), f)
96
+
97
+ print(f"Total passages processed {len(allids)}. Written to {save_file}.")
98
+
99
+
100
+ if __name__ == "__main__":
101
+ parser = argparse.ArgumentParser()
102
+
103
+ parser.add_argument("--passages", type=str, default=None, help="Path to passages (.tsv file)")
104
+ parser.add_argument("--output_dir", type=str, default="wikipedia_embeddings", help="dir path to save embeddings")
105
+ parser.add_argument("--prefix", type=str, default="passages", help="prefix path to save embeddings")
106
+ parser.add_argument("--shard_id", type=int, default=0, help="Id of the current shard")
107
+ parser.add_argument("--num_shards", type=int, default=1, help="Total number of shards")
108
+ parser.add_argument(
109
+ "--per_gpu_batch_size", type=int, default=512, help="Batch size for the passage encoder forward pass"
110
+ )
111
+ parser.add_argument("--passage_maxlength", type=int, default=512, help="Maximum number of tokens in a passage")
112
+ parser.add_argument(
113
+ "--model_name_or_path", type=str, help="path to directory containing model weights and config file"
114
+ )
115
+ parser.add_argument("--no_fp16", action="store_true", help="inference in fp32")
116
+ parser.add_argument("--no_title", action="store_true", help="title not added to the passage body")
117
+ parser.add_argument("--lowercase", action="store_true", help="lowercase text before encoding")
118
+ parser.add_argument("--normalize_text", action="store_true", help="normalize text before encoding")
119
+
120
+ args = parser.parse_args()
121
+
122
+ src.slurm.init_distributed_mode(args)
123
+
124
+ main(args)
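+
+ # Example invocation (model id and paths are hypothetical; adjust to your setup):
+ # python generate_passage_embeddings.py --model_name_or_path facebook/contriever \
+ #     --passages passages.tsv --output_dir wikipedia_embeddings --shard_id 0 --num_shards 4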
sentence-transformers/index.rst ADDED
@@ -0,0 +1,189 @@
1
+ SentenceTransformers Documentation
2
+ =================================================
3
+
4
+ SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings. The initial work is described in our paper `Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks <https://arxiv.org/abs/1908.10084>`_.
5
+
6
+ You can use this framework to compute sentence / text embeddings for more than 100 languages. These embeddings can then be compared, e.g., with cosine similarity to find sentences with a similar meaning. This can be useful for `semantic textual similarity <docs/usage/semantic_textual_similarity.html>`_, `semantic search <examples/applications/semantic-search/README.html>`_, or `paraphrase mining <examples/applications/paraphrase-mining/README.html>`_.
7
+
8
+ The framework is based on `PyTorch <https://pytorch.org/>`_ and `Transformers <https://huggingface.co/transformers/>`_ and offers a large collection of `pre-trained models <docs/pretrained_models.html>`_ tuned for various tasks. Further, it is easy to `fine-tune your own models <docs/training/overview.html>`_.
9
+
10
+
11
+ Installation
12
+ =================================================
13
+
14
+ You can install it using pip:
15
+
16
+ .. code-block:: bash
17
+
18
+ pip install -U sentence-transformers
19
+
20
+
21
+ We recommend **Python 3.6** or higher, and at least **PyTorch 1.6.0**. See `installation <docs/installation.html>`_ for further installation options, especially if you want to use a GPU.
22
+
23
+
24
+
25
+ Usage
26
+ =================================================
27
+ The usage is as simple as:
28
+
29
+ .. code-block:: python
30
+
31
+ from sentence_transformers import SentenceTransformer
32
+ model = SentenceTransformer('all-MiniLM-L6-v2')
33
+
34
+ #Our sentences we like to encode
35
+ sentences = ['This framework generates embeddings for each input sentence',
36
+ 'Sentences are passed as a list of strings.',
37
+ 'The quick brown fox jumps over the lazy dog.']
38
+
39
+ #Sentences are encoded by calling model.encode()
40
+ embeddings = model.encode(sentences)
41
+
42
+ #Print the embeddings
43
+ for sentence, embedding in zip(sentences, embeddings):
44
+ print("Sentence:", sentence)
45
+ print("Embedding:", embedding)
46
+ print("")
47
+
48
+
49
+
50
+
51
+ Performance
52
+ =========================
53
+
54
+ Our models are evaluated extensively and achieve state-of-the-art performance on various tasks. Further, the code is tuned to provide the highest possible speed. Have a look at `Pre-Trained Models <https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models/>`_ for an overview of available models and the respective performance on different tasks.
55
+
56
+
57
+
58
+
59
+
60
+
61
+ Contact
62
+ =========================
63
+
64
+ Contact person: Nils Reimers, info@nils-reimers.de
65
+
66
+ https://www.ukp.tu-darmstadt.de/
67
+
68
+
69
+ Don't hesitate to send us an e-mail or report an issue, if something is broken (and it shouldn't be) or if you have further questions.
70
+
71
+ *This repository contains experimental software and is published for the sole purpose of giving additional background details on the respective publication.*
72
+
73
+
74
+ Citing & Authors
75
+ =========================
76
+
77
+ If you find this repository helpful, feel free to cite our publication `Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks <https://arxiv.org/abs/1908.10084>`_:
78
+
79
+ .. code-block:: bibtex
80
+
81
+ @inproceedings{reimers-2019-sentence-bert,
82
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
83
+ author = "Reimers, Nils and Gurevych, Iryna",
84
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
85
+ month = "11",
86
+ year = "2019",
87
+ publisher = "Association for Computational Linguistics",
88
+ url = "https://arxiv.org/abs/1908.10084",
89
+ }
90
+
91
+
92
+
93
+ If you use one of the multilingual models, feel free to cite our publication `Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation <https://arxiv.org/abs/2004.09813>`_:
94
+
95
+ .. code-block:: bibtex
96
+
97
+ @inproceedings{reimers-2020-multilingual-sentence-bert,
98
+ title = "Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation",
99
+ author = "Reimers, Nils and Gurevych, Iryna",
100
+ booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing",
101
+ month = "11",
102
+ year = "2020",
103
+ publisher = "Association for Computational Linguistics",
104
+ url = "https://arxiv.org/abs/2004.09813",
105
+ }
106
+
107
+
108
+
109
+ If you use the code for `data augmentation <https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/data_augmentation>`_, feel free to cite our publication `Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks <https://arxiv.org/abs/2010.08240>`_:
110
+
111
+ .. code-block:: bibtex
112
+
113
+ @inproceedings{thakur-2020-AugSBERT,
114
+ title = "Augmented {SBERT}: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks",
115
+ author = "Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna",
116
+ booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies",
117
+ month = jun,
118
+ year = "2021",
119
+ address = "Online",
120
+ publisher = "Association for Computational Linguistics",
121
+ url = "https://www.aclweb.org/anthology/2021.naacl-main.28",
122
+ pages = "296--310",
123
+ }
124
+
125
+
126
+
127
+ .. toctree::
128
+ :maxdepth: 2
129
+ :caption: Overview
130
+
131
+ docs/installation
132
+ docs/quickstart
133
+ docs/pretrained_models
134
+ docs/pretrained_cross-encoders
135
+ docs/publications
136
+ docs/hugging_face
137
+
138
+ .. toctree::
139
+ :maxdepth: 2
140
+ :caption: Usage
141
+
142
+ examples/applications/computing-embeddings/README
143
+ docs/usage/semantic_textual_similarity
144
+ examples/applications/semantic-search/README
145
+ examples/applications/retrieve_rerank/README
146
+ examples/applications/clustering/README
147
+ examples/applications/paraphrase-mining/README
148
+ examples/applications/parallel-sentence-mining/README
149
+ examples/applications/cross-encoder/README
150
+ examples/applications/image-search/README
151
+
152
+ .. toctree::
153
+ :maxdepth: 2
154
+ :caption: Training
155
+
156
+ docs/training/overview
157
+ examples/training/multilingual/README
158
+ examples/training/distillation/README
159
+ examples/training/cross-encoder/README
160
+ examples/training/data_augmentation/README
161
+
162
+ .. toctree::
163
+ :maxdepth: 2
164
+ :caption: Training Examples
165
+
166
+ examples/training/sts/README
167
+ examples/training/nli/README
168
+ examples/training/paraphrases/README
169
+ examples/training/quora_duplicate_questions/README
170
+ examples/training/ms_marco/README
171
+
172
+ .. toctree::
173
+ :maxdepth: 2
174
+ :caption: Unsupervised Learning
175
+
176
+ examples/unsupervised_learning/README
177
+ examples/domain_adaptation/README
178
+
179
+ .. toctree::
180
+ :maxdepth: 1
181
+ :caption: Package Reference
182
+
183
+ docs/package_reference/SentenceTransformer
184
+ docs/package_reference/util
185
+ docs/package_reference/models
186
+ docs/package_reference/losses
187
+ docs/package_reference/evaluation
188
+ docs/package_reference/datasets
189
+ docs/package_reference/cross_encoder
sentence-transformers/passage_retrieval.py ADDED
@@ -0,0 +1,249 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os
8
+ import argparse
9
+ import csv
10
+ import json
11
+ import logging
12
+ import pickle
13
+ import time
14
+ import glob
15
+ from pathlib import Path
16
+
17
+ import numpy as np
18
+ import torch
19
+ import transformers
20
+
21
+ import src.index
22
+ import src.contriever
23
+ import src.utils
24
+ import src.slurm
25
+ import src.data
26
+ from src.evaluation import calculate_matches
27
+ import src.normalize_text
28
+
29
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
30
+
31
+
32
+ def embed_queries(args, queries, model, tokenizer):
33
+ model.eval()
34
+ embeddings, batch_question = [], []
35
+ with torch.no_grad():
36
+
37
+ for k, q in enumerate(queries):
38
+ if args.lowercase:
39
+ q = q.lower()
40
+ if args.normalize_text:
41
+ q = src.normalize_text.normalize(q)
42
+ batch_question.append(q)
43
+
44
+ if len(batch_question) == args.per_gpu_batch_size or k == len(queries) - 1:
45
+
46
+ encoded_batch = tokenizer.batch_encode_plus(
47
+ batch_question,
48
+ return_tensors="pt",
49
+ max_length=args.question_maxlength,
50
+ padding=True,
51
+ truncation=True,
52
+ )
53
+ encoded_batch = {k: v.cuda() for k, v in encoded_batch.items()}
54
+ output = model(**encoded_batch)
55
+ embeddings.append(output.cpu())
56
+
57
+ batch_question = []
58
+
59
+ embeddings = torch.cat(embeddings, dim=0)
60
+ print(f"Questions embeddings shape: {embeddings.size()}")
61
+
62
+ return embeddings.numpy()
63
+
64
+
65
+ def index_encoded_data(index, embedding_files, indexing_batch_size):
66
+ allids = []
67
+ allembeddings = np.array([])
68
+ for i, file_path in enumerate(embedding_files):
69
+ print(f"Loading file {file_path}")
70
+ with open(file_path, "rb") as fin:
71
+ ids, embeddings = pickle.load(fin)
72
+
73
+ allembeddings = np.vstack((allembeddings, embeddings)) if allembeddings.size else embeddings
74
+ allids.extend(ids)
75
+ while allembeddings.shape[0] > indexing_batch_size:
76
+ allembeddings, allids = add_embeddings(index, allembeddings, allids, indexing_batch_size)
77
+
78
+ while allembeddings.shape[0] > 0:
79
+ allembeddings, allids = add_embeddings(index, allembeddings, allids, indexing_batch_size)
80
+
81
+ print("Data indexing completed.")
82
+
83
+
84
+ def add_embeddings(index, embeddings, ids, indexing_batch_size):
85
+ end_idx = min(indexing_batch_size, embeddings.shape[0])
86
+ ids_toadd = ids[:end_idx]
87
+ embeddings_toadd = embeddings[:end_idx]
88
+ ids = ids[end_idx:]
89
+ embeddings = embeddings[end_idx:]
90
+ index.index_data(ids_toadd, embeddings_toadd)
91
+ return embeddings, ids
92
+
93
+
94
+ def validate(data, workers_num):
95
+ match_stats = calculate_matches(data, workers_num)
96
+ top_k_hits = match_stats.top_k_hits
97
+
98
+ print(f"Validation results: top k documents hits {top_k_hits}")
99
+ top_k_hits = [v / len(data) for v in top_k_hits]
100
+ message = ""
101
+ for k in [5, 10, 20, 100]:
102
+ if k <= len(top_k_hits):
103
+ message += f"R@{k}: {top_k_hits[k-1]} "
104
+ print(message)
105
+ return match_stats.questions_doc_hits
106
+
107
+
108
+ def add_passages(data, passages, top_passages_and_scores):
109
+ # add passages to original data
110
+ merged_data = []
111
+ assert len(data) == len(top_passages_and_scores)
112
+ for i, d in enumerate(data):
113
+ results_and_scores = top_passages_and_scores[i]
114
+ docs = [passages[doc_id] for doc_id in results_and_scores[0]]
115
+ scores = [str(score) for score in results_and_scores[1]]
116
+ ctxs_num = len(docs)
117
+ d["ctxs"] = [
118
+ {
119
+ "id": results_and_scores[0][c],
120
+ "title": docs[c]["title"],
121
+ "text": docs[c]["text"],
122
+ "score": scores[c],
123
+ }
124
+ for c in range(ctxs_num)
125
+ ]
126
+
127
+
128
+ def add_hasanswer(data, hasanswer):
129
+ # add hasanswer to data
130
+ for i, ex in enumerate(data):
131
+ for k, d in enumerate(ex["ctxs"]):
132
+ d["hasanswer"] = hasanswer[i][k]
133
+
134
+
135
+ def load_data(data_path):
136
+ if data_path.endswith(".json"):
137
+ with open(data_path, "r") as fin:
138
+ data = json.load(fin)
139
+ elif data_path.endswith(".jsonl"):
140
+ data = []
141
+ with open(data_path, "r") as fin:
142
+ for k, example in enumerate(fin):
143
+ example = json.loads(example)
144
+ data.append(example)
145
+ else:
+ raise ValueError(f"Unsupported data file: {data_path}")
+ return data
146
+
147
+
148
+ def main(args):
149
+
150
+ print(f"Loading model from: {args.model_name_or_path}")
151
+ model, tokenizer, _ = src.contriever.load_retriever(args.model_name_or_path)
152
+ model.eval()
153
+ model = model.cuda()
154
+ if not args.no_fp16:
155
+ model = model.half()
156
+
157
+ index = src.index.Indexer(args.projection_size, args.n_subquantizers, args.n_bits)
158
+
159
+ # index all passages
160
+ input_paths = glob.glob(args.passages_embeddings)
161
+ input_paths = sorted(input_paths)
162
+ embeddings_dir = os.path.dirname(input_paths[0])
163
+ index_path = os.path.join(embeddings_dir, "index.faiss")
164
+ if args.save_or_load_index and os.path.exists(index_path):
165
+ index.deserialize_from(embeddings_dir)
166
+ else:
167
+ print(f"Indexing passages from files {input_paths}")
168
+ start_time_indexing = time.time()
169
+ index_encoded_data(index, input_paths, args.indexing_batch_size)
170
+ print(f"Indexing time: {time.time()-start_time_indexing:.1f} s.")
171
+ if args.save_or_load_index:
172
+ index.serialize(embeddings_dir)
173
+
174
+ # load passages
175
+ passages = src.data.load_passages(args.passages)
176
+ passage_id_map = {x["id"]: x for x in passages}
177
+
178
+ data_paths = glob.glob(args.data)
179
+ alldata = []
180
+ for path in data_paths:
181
+ data = load_data(path)
182
+ output_path = os.path.join(args.output_dir, os.path.basename(path))
183
+
184
+ queries = [ex["question"] for ex in data]
185
+ questions_embedding = embed_queries(args, queries, model, tokenizer)
186
+
187
+ # get top k results
188
+ start_time_retrieval = time.time()
189
+ top_ids_and_scores = index.search_knn(questions_embedding, args.n_docs)
190
+ print(f"Search time: {time.time()-start_time_retrieval:.1f} s.")
191
+
192
+ add_passages(data, passage_id_map, top_ids_and_scores)
193
+ hasanswer = validate(data, args.validation_workers)
194
+ add_hasanswer(data, hasanswer)
195
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
196
+ with open(output_path, "w") as fout:
197
+ for ex in data:
198
+ json.dump(ex, fout, ensure_ascii=False)
199
+ fout.write("\n")
200
+ print(f"Saved results to {output_path}")
201
+
202
+
203
+ if __name__ == "__main__":
204
+ parser = argparse.ArgumentParser()
205
+
206
+ parser.add_argument(
207
+ "--data",
208
+ required=True,
209
+ type=str,
210
+ default=None,
211
+ help=".json file containing question and answers, similar format to reader data",
212
+ )
213
+ parser.add_argument("--passages", type=str, default=None, help="Path to passages (.tsv file)")
214
+ parser.add_argument("--passages_embeddings", type=str, default=None, help="Glob path to encoded passages")
215
+ parser.add_argument(
216
+ "--output_dir", type=str, default=None, help="Results are written to outputdir with data suffix"
217
+ )
218
+ parser.add_argument("--n_docs", type=int, default=100, help="Number of documents to retrieve per questions")
219
+ parser.add_argument(
220
+ "--validation_workers", type=int, default=32, help="Number of parallel processes to validate results"
221
+ )
222
+ parser.add_argument("--per_gpu_batch_size", type=int, default=64, help="Batch size for question encoding")
223
+ parser.add_argument(
224
+ "--save_or_load_index", action="store_true", help="If enabled, save index and load index if it exists"
225
+ )
226
+ parser.add_argument(
227
+ "--model_name_or_path", type=str, help="path to directory containing model weights and config file"
228
+ )
229
+ parser.add_argument("--no_fp16", action="store_true", help="inference in fp32")
230
+ parser.add_argument("--question_maxlength", type=int, default=512, help="Maximum number of tokens in a question")
231
+ parser.add_argument(
232
+ "--indexing_batch_size", type=int, default=1000000, help="Batch size of the number of passages indexed"
233
+ )
234
+ parser.add_argument("--projection_size", type=int, default=768)
235
+ parser.add_argument(
236
+ "--n_subquantizers",
237
+ type=int,
238
+ default=0,
239
+ help="Number of subquantizer used for vector quantization, if 0 flat index is used",
240
+ )
241
+ parser.add_argument("--n_bits", type=int, default=8, help="Number of bits per subquantizer")
242
+ parser.add_argument("--lang", nargs="+")
243
+ parser.add_argument("--dataset", type=str, default="none")
244
+ parser.add_argument("--lowercase", action="store_true", help="lowercase text before encoding")
245
+ parser.add_argument("--normalize_text", action="store_true", help="normalize text")
246
+
247
+ args = parser.parse_args()
248
+ src.slurm.init_distributed_mode(args)
249
+ main(args)
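+
+ # Example invocation (model id and paths are hypothetical; adjust to your setup):
+ # python passage_retrieval.py --model_name_or_path facebook/contriever --passages passages.tsv \
+ #     --passages_embeddings "wikipedia_embeddings/passages_*" --data qa_data.json --output_dir ./retrieval_out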
sentence-transformers/preprocess.py ADDED
@@ -0,0 +1,68 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+
3
+ import os
4
+ import argparse
5
+ import torch
6
+
7
+ import transformers
8
+ from src.normalize_text import normalize
9
+
10
+
11
+ def save(tensor, split_path):
12
+ if not os.path.exists(os.path.dirname(split_path)):
13
+ os.makedirs(os.path.dirname(split_path))
14
+ with open(split_path, 'wb') as fout:
15
+ torch.save(tensor, fout)
16
+
17
+ def apply_tokenizer(path, tokenizer, normalize_text=False):
18
+ alltokens = []
19
+ lines = []
20
+ with open(path, "r", encoding="utf-8") as fin:
21
+ for k, line in enumerate(fin):
22
+ if normalize_text:
23
+ line = normalize(line)
24
+
25
+ lines.append(line)
26
+ if len(lines) > 1000000:
27
+ tokens = tokenizer.batch_encode_plus(lines, add_special_tokens=False)['input_ids']
28
+ tokens = [torch.tensor(x, dtype=torch.int) for x in tokens]
29
+ alltokens.extend(tokens)
30
+ lines = []
31
+
32
+ tokens = tokenizer.batch_encode_plus(lines, add_special_tokens=False)['input_ids']
33
+ tokens = [torch.tensor(x, dtype=torch.int) for x in tokens]
34
+ alltokens.extend(tokens)
35
+
36
+ alltokens = torch.cat(alltokens)
37
+ return alltokens
38
+
39
+ def tokenize_file(args):
40
+ filename = os.path.basename(args.datapath)
41
+ savepath = os.path.join(args.outdir, f"{filename}.pkl")
42
+ if os.path.exists(savepath):
43
+ if args.overwrite:
44
+ print(f"File {savepath} already exists, overwriting")
45
+ else:
46
+ print(f"File {savepath} already exists, exiting")
47
+ return
48
+ try:
49
+ tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer, local_files_only=True)
50
+ except:
51
+ tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer, local_files_only=False)
52
+ print(f"Encoding {args.datapath}...")
53
+ tokens = apply_tokenizer(args.datapath, tokenizer, normalize_text=args.normalize_text)
54
+
55
+ print(f"Saving at {savepath}...")
56
+ save(tokens, savepath)
57
+
58
+
59
+ if __name__ == '__main__':
60
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
61
+ parser.add_argument("--datapath", type=str)
62
+ parser.add_argument("--outdir", type=str)
63
+ parser.add_argument("--tokenizer", type=str)
64
+ parser.add_argument("--overwrite", action="store_true")
65
+ parser.add_argument("--normalize_text", action="store_true")
66
+
67
+ args, _ = parser.parse_known_args()
68
+ tokenize_file(args)
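+
+ # Example invocation (paths and tokenizer are hypothetical; adjust to your setup):
+ # python preprocess.py --datapath ./corpus.txt --outdir ./encoded --tokenizer bert-base-uncased --normalize_text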
sentence-transformers/requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ transformers>=4.6.0,<5.0.0
2
+ tokenizers>=0.10.3
3
+ tqdm
4
+ torch>=1.6.0
5
+ torchvision
6
+ numpy
7
+ scikit-learn
8
+ scipy
9
+ nltk
10
+ sentencepiece
11
+ huggingface-hub
sentence-transformers/setup.cfg ADDED
@@ -0,0 +1,2 @@
1
+ [metadata]
2
+ description-file = README.md
sentence-transformers/setup.py ADDED
@@ -0,0 +1,41 @@
1
+ from setuptools import setup, find_packages
2
+
3
+ with open("README.md", mode="r", encoding="utf-8") as readme_file:
4
+ readme = readme_file.read()
5
+
6
+
7
+
8
+ setup(
9
+ name="sentence-transformers",
10
+ version="2.2.2",
11
+ author="Nils Reimers",
12
+ author_email="info@nils-reimers.de",
13
+ description="Multilingual text embeddings",
14
+ long_description=readme,
15
+ long_description_content_type="text/markdown",
16
+ license="Apache License 2.0",
17
+ url="https://www.SBERT.net",
18
+ download_url="https://github.com/UKPLab/sentence-transformers/",
19
+ packages=find_packages(),
20
+ python_requires=">=3.6.0",
21
+ install_requires=[
22
+ 'transformers>=4.6.0,<5.0.0',
23
+ 'tqdm',
24
+ 'torch>=1.6.0',
25
+ 'torchvision',
26
+ 'numpy',
27
+ 'scikit-learn',
28
+ 'scipy',
29
+ 'nltk',
30
+ 'sentencepiece',
31
+ 'huggingface-hub>=0.4.0'
32
+ ],
33
+ classifiers=[
34
+ "Development Status :: 5 - Production/Stable",
35
+ "Intended Audience :: Science/Research",
36
+ "License :: OSI Approved :: Apache Software License",
37
+ "Programming Language :: Python :: 3.6",
38
+ "Topic :: Scientific/Engineering :: Artificial Intelligence"
39
+ ],
40
+ keywords="Transformer Networks BERT XLNet sentence embedding PyTorch NLP deep learning"
41
+ )
sentence-transformers/train.py ADDED
@@ -0,0 +1,195 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+
3
+ import os
4
+ import time
5
+ import sys
6
+ import torch
7
+ import logging
8
+ import json
9
+ import numpy as np
10
+ import random
11
+ import pickle
12
+
13
+ import torch.distributed as dist
14
+ from torch.utils.data import DataLoader, RandomSampler
15
+
16
+ from src.options import Options
17
+ from src import data, beir_utils, slurm, dist_utils, utils
18
+ from src import moco, inbatch
19
+
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ def train(opt, model, optimizer, scheduler, step):
25
+
26
+ run_stats = utils.WeightedAvgStats()
27
+
28
+ tb_logger = utils.init_tb_logger(opt.output_dir)
29
+
30
+ logger.info("Data loading")
31
+ if isinstance(model, torch.nn.parallel.DistributedDataParallel):
32
+ tokenizer = model.module.tokenizer
33
+ else:
34
+ tokenizer = model.tokenizer
35
+ collator = data.Collator(opt=opt)
36
+ train_dataset = data.load_data(opt, tokenizer)
37
+ logger.warning(f"Data loading finished for rank {dist_utils.get_rank()}")
38
+
39
+ train_sampler = RandomSampler(train_dataset)
40
+ train_dataloader = DataLoader(
41
+ train_dataset,
42
+ sampler=train_sampler,
43
+ batch_size=opt.per_gpu_batch_size,
44
+ drop_last=True,
45
+ num_workers=opt.num_workers,
46
+ collate_fn=collator,
47
+ )
48
+
49
+ epoch = 1
50
+
51
+ model.train()
52
+ while step < opt.total_steps:
53
+ train_dataset.generate_offset()
54
+
55
+ logger.info(f"Start epoch {epoch}")
56
+ for i, batch in enumerate(train_dataloader):
57
+ step += 1
58
+
59
+ batch = {key: value.cuda() if isinstance(value, torch.Tensor) else value for key, value in batch.items()}
60
+ train_loss, iter_stats = model(**batch, stats_prefix="train")
61
+
62
+ train_loss.backward()
63
+ optimizer.step()
64
+
65
+ scheduler.step()
66
+ model.zero_grad()
67
+
68
+ run_stats.update(iter_stats)
69
+
70
+ if step % opt.log_freq == 0:
71
+ log = f"{step} / {opt.total_steps}"
72
+ for k, v in sorted(run_stats.average_stats.items()):
73
+ log += f" | {k}: {v:.3f}"
74
+ if tb_logger:
75
+ tb_logger.add_scalar(k, v, step)
76
+ log += f" | lr: {scheduler.get_last_lr()[0]:0.3g}"
77
+ log += f" | Memory: {torch.cuda.max_memory_allocated() / 2**30:.1f} GiB"
78
+
79
+ logger.info(log)
80
+ run_stats.reset()
81
+
82
+ if step % opt.eval_freq == 0:
83
+ if isinstance(model, torch.nn.parallel.DistributedDataParallel):
84
+ encoder = model.module.get_encoder()
85
+ else:
86
+ encoder = model.get_encoder()
87
+ eval_model(
88
+ opt, query_encoder=encoder, doc_encoder=encoder, tokenizer=tokenizer, tb_logger=tb_logger, step=step
89
+ )
90
+
91
+ if dist_utils.is_main():
92
+ utils.save(model, optimizer, scheduler, step, opt, opt.output_dir, "lastlog")
93
+
94
+ model.train()
95
+
96
+ if dist_utils.is_main() and step % opt.save_freq == 0:
97
+ utils.save(model, optimizer, scheduler, step, opt, opt.output_dir, f"step-{step}")
98
+
99
+ if step >= opt.total_steps:
100
+ break
101
+ epoch += 1
102
+
103
+
104
+ def eval_model(opt, query_encoder, doc_encoder, tokenizer, tb_logger, step):
105
+ for datasetname in opt.eval_datasets:
106
+ metrics = beir_utils.evaluate_model(
107
+ query_encoder,
108
+ doc_encoder,
109
+ tokenizer,
110
+ dataset=datasetname,
111
+ batch_size=opt.per_gpu_eval_batch_size,
112
+ norm_doc=opt.norm_doc,
113
+ norm_query=opt.norm_query,
114
+ beir_dir=opt.eval_datasets_dir,
115
+ score_function=opt.score_function,
116
+ lower_case=opt.lower_case,
117
+ normalize_text=opt.eval_normalize_text,
118
+ )
119
+
120
+ message = []
121
+ if dist_utils.is_main():
122
+ for metric in ["NDCG@10", "Recall@10", "Recall@100"]:
123
+ message.append(f"{datasetname}/{metric}: {metrics[metric]:.2f}")
124
+ if tb_logger is not None:
125
+ tb_logger.add_scalar(f"{datasetname}/{metric}", metrics[metric], step)
126
+ logger.info(" | ".join(message))
127
+
128
+
129
+ if __name__ == "__main__":
130
+ logger.info("Start")
131
+
132
+ options = Options()
133
+ opt = options.parse()
134
+
135
+ torch.manual_seed(opt.seed)
136
+ slurm.init_distributed_mode(opt)
137
+ slurm.init_signal_handler()
138
+
139
+ directory_exists = os.path.isdir(opt.output_dir)
140
+ if dist.is_initialized():
141
+ dist.barrier()
142
+ os.makedirs(opt.output_dir, exist_ok=True)
143
+ if not directory_exists and dist_utils.is_main():
144
+ options.print_options(opt)
145
+ if dist.is_initialized():
146
+ dist.barrier()
147
+ utils.init_logger(opt)
148
+
149
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
150
+
151
+ if opt.contrastive_mode == "moco":
152
+ model_class = moco.MoCo
153
+ elif opt.contrastive_mode == "inbatch":
154
+ model_class = inbatch.InBatch
155
+ else:
156
+ raise ValueError(f"contrastive mode: {opt.contrastive_mode} not recognised")
157
+
158
+ if not directory_exists and opt.model_path == "none":
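+ # three start-up cases below: fresh run (no checkpoint and no model_path),
+ # resume from an existing output_dir checkpoint, or warm-start from --model_path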
159
+ model = model_class(opt)
160
+ model = model.cuda()
161
+ optimizer, scheduler = utils.set_optim(opt, model)
162
+ step = 0
163
+ elif directory_exists:
164
+ model_path = os.path.join(opt.output_dir, "checkpoint", "latest")
165
+ model, optimizer, scheduler, opt_checkpoint, step = utils.load(
166
+ model_class,
167
+ model_path,
168
+ opt,
169
+ reset_params=False,
170
+ )
171
+ logger.info(f"Model loaded from {opt.output_dir}")
172
+ else:
173
+ model, optimizer, scheduler, opt_checkpoint, step = utils.load(
174
+ model_class,
175
+ opt.model_path,
176
+ opt,
177
+ reset_params=False if opt.continue_training else True,
178
+ )
179
+ if not opt.continue_training:
180
+ step = 0
181
+ logger.info(f"Model loaded from {opt.model_path}")
182
+
183
+ logger.info(utils.get_parameters(model))
184
+
185
+ if dist.is_initialized():
186
+ model = torch.nn.parallel.DistributedDataParallel(
187
+ model,
188
+ device_ids=[opt.local_rank],
189
+ output_device=opt.local_rank,
190
+ find_unused_parameters=False,
191
+ )
192
+ dist.barrier()
193
+
194
+ logger.info("Start training")
195
+ train(opt, model, optimizer, scheduler, step)