In [68]:
from langchain_community.document_loaders import PyPDFLoader

url = 'https://arxiv.org/pdf/1907.11692v1'
loader = PyPDFLoader(url)
pages = loader.load()

len(pages)

13

In [78]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter

split_pages = loader.load_and_split()
text_splitter = RecursiveCharacterTextSplitter(
 chunk_size=300,
 chunk_overlap=50,
 length_function=len,
 is_separator_regex=False,
)
texts = text_splitter.create_documents([page.page_content for page in split_pages], metadatas=[page.metadata for page in split_pages])
print(texts[0])
print(texts[1])

load_dotenv()

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=os.environ["OPENAI_KEY"])
faiss_index = FAISS.from_documents(texts, embeddings)
retriever = faiss_index.as_retriever(search_type="similarity", search_kwargs={"k": 5})
retrieved_docs = retriever.invoke("RoBERTa surpasses BERT LARGE and XLNet LARGE in performance")

for doc in retrieved_docs:
 print(str(doc.metadata) + ":", doc.page_content[:300])

page_content='arXiv:1907.11692v1 [cs.CL] 26 Jul 2019RoBERTa: A Robustly Optimized BERT Pretraining Approach\nYinhan Liu∗§Myle Ott∗§Naman Goyal∗§Jingfei Du∗§Mandar Joshi†\nDanqi Chen§Omer Levy§Mike Lewis§Luke Zettlemoyer†§Veselin Stoyanov§\n†Paul G. Allen School of Computer Science & Engineering,' metadata={'source': 'https://arxiv.org/pdf/1907.11692v1', 'page': 0}
page_content='University of Washington, Seattle, WA\n{mandar90,lsz }@cs.washington.edu\n§Facebook AI\n{yinhanliu,myleott,naman,jingfeidu,\ndanqi,omerlevy,mikelewis,lsz,ves }@fb.com\nAbstract\nLanguage model pretraining has led to sig-\nnificant performance gains but careful com-' metadata={'source': 'https://arxiv.org/pdf/1907.11692v1', 'page': 0}
{'source': 'https://arxiv.org/pdf/1907.11692v1', 'page': 8}: Single models on test (as of July 25, 2019)
BERT LARGE 72.0 76.6 70.1
XLNet LARGE 81.7 85.4 80.2
RoBERTa 83.2 86.5 81.3
Table 7: Results on the RACE test set. BERT LARGE and
XLNet LARGE results are from Yang et al. (2019 )

In [31]:
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain

template = """
You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.

Question: {question}
Context: {context}
Answer:
"""

def format_docs(docs):
 return "\n\n".join(doc.page_content for doc in docs)

prompt = PromptTemplate(template=template, input_variables=["context", "question"])
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", openai_api_key=os.environ["OPENAI_KEY"])

rag_chain = (
 {"context": retriever | format_docs, "question": RunnablePassthrough()}
 | prompt
 | llm
)

res = rag_chain.invoke("What is RoBERTa?")
res

AIMessage(content='RoBERTa is an improved pretraining approach based on BERT. It involves training the model longer, with bigger batches, over more data, removing the next sentence prediction objective, training on longer sequences, and dynamically changing the masking pattern applied to the training data. RoBERTa achieves state-of-the-art results on various benchmarks like GLUE, RACE, and SQuAD by optimizing key hyperparameters and training data size. It surpasses the performance of models published after BERT.', response_metadata={'token_usage': {'completion_tokens': 98, 'prompt_tokens': 4632, 'total_tokens': 4730}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-0674c381-70d3-4a63-8f45-120f491ec357-0', usage_metadata={'input_tokens': 4632, 'output_tokens': 98, 'total_tokens': 4730})

In [49]:
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain_text_splitters import CharacterTextSplitter
from langchain.chains.llm import LLMChain

llm = ChatOpenAI(model="gpt-3.5-turbo-0125", openai_api_key=os.environ["OPENAI_KEY"])

# Map
map_template = """You are an expert in technical papers and journals.
You're tasked with summarizing the main points in the following text.
The following is the text you need to summarize:
{docs}
Based on this text, provide a summary of the main points.
Helpful Answer:
"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Reduce
reduce_template = """The following is set of summaries of a technical paper:
{docs}

Take these and distill it into a final, consolidated summary of the main points. 

RULES:
- The summary should be as if you are presenting the main points in a seminar.
- Organize the points in powerpoint slide format.
- Use markdown to format the text.

Helpful Answer:
"""
reduce_prompt = PromptTemplate.from_template(reduce_template)

# Run chain
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
 llm_chain=reduce_chain, document_variable_name="docs",
 verbose=True
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
 # This is final chain that is called.
 combine_documents_chain=combine_documents_chain,
 # If documents exceed context for `StuffDocumentsChain`
 collapse_documents_chain=combine_documents_chain,
 # The maximum number of tokens to group documents into.
 token_max=4000,
 verbose=True
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
 # Map chain
 llm_chain=map_chain,
 # Reduce chain
 reduce_documents_chain=reduce_documents_chain,
 # The variable name in the llm_chain to put the documents in
 document_variable_name="docs",
 # Return the results of the map steps in the output
 return_intermediate_steps=False,
 verbose=True
)

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
 chunk_size=1000, chunk_overlap=100
)
split_docs = text_splitter.split_documents(pages)
print(len(split_docs))

# result = map_reduce_chain.invoke(split_docs)

# print(result["output_text"])


13


In [48]:
from langchain_experimental.text_splitter import SemanticChunker

text_splitter = SemanticChunker(embeddings, breakpoint_threshold_type="gradient")
docs = text_splitter.create_documents([' '.join([page.page_content for page in pages])])
print(len(docs))



29


In [60]:
from langchain.globals import set_debug

set_debug(True)

llm = ChatOpenAI(model="gpt-3.5-turbo-0125", openai_api_key=os.environ["OPENAI_KEY"])

# Map
map_template = """You are an expert in technical papers and journals.
You're tasked with summarizing the main points in the following text.
The following is the text you need to summarize:
{doc}
Based on this text, provide a summary of the main points.

RULES:
- Organize the points in markdown format.

Helpful Answer:
"""

map_prompt = PromptTemplate.from_template(map_template)
map_chain = (
 {"doc": RunnablePassthrough()}
 | map_prompt
 | llm
)
map_res = await map_chain.abatch([{'doc': doc.page_content} for doc in docs], config={"max_concurrency": 40})
map_res

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
 "doc": "arXiv:1907.11692v1 [cs.CL] 26 Jul 2019RoBERTa: A Robustly Optimized BERT Pretraining Approach\nYinhan Liu∗§Myle Ott∗§Naman Goyal∗§Jingfei Du∗§Mandar Joshi†\nDanqi Chen§Omer Levy§Mike Lewis§Luke Zettlemoyer†§Veselin Stoyanov§\n†Paul G."
}[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
 "doc": "Allen School of Computer Science & Engineering,\nUniversity of Washington, Seattle, WA\n{mandar90,lsz }@cs.washington.edu\n§Facebook AI\n{yinhanliu,myleott,naman,jingfeidu,\ndanqi,omerlevy,mikelewis,lsz,ves }@fb.com\nAbstract\nLanguage model pretraining has led to sig-\nnificant performance gains but careful com-\nparison between different approaches is chal-\nlenging. Training is computationally expen-\nsive, often done on private datasets of different\nsizes, and, as we will show, hyperparameter\nchoices have significant impact on the final re

[AIMessage(content='### Summary:\n- The document discusses a new pretraining approach called RoBERTa, which is an optimized version of BERT.\n- The authors of the paper are Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov.', response_metadata={'token_usage': {'completion_tokens': 81, 'prompt_tokens': 182, 'total_tokens': 263}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-b3735f08-fdd8-4aa6-8b9e-7c3a6a224ef6-0', usage_metadata={'input_tokens': 182, 'output_tokens': 81, 'total_tokens': 263}),
 AIMessage(content="**Summary:**\n\n- Language model pretraining, such as BERT, has shown significant performance gains in natural language processing tasks.\n- A replication study of BERT pretraining was conducted to evaluate the impact of hyperparameters and training data size.\n- The study found that BERT was significantly undertrained and pr

In [90]:
def concat_summaries(docs):
 print(docs)
 return "\n\n".join(doc.content for doc in docs['docs'])

# Reduce
reduce_template = """The following is set of summaries of a technical paper:
{docs}

Take these and distill it into a final, consolidated summary of the main points. 

RULES:
- The summary should be as if you are presenting the paper in a seminar.
- The outline should include common sections of a technical seminar.
- Organize the points in powerpoint slide format.
- Use markdown to format the text.
- Each point may be technical.
- You may have as many points as you need.

Each slide should follow the following format:
### Slide 2: Slide title
- point 1
- point 2

Helpful Answer:
"""

reduce_prompt = PromptTemplate.from_template(reduce_template)

# Run chain
reduce_chain = (
 {"docs": RunnablePassthrough() | concat_summaries}
 | reduce_prompt
 | llm
)

reduce_res = reduce_chain.invoke({"docs": map_res})
print(reduce_res.content)

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel > chain:RunnableSequence] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel > chain:RunnableSequence > chain:RunnablePassthrough] Entering Chain run with input:
[0m[inputs]
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > chain:RunnableParallel > chain:RunnableSequence > chain:RunnablePassthrough] [0ms] Exiting Chain run with output:
[0m[outputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel > chain:RunnableSequence > chain:concat_summaries] Entering Chain run with input:
[0m[inputs]
{'docs': [AIMessage(content='### Summary:\n- The document discuss

In [65]:
beamer_template = """
You are a technical writer for a seminar.
You have been tasked with creating a presentation slide for a seminar.
The following is the content you need to consider:
{content}

Based on this content, create a slide for a presentation.

RULES:
- Use Beamer LaTeX format.
- Each slide should have a title and bullet points.
"""

beamer_prompt = PromptTemplate(template=beamer_template)
beamer_chain = (
 {"content": RunnablePassthrough()}
 | beamer_prompt
 | llm
)

beamer_res = beamer_chain.invoke({"content": reduce_res.content})
print(beamer_res.content)

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
 "content": "## Main Points Summary:\n\n### Slide 1: Introduction\n- **Authors**: Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov\n- Discusses RoBERTa, an optimized version of BERT, for natural language processing tasks\n\n### Slide 2: RoBERTa Improvements\n- Longer training with bigger batches, no next sentence prediction, longer sequences, dynamic masking\n- Used CC-NEWS dataset for better control, achieved state-of-the-art results\n\n### Slide 3: BERT Architecture & Training\n- Transformer architecture, self-attention heads, hidden dimensions\n- Pretraining objectives: masked language modeling, next sentence prediction\n- Optimized with Adam on BOOK CORPUS and English WIKIPEDIA\n\n### Slide 4: RoBERTa Pretraining Details\n- Used dynamic masking, full sentences, large mini-batches, larger byte-level BPE\n- 

In [92]:
import re


def markdown_to_json(markdown):
 regex_pattern = r"### Slide (\d+): (.+)\n((- .+\n)+)"
 matches = re.findall(regex_pattern, markdown)
 slides = []
 for match in matches:
 slide = {
 "slide_number": int(match[0]),
 "slide_title": match[1],
 "points": [{'content': point.strip(), 'sources': retriever.invoke(point.strip())} for point in match[2].split("\n") if point.strip()]
 }
 slides.append(slide)
 return slides

slides = markdown_to_json(reduce_res.content)
slides


[{'slide_number': 1,
 'slide_title': 'Introduction',
 'points': [{'content': '- Discusses advancements in language model pretraining, particularly the RoBERTa approach.',
 'sources': [Document(page_content='arXiv:1907.11692v1 [cs.CL] 26 Jul 2019RoBERTa: A Robustly Optimized BERT Pretraining Approach\nYinhan Liu∗§Myle Ott∗§Naman Goyal∗§Jingfei Du∗§Mandar Joshi†\nDanqi Chen§Omer Levy§Mike Lewis§Luke Zettlemoyer†§Veselin Stoyanov§\n†Paul G. Allen School of Computer Science & Engineering,', metadata={'source': 'https://arxiv.org/pdf/1907.11692v1', 'page': 0}),
 Document(page_content='task development sets. Crucially, RoBERTa uses\nthe same masked language modeling pretrain-\ning objective and architecture as BERT LARGE , yet\nconsistently outperforms both BERT LARGE and\nXLNet LARGE . This raises questions about the rel-\native importance of model architecture and pre-', metadata={'source': 'https://arxiv.org/pdf/1907.11692v1', 'page': 7}),
 Document(page_content='128 tokens and, if needed