# Start with downloading & converting PDF transcripts to txt files for the models to read

In [1]:
!pip install pdfminer.six

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from bs4 import BeautifulSoup

In [3]:
import requests
import io
import os
from pdfminer.high_level import extract_text

In [4]:
# URL of the website with the PDF files
url = 'https://readthatpodcast.com/'

In [5]:
# Create a new folder on desktop to save PDF files
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
folder = os.path.join(desktop_path, "hubermantranscript1")
if not os.path.exists(folder):
    os.makedirs(folder)

In [6]:
# Download all the PDF files from the website
response = requests.get('https://readthatpodcast.com')
soup = BeautifulSoup(response.content, 'html.parser')
for link in soup.find_all('a', href=True):
    href = link['href']
    if 'pdf' in href:
        episode_name = link.text.strip()
        filename = f"{episode_name}.pdf"
        file_path = os.path.join(folder, filename)
        with open(file_path, 'wb') as f:
          f.write(requests.get(url + '/' + href).content)

In [19]:
# Convert the PDF files to text
for filename in os.listdir(folder):
    if filename.endswith('.pdf'):
        file_path = os.path.join(folder, filename)
        with open(file_path, 'rb') as f:
            text = extract_text(io.BytesIO(f.read()))
        text_path = os.path.join(folder, filename.replace('.pdf', '.txt'))
        with open(text_path, 'w') as f:
            f.write(text)

PDFSyntaxError: ignored

In [20]:
os.listdir(folder)

['73 Dr Wendy Suzuki Boost Attention & Memory with ScienceBased Tools\r\n          Huberman Lab Podcast 73.pdf',
 '66 Using Deliberate Cold Exposure for Health and Performance Huberman\r\n          Lab Podcast 66.pdf',
 '79 Jeff Cavaliere Optimize Your Exercise Program with ScienceBased\r\n          Tools Huberman Lab Podcast 79.txt',
 '41 Effects of Fasting & Time Restricted Eating on Fat Loss & Health\r\n          Huberman Lab Podcast 41.pdf',
 '78 The Science & Treatment of Obsessive Compulsive Disorder (OCD)\r\n          Huberman Lab Podcast 78.pdf',
 '40 Dr Craig Heller Using Temperature for Performance Brain & Body\r\n          Health Huberman Lab Podcast 40.txt',
 '112 Dr Andy Galpin How to Build Physical Endurance & Lose Fat\r\n          Huberman Lab Guest Series.pdf',
 '118 Dr Andy Galpin Optimal Nutrition & Supplementation for Fitness\r\n          Huberman Lab Guest Series.txt',
 '11 How Foods and Nutrients Control Our Moods Huberman Lab Podcast\r\n          11.txt',
 '32 How

In [21]:
os.getcwd()

'/content'

# cGPT tips om hur vi ska göra

To fine-tune a GPT-2 model on podcast transcripts, your proposed approach is mostly correct. Here are the steps you should follow:

Preprocess the PDF transcripts: You need to extract the text from the PDF files and remove any irrelevant information such as headers and footers. You can use a library like PyPDF2 to extract the text from the PDF files.

Convert the transcripts to a TextDataset: After extracting the text from the PDF files, you can save it in a text file and then use the TextDataset class from the Transformers library to create a dataset.

Split the dataset into train and test sets: You can use the train_test_split function from the scikit-learn library to split the dataset into a training set and a validation set.

Tokenize the dataset: You need to tokenize the text data to convert it into numerical data that the model can understand. You can use the Tokenizer class from the Transformers library to tokenize the text data.

Fine-tune the GPT-2 model: You can use the Trainer class from the Transformers library to fine-tune the GPT-2 model on the podcast transcripts.

# Scraping the transcripts for PDF files

In [1]:
import requests

url = "https://readthatpodcast.com/"
response = requests.get(url)
content = response.content

In [2]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(content, "html.parser")
pdf_links = []
for link in soup.find_all("a"):
  href = link.get("href")
  if href is not None and href.endswith(".pdf"):
    pdf_links.append(href)

In [3]:
import os

base_url = "https://readthatpodcast.com/"
for i, pdf_link in enumerate(pdf_links):
  if not pdf_link.startswith("http"):
    pdf_link = base_url + pdf_link
  response = requests.get(pdf_link)
  filename = f"transcription_{i}.pdf"
  with open(filename, "wb") as f:
    f.write(response.content)
  print(f"Downloaded {filename}")

Downloaded transcription_0.pdf
Downloaded transcription_1.pdf
Downloaded transcription_2.pdf
Downloaded transcription_3.pdf
Downloaded transcription_4.pdf
Downloaded transcription_5.pdf
Downloaded transcription_6.pdf
Downloaded transcription_7.pdf
Downloaded transcription_8.pdf
Downloaded transcription_9.pdf
Downloaded transcription_10.pdf
Downloaded transcription_11.pdf
Downloaded transcription_12.pdf
Downloaded transcription_13.pdf
Downloaded transcription_14.pdf
Downloaded transcription_15.pdf
Downloaded transcription_16.pdf
Downloaded transcription_17.pdf
Downloaded transcription_18.pdf
Downloaded transcription_19.pdf
Downloaded transcription_20.pdf
Downloaded transcription_21.pdf
Downloaded transcription_22.pdf
Downloaded transcription_23.pdf
Downloaded transcription_24.pdf
Downloaded transcription_25.pdf
Downloaded transcription_26.pdf
Downloaded transcription_27.pdf
Downloaded transcription_28.pdf
Downloaded transcription_29.pdf
Downloaded transcription_30.pdf
Downloaded transcr

# Making all the PDF files into one .txt file 

In [4]:
!pip install PyPDF2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 KB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [5]:
import PyPDF2
import re

text = ''
for i in range(59):
    pdf_file = open(f'transcription_{i}.pdf', 'rb')
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    for j in range(3, len(pdf_reader.pages)):
        page = pdf_reader.pages[j]
        text += page.extract_text()

text = re.sub(r'\n', ' ', text)

with open('transcripts.txt', 'w') as f:
    f.write(text)


In [6]:
import torch
device = torch.device("cuda")

# Now we can try to follow the german gpt-2 finetuning notebook

In [21]:
# Since we have a .txt file maybe we can skip the TextDataset part of german notebook? he goes from json -> txt with that method

In [7]:
!pip install transformers==4.2.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.2.2
  Downloading transformers-4.2.2-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers==0.9.4
  Downloading tokenizers-0.9.4-cp39-cp39-manylinux2010_x86_64.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m67.6 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 KB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=77b1976a8

In [23]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Load the text data from file
with open('transcripts.txt', 'r') as f:
    text = f.read()

In [10]:
# Split the data into training and validation sets
train_text, val_text = train_test_split(text, test_size=0.2, random_state=42)

In [11]:
# Convert the lists to strings
train_text_str = '\n'.join(train_text)
val_text_str = '\n'.join(val_text)

# Save the training and validation sets to file
with open('train_text.txt', 'w') as f:
    f.write(train_text_str)

with open('val_text.txt', 'w') as f:
    f.write(val_text_str)

In [12]:
print("Train dataset length: "+str(len(train_text)))
print("Test dataset length: "+ str(len(val_text)))

Train dataset length: 7278223
Test dataset length: 1819556


In [34]:
# Now we have our data, we need to try to tokenize it before we can train the model on it

In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")

model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)

train_path = 'train_text.txt'
test_path = 'val_text.txt'

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [14]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=32)
     
    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=32)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (14559065 > 1024). Running this sequence through the model will result in indexing errors


In [15]:
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead

model = AutoModelWithLMHead.from_pretrained("gpt2")


training_args = TrainingArguments(
    output_dir="./gpt2-hubpod", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=8000, # after # steps model is saved 
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    )


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)



In [16]:
trainer.train()

Step,Training Loss
500,1.8147
1000,1.5341
1500,1.5195
2000,1.517
2500,1.5182
3000,1.5167
3500,1.5158
4000,1.5169
4500,1.5178
5000,1.5178


KeyboardInterrupt: ignored