FinBert Model Token Size

#11
by Hirindu - opened

How can I derive one score for each transcript using PyCharm?
Can you please correct my code:
import csv
import os
import re

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load the FinBERT-tone checkpoint: a BERT classifier fine-tuned for
# financial sentiment with 3 output classes (num_labels=3).
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# Hugging Face pipeline bundling the model and tokenizer; returns a list of
# dicts with 'label' and 'score' keys for each input text.
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

# Set the base path for the transcript files

# Root directory containing the transcript .txt files (walked recursively below).
basepath = 'D:/ANALYSIS/DATABASE/All'

# Directory for the output CSV; created up front so the open() below cannot
# fail with a missing-directory error.
output_directory = 'C:/Users/Desktop/'
os.makedirs(output_directory, exist_ok=True)

# Open the CSV file for writing and write the headers

# Score every transcript chunk with FinBERT and append one CSV row per chunk.
# Fixes vs. the original: the writer setup and the os.walk loop are now INSIDE
# the `with` block (the original dedented them, so rows were written to a
# closed file), and truncation uses the model's real 512-token limit (the
# original passed max_length=1300000000, which disabled truncation and would
# crash the model on long chunks).
with open(os.path.join(output_directory, 'FinBert_Sentiments.csv'),
          'w', encoding='utf-8', newline='') as content:
    writer = csv.writer(content)
    writer.writerow(("Firm Name", "Label", "Score"))

    # Walk the base path recursively and process every .txt transcript.
    for root, dirs, files in os.walk(basepath):
        for file in files:
            if not file.endswith('.txt'):
                continue

            # Read the whole transcript once, then close the handle promptly.
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                transcript = f.read().lower()

            # Firm name = whatever follows "q<digit> <year> " in the text
            # (e.g. "q3 2021 acme corp"); empty string when no match.
            match = re.search(r'q\d\s\d{4}\s(.+)', transcript)
            firm_name = match.group(1) if match else ''

            # Pre-split into 512-CHARACTER chunks. Characters != tokens, but
            # the explicit max_length=512 truncation below guarantees each
            # chunk fits BERT's 512-token sequence limit regardless.
            max_chunk_size = 512
            chunks = [transcript[i:i + max_chunk_size]
                      for i in range(0, len(transcript), max_chunk_size)]

            for chunk in chunks:
                # Truncate to the model's maximum sequence length, then
                # decode back to text so the pipeline scores exactly the
                # tokens that fit.
                tokens = tokenizer.encode_plus(chunk, max_length=512,
                                               truncation=True,
                                               return_tensors='pt')
                text = tokenizer.decode(tokens['input_ids'][0],
                                        skip_special_tokens=True)

                # Run sentiment analysis and record one row per chunk.
                results = nlp(text)
                label = results[0]['label']
                score = results[0]['score']
                writer.writerow([firm_name, label, score])

Sign up or log in to comment