What is CollegeAuditScraper? This model is a fine-tune of Mistral AI's mistralai/Mistral-7B-Instruct-v0.2. It is trained to read college and university audits and extract key financial data from the text.
Use case: The primary use of this model is to construct a large database of key financial metrics in higher education.
############################################################
################## Example usage of model ##################
############################################################
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import torch
# Hugging Face Hub repository holding the fine-tuned checkpoint.
model_id = "PDScience/CollegeAuditScraper_v1.0"

# Loading options: automatic device placement, bfloat16 weights and the
# FlashAttention-2 attention implementation. trust_remote_code is not passed,
# so the library default applies.
load_kwargs = {
    "device_map": "auto",
    "torch_dtype": torch.bfloat16,
    "attn_implementation": "flash_attention_2",
}
model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Print the model architecture, its parameter count and the tokenizer.
for summary in (model, model.num_parameters(), tokenizer):
    print(summary)

# Generation settings: sampling disabled, at most 100 new tokens, mild
# repetition penalty. BOS/EOS ids are copied from the generation config that
# ships with the checkpoint; the EOS id is reused as the padding id.
base_generation = model.generation_config
generation_config = GenerationConfig(
    max_new_tokens=100,
    do_sample=False,
    repetition_penalty=1.1,
    bos_token_id=base_generation.bos_token_id,
    eos_token_id=base_generation.eos_token_id,
    pad_token_id=base_generation.eos_token_id,
)
###########################################################
########### Load audit data with pdfplumber ################
###########################################################
import pdfplumber

# Load your PDF
pdfFileName = "24983720221.pdf"  # put the name of the pdf file here

# Collect the extracted text of every page, in page order.
with pdfplumber.open(pdfFileName) as pdf:
    # NOTE(review): extract_text() may yield None for a page without a text
    # layer in some pdfplumber versions -- confirm. Falling back to "" keeps
    # the join below from raising TypeError on such pages.
    textList = [page.extract_text() or "" for page in pdf.pages]

# Pages are separated by a blank line in the combined document text.
articleText = "\n\n".join(textList)

# Keep first 30000 tokens: encode, truncate the id list, then decode back to
# text with special tokens dropped.
articleText = tokenizer.decode(
    tokenizer.encode(articleText)[:30000],
    skip_special_tokens=True,
)
#######################################################################
####################### Model prompt and response #####################
#######################################################################
# Create and format prompt: the instruction and the audit text are sent as a
# single user message and rendered with the tokenizer's chat template.
systemPrompt = "Extract the total unrestricted operating revenue for fiscal year 2022 from the following text: \n\n"
messages = [{"role": "user", "content": systemPrompt + "\n\n" + articleText}]
formattedPrompt = tokenizer.apply_chat_template(messages, tokenize=False)

# The templated string is tokenized with add_special_tokens=False so the
# tokenizer adds no further special tokens on top of the rendered template.
inputs = tokenizer(formattedPrompt, return_tensors="pt", add_special_tokens=False)
# Move inputs to the same device as model
inputs = inputs.to(model.device)

################# Generate model response #############################
output = model.generate(**inputs, generation_config=generation_config)

# Full transcript (prompt followed by the generated continuation).
decodedOutput = tokenizer.decode(output[0], skip_special_tokens=True)

# Isolate the response by skipping the prompt tokens instead of splitting the
# decoded text on "[INST]" / "[/INST]": those markers could also occur inside
# the audit text or the answer, which would make string splitting return the
# wrong span.
promptLength = inputs["input_ids"].shape[1]
response_fineTuned = tokenizer.decode(
    output[0][promptLength:],
    skip_special_tokens=True,
)
print("Model Response: \n ", response_fineTuned)
- Downloads last month
- 0