{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from transformers import GPT2Tokenizer, GPT2LMHeadModel\n",
"from sklearn.metrics import accuracy_score, recall_score\n",
"import numpy as np\n",
"from datasets import load_dataset\n",
"from PIL import Image, ImageEnhance\n",
"import os\n",
"import cv2\n",
"from sklearn.preprocessing import LabelEncoder\n",
"import json\n",
"import csv\n",
"import re\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def prepare_dataset(ocr_dir, csv_dir, output_file):\n",
" with open(output_file, 'w', encoding='utf-8') as jsonl_file:\n",
" for filename in os.listdir(ocr_dir):\n",
" if filename.endswith('.txt'):\n",
" ocr_path = os.path.join(ocr_dir, filename)\n",
" csv_path = os.path.join(csv_dir, filename)#.replace('.txt', '.csv'))\n",
" print(csv_path)\n",
" # if not os.path.exists(csv_path):\n",
" # print(f\"Warning: Corresponding CSV file not found for {ocr_path}\")\n",
" # continue\n",
" \n",
" with open(ocr_path, 'r', encoding='utf-8') as ocr_file:\n",
" ocr_text = ocr_file.read()\n",
" \n",
" with open(csv_path, 'r', encoding='utf-8') as csv_file:\n",
" csv_text = csv_file.read()\n",
" \n",
" json_object = {\n",
" \"prompt\": ocr_text,\n",
" \"completion\": csv_text\n",
" }\n",
" jsonl_file.write(json.dumps(json_object) + '\\n')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage\n",
"ocr_dir = os.getcwd() + '/../data/processed/annotations'\n",
"csv_dir = os.getcwd() + '/../data/processed/hand_labeled_tables/hand_labeled_tables'\n",
"output_file = 'dataset.jsonl'\n",
"prepare_dataset(ocr_dir, csv_dir, output_file)"
]
},
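{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: confirm dataset.jsonl was written and peek at the first prompt.\n",
"# A minimal sketch assuming the prepare_dataset call above completed without errors.\n",
"with open(output_file, 'r', encoding='utf-8') as f:\n",
"    examples = [json.loads(line) for line in f]\n",
"print(f\"{len(examples)} prompt/completion pairs written to {output_file}\")\n",
"if examples:\n",
"    print(examples[0]['prompt'][:200])"
]
},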
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load pre-trained GPT model and tokenizer\n",
"model_name = 'gpt2'\n",
"tokenizer = GPT2Tokenizer.from_pretrained(model_name)\n",
"model = GPT2LMHeadModel.from_pretrained(model_name)\n",
"\n",
"# Ensure the model is in evaluation mode\n",
"model.eval()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def preprocess_text(text):\n",
" # Basic cleaning for OCR text\n",
" text = re.sub(r'\\s+', ' ', text) # Remove extra whitespace\n",
" text = re.sub(r'[^a-zA-Z0-9\\s,.:()%+-]', '', text) # Remove most special characters, but keep some relevant ones\n",
" return text.strip()\n",
"\n",
"def calculate_loss(model, tokenizer, prompt, true_completion):\n",
" # Combine prompt and completion for full context\n",
" full_text = f\"{prompt} {true_completion}\"\n",
" inputs = tokenizer.encode(full_text, return_tensors='pt', truncation=True, max_length=512)\n",
" \n",
" # Calculate loss\n",
" with torch.no_grad():\n",
" outputs = model(inputs, labels=inputs)\n",
" \n",
" return outputs.loss.item()\n",
"\n",
"def evaluate_json_dataset(json_file, model, tokenizer):\n",
" with open(json_file, 'r') as f:\n",
" dataset = [json.loads(line) for line in f]\n",
" \n",
" losses = []\n",
" \n",
" for item in dataset:\n",
" prompt = preprocess_text(item['prompt'])\n",
" completion = preprocess_text(item['completion'])\n",
" \n",
" loss = calculate_loss(model, tokenizer, prompt, completion)\n",
" losses.append(loss)\n",
" \n",
" average_loss = np.mean(losses)\n",
" \n",
" return average_loss"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"average_loss = evaluate_json_dataset('dataset.jsonl', model, tokenizer)\n",
"print(f\"cross-entropy loss: {average_loss:.4f}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "term_project",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 2
}