Upload hugging-face-model.ipynb
Browse files- hugging-face-model.ipynb +150 -0
hugging-face-model.ipynb
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "0bac3852",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [
|
9 |
+
{
|
10 |
+
"name": "stderr",
|
11 |
+
"output_type": "stream",
|
12 |
+
"text": [
|
13 |
+
"2023-12-06 02:00:53.739133: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
|
14 |
+
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
|
15 |
+
]
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"name": "stdout",
|
19 |
+
"output_type": "stream",
|
20 |
+
"text": [
|
21 |
+
"[{'label': 'LABEL_1', 'score': 0.7463672161102295}]\n"
|
22 |
+
]
|
23 |
+
}
|
24 |
+
],
|
25 |
+
"source": [
|
26 |
+
"!pip install -q transformers torch\n",
|
27 |
+
"from transformers import pipeline\n",
|
28 |
+
"\n",
|
29 |
+
"model_name = \"XerOpred/twitter-climate-sentiment-model\"\n",
|
30 |
+
"classifier = pipeline('sentiment-analysis', model=model_name)\n",
|
31 |
+
"\n",
|
32 |
+
"text = \"some power and authority u can not spell, let alone define and wield, thas just more evidence of ur arrogant IGNORANCE same as u apply to ur climate change denial THEORY as if u know shit u do not, TOLD U DareDevil does not mean what the hell u think it does, HELL?? been there\"\n",
|
33 |
+
"result = classifier(text)\n",
|
34 |
+
"print(result)"
|
35 |
+
]
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"cell_type": "code",
|
39 |
+
"execution_count": 46,
|
40 |
+
"id": "34f09a3e",
|
41 |
+
"metadata": {},
|
42 |
+
"outputs": [],
|
43 |
+
"source": [
|
44 |
+
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pandas as pd

# Tweets to score; later cells read the 'Content' and 'Username' columns —
# TODO confirm the CSV actually carries that schema.
df = pd.read_csv('data/combined-usa.csv' )

# Fine-tuned sentiment checkpoint from the Hugging Face Hub; loaded once here
# and reused by sentiment_analysis/process_in_batches below.
model_name = "XerOpred/twitter-climate-sentiment-model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
53 |
+
"\n",
|
54 |
+
def sentiment_analysis(model, tokenizer, text):
    """Classify a single text with a sequence-classification model.

    Args:
        model: a transformers AutoModelForSequenceClassification instance.
        tokenizer: the matching transformers tokenizer.
        text: the input string to score.

    Returns:
        Tuple of (predicted_label, confidence, sentiment_score, logits) where
        sentiment_score is logit[1] - logit[0] (assumes a two-label head —
        TODO confirm for this checkpoint) and logits is a plain Python list.

    Raises:
        ValueError: if ``text`` is not a string.
    """
    if not isinstance(text, str):
        raise ValueError("Input text must be a string.")

    # Tokenize (truncated to the model's 512-token window) and run inference
    # without building an autograd graph.
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits

    # Softmax over the label axis gives per-class probabilities.
    probs = logits.softmax(dim=-1)
    best_idx = int(probs.argmax(dim=1).item())
    best_prob = probs[0, best_idx].item()
    best_label = model.config.id2label[best_idx]

    # Signed score: second-class logit minus first-class logit.
    score = (logits[0, 1] - logits[0, 0]).item()

    return best_label, best_prob, score, logits[0].tolist()
|
77 |
+
"\n",
|
78 |
+
def process_in_batches(df, model, tokenizer, batch_size=1000):
    """Score df['Content'] chunk by chunk through sentiment_analysis.

    Args:
        df: DataFrame with a 'Content' column of texts.
        model: transformers sequence-classification model.
        tokenizer: matching tokenizer.
        batch_size: number of rows per chunk (chunking bounds memory between
            concatenations; each row is still scored individually).

    Returns:
        A DataFrame with four unnamed columns (label, confidence, score,
        logits) aligned to ``df.index``.
    """
    chunk_frames = []
    for start in range(0, len(df), batch_size):
        chunk = df[start:start + batch_size]
        # pd.Series expands the 4-tuple from sentiment_analysis into columns;
        # str(...) guards against NaN / non-string cells.
        scored = chunk['Content'].apply(
            lambda cell: pd.Series(sentiment_analysis(model, tokenizer, str(cell)))
        )
        scored.index = chunk.index
        chunk_frames.append(scored)

    return pd.concat(chunk_frames)
|
90 |
+
"\n",
|
91 |
+
# Score every row: sentiment_analysis yields (label, confidence, score, logits),
# which process_in_batches expands into the four columns assigned here.
df[['Label', 'Confidence', 'SentimentScore', 'Logits']] = process_in_batches(df, model, tokenizer, batch_size=1000)

# Persist the scored dataset (deduplicated in a later cell).
df.to_csv('data/distilbert-sentiment-usa-FINAL.csv', index=False)

# Ad-hoc single-text sanity check, left disabled:
# sample_text = "AnnCoulter Global Warming? Climate Change https://t.co/TYleYPslqu Looks like global warming's the trend Now it's climate change, they changed it again These scientists, they get grants from the gov Theyll say anything, or lose that money they love https://t.co/odBcgDMIfp"
# predicted_label, confidence, sentiment_score, logits = sentiment_analysis(model, tokenizer, sample_text)

# print(f"Label: {predicted_label}")
# print(f"Confidence: {confidence}")
# print(f"Sentiment score for the text: {sentiment_score}")
# print(f"Logits: {logits}")
|
104 |
+
]
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"cell_type": "code",
|
108 |
+
"execution_count": 54,
|
109 |
+
"id": "4703a4cf",
|
110 |
+
"metadata": {},
|
111 |
+
"outputs": [],
|
112 |
+
"source": [
|
113 |
+
# Deduplicate the scored tweets. The saved CSV is re-read rather than reusing
# the in-memory df — presumably so lineterminator='\n' can repair rows split by
# stray '\r' characters inside tweet text (NOTE(review): confirm intent).
duplicates_df = pd.read_csv('data/distilbert-sentiment-usa-FINAL.csv', lineterminator='\n', low_memory=False)

# Keep the first occurrence of each (Username, Content) pair and write the
# final deduplicated dataset.
duplicates_df = duplicates_df.drop_duplicates(subset=['Username', 'Content'], keep='first')
duplicates_df.to_csv('data/distilbert-usa.csv', index=False)
|
118 |
+
]
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"cell_type": "code",
|
122 |
+
"execution_count": null,
|
123 |
+
"id": "408484cf",
|
124 |
+
"metadata": {},
|
125 |
+
"outputs": [],
|
126 |
+
"source": []
|
127 |
+
}
|
128 |
+
],
|
129 |
+
"metadata": {
|
130 |
+
"kernelspec": {
|
131 |
+
"display_name": "Python 3 (ipykernel)",
|
132 |
+
"language": "python",
|
133 |
+
"name": "python3"
|
134 |
+
},
|
135 |
+
"language_info": {
|
136 |
+
"codemirror_mode": {
|
137 |
+
"name": "ipython",
|
138 |
+
"version": 3
|
139 |
+
},
|
140 |
+
"file_extension": ".py",
|
141 |
+
"mimetype": "text/x-python",
|
142 |
+
"name": "python",
|
143 |
+
"nbconvert_exporter": "python",
|
144 |
+
"pygments_lexer": "ipython3",
|
145 |
+
"version": "3.11.4"
|
146 |
+
}
|
147 |
+
},
|
148 |
+
"nbformat": 4,
|
149 |
+
"nbformat_minor": 5
|
150 |
+
}
|