# https://huggingface.co/MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli
# from transformers import pipeline
# classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")
# sequence_to_classify = "Angela Merkel is a politician in Germany and leader of the CDU"
# candidate_labels = ["politics", "economy", "entertainment", "environment"]
# output = classifier(sequence_to_classify, candidate_labels, multi_label=False)
# print(output)
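# A possible variant (not in the original file): multi_label=True scores each
# candidate label independently instead of softmaxing across them, which suits
# texts that can belong to several labels at once.
# output = classifier(sequence_to_classify, candidate_labels, multi_label=True)
# print(output)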

# from transformers import pipeline

# generator = pipeline("text-generation", model="distilgpt2")
# output = generator("In this course, we will teach you how to")
# print(output)

# https://huggingface.co/bigscience/bloom-560m
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import transformers
# import torch

# model = "bigscience/bloom-560m"

# tokenizer = AutoTokenizer.from_pretrained(model)
# pipeline = transformers.pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     torch_dtype=torch.bfloat16,
#     trust_remote_code=True,
#     device_map="auto",
# )
# sequences = pipeline(
#    "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:",
#     max_length=200,
#     do_sample=True,
#     top_k=10,
#     num_return_sequences=1,
#     eos_token_id=tokenizer.eos_token_id,
# )
# for seq in sequences:
#     print(f"Result: {seq['generated_text']}")

# https://huggingface.co/bert-base-multilingual-cased
# from transformers import pipeline
# unmasker = pipeline('fill-mask', model='bert-base-multilingual-cased')
# output = unmasker("tu es [MASK] homme?")  # French: "you are [MASK] man?"
# print(output)


# named entity recognition
# from transformers import pipeline

# ner = pipeline("ner", grouped_entities=True)
# output = ner("My name is Sylvain and I work at Hugging Face in Brooklyn.")
# print(output)
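# Note: grouped_entities is deprecated in recent transformers releases in
# favor of the equivalent aggregation_strategy argument, e.g.:
# ner = pipeline("ner", aggregation_strategy="simple")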

# https://huggingface.co/facebook/bart-large-cnn
# from transformers import pipeline

# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# output = summarizer(
#     """
#     America has changed dramatically during recent years. Not only has the number of 
#     graduates in traditional engineering disciplines such as mechanical, civil, 
#     electrical, chemical, and aeronautical engineering declined, but in most of 
#     the premier American universities engineering curricula now concentrate on 
#     and encourage largely the study of engineering science. As a result, there 
#     are declining offerings in engineering subjects dealing with infrastructure, 
#     the environment, and related issues, and greater concentration on high 
#     technology subjects, largely supporting increasingly complex scientific 
#     developments. While the latter is important, it should not be at the expense 
#     of more traditional engineering.

#     Rapidly developing economies such as China and India, as well as other 
#     industrial countries in Europe and Asia, continue to encourage and advance 
#     the teaching of engineering. Both China and India, respectively, graduate 
#     six and eight times as many traditional engineers as does the United States. 
#     Other industrial countries at minimum maintain their output, while America 
#     suffers an increasingly serious decline in the number of engineering graduates 
#     and a lack of well-educated engineers.
# """
# )

# from transformers import pipeline

# translator = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")
# output = translator("屌")  # crude one-character Chinese slang, used as a minimal test input

# print(output)

# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# sequence = "Using a Transformer network is simple"
# tokens = tokenizer.tokenize(sequence)
# print(tokens)
# ids = tokenizer.convert_tokens_to_ids(tokens)
# print(ids)
# decoded_string = tokenizer.decode(ids)
# print(decoded_string)
# print("----------------------")

# sequence = "Using a Transform network are simple"
# tokens = tokenizer.tokenize(sequence)
# print(tokens)
# ids = tokenizer.convert_tokens_to_ids(tokens)
# print(ids)
# decoded_string = tokenizer.decode(ids)
# print(decoded_string)
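# Note: tokenize + convert_tokens_to_ids does not add the special [CLS]/[SEP]
# tokens, while calling tokenizer(sequence) directly does; the comparison
# further down in this file makes that visible.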

# import torch
# from transformers import AutoTokenizer, AutoModelForSequenceClassification

# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# sequence = "I’ve been waiting for a HuggingFace course my whole life."
# tokens = tokenizer.tokenize(sequence)
# print(tokens)
# sequence1_ids = tokenizer.convert_tokens_to_ids(tokens)
# print(sequence1_ids)

# sequence = "I hate this so much!"
# tokens = tokenizer.tokenize(sequence)
# print(tokens)
# sequence2_ids = tokenizer.convert_tokens_to_ids(tokens)
# print(sequence2_ids)

# sequence1_ids = [[200, 200, 200]]
# sequence2_ids = [[200, 200]]
# batched_ids = [
#     [1045, 1521, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
#     [1045, 5223, 2023, 2061, 2172, 999, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id],
# ]

# attention_mask = [
#     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
#     [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
# ]

# outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
# print(outputs.logits)
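# A minimal check of why the mask matters (sketch, not in the original file):
# running the same padded batch without attention_mask lets the model attend
# to the pad tokens, so the second row's logits stop matching what the
# unpadded sequence would produce on its own.
# outputs_no_mask = model(torch.tensor(batched_ids))
# print(outputs_no_mask.logits)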

# from transformers import AutoTokenizer

# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# sequence = "I've been waiting for a HuggingFace course my whole life."

# model_inputs = tokenizer(sequence)

# print(model_inputs)

# sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
# Will pad the sequences up to the maximum sequence length
# model_inputs = tokenizer(sequences, padding="longest")
# print(model_inputs)
# print("-------------------------")

# Will pad the sequences up to the specified max length
# model_inputs = tokenizer(sequences, padding="max_length", max_length=8)
# print(model_inputs)
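# A further variant (sketch): truncation caps inputs at the model's limit or
# at an explicit max_length.
# model_inputs = tokenizer(sequences, truncation=True, max_length=8)
# print(model_inputs)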

# from transformers import AutoTokenizer
# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# sequence = "I've been waiting for a HuggingFace course my whole life."

# model_inputs = tokenizer(sequence)
# print("model_inputs = tokenizer(sequence)")
# print(model_inputs)
# print(model_inputs["input_ids"])

# tokens = tokenizer.tokenize(sequence)
# print("tokens = tokenizer.tokenize(sequence)")
# print(tokens)
# ids = tokenizer.convert_tokens_to_ids(tokens)
# print(sequence)
# print(ids)

# import torch
# from transformers import AutoTokenizer, AutoModelForSequenceClassification

# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
# output = model(**tokens)
# print(output)
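# To read the logits as probabilities (sketch, not in the original file):
# predictions = torch.nn.functional.softmax(output.logits, dim=-1)
# print(predictions)  # label order follows model.config.id2label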

# The one live example in this file: push a short string through the bare
# GPT-2 transformer. AutoModel loads the model without its language-modeling
# head, so the forward pass returns hidden states rather than logits.
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModel.from_pretrained("gpt2")

encoded = tokenizer("Hey!", return_tensors="pt")
result = model(**encoded)
print(result)
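# What the bare model returns (added sketch): a model output whose
# last_hidden_state has shape (batch, sequence_length, hidden_size);
# for actual text generation, load AutoModelForCausalLM instead.
print(result.last_hidden_state.shape)  # e.g. torch.Size([1, 2, 768])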