# echo-chatbot / chapter1_2.py: worked examples from chapters 1-2 of the Hugging Face course (pipelines, tokenizers, models).
# https://huggingface.co/MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli
# from transformers import pipeline
# classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")
# sequence_to_classify = "Angela Merkel is a politician in Germany and leader of the CDU"
# candidate_labels = ["politics", "economy", "entertainment", "environment"]
# output = classifier(sequence_to_classify, candidate_labels, multi_label=False)
# print(output)
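# Expected output structure (a sketch; exact scores vary by model/library version):
# {'sequence': 'Angela Merkel is ...', 'labels': ['politics', ...], 'scores': [0.97, ...]}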
# from transformers import pipeline
# generator = pipeline("text-generation", model="distilgpt2")
# output = generator("In this course, we will teach you how to")
# print(output)
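# The same pipeline accepts standard generation controls; a small illustrative variant:
# output = generator("In this course, we will teach you how to", max_length=30, num_return_sequences=2)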
# https://huggingface.co/bigscience/bloom-560m
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import transformers
# import torch
# model = "bigscience/bloom-560m"
# tokenizer = AutoTokenizer.from_pretrained(model)
# pipeline = transformers.pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     torch_dtype=torch.bfloat16,
#     trust_remote_code=True,
#     device_map="auto",
# )
# sequences = pipeline(
#     "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Girafatron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:",
#     max_length=200,
#     do_sample=True,
#     top_k=10,
#     num_return_sequences=1,
#     eos_token_id=tokenizer.eos_token_id,
# )
# for seq in sequences:
#     print(f"Result: {seq['generated_text']}")
# https://huggingface.co/bert-base-multilingual-cased
# from transformers import pipeline
# unmasker = pipeline('fill-mask', model='bert-base-multilingual-cased')
# output = unmasker("tu es [MASK] homme?")  # French: "you are [MASK] man?"
# print(output)
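# fill-mask returns the top 5 candidates by default, each a dict with
# 'score', 'token', 'token_str', and the completed 'sequence'.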
# Named entity recognition (no model given, so the task's default checkpoint is used)
# from transformers import pipeline
# ner = pipeline("ner", grouped_entities=True)
# output = ner("My name is Sylvain and I work at Hugging Face in Brooklyn.")
# print(output)
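# With grouped_entities=True, word pieces are merged into whole entities; the
# course's expected output groups 'Sylvain' (PER), 'Hugging Face' (ORG), and
# 'Brooklyn' (LOC).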
# https://huggingface.co/facebook/bart-large-cnn
# from transformers import pipeline
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# output = summarizer(
# """
# America has changed dramatically during recent years. Not only has the number of
# graduates in traditional engineering disciplines such as mechanical, civil,
# electrical, chemical, and aeronautical engineering declined, but in most of
# the premier American universities engineering curricula now concentrate on
# and encourage largely the study of engineering science. As a result, there
# are declining offerings in engineering subjects dealing with infrastructure,
# the environment, and related issues, and greater concentration on high
# technology subjects, largely supporting increasingly complex scientific
# developments. While the latter is important, it should not be at the expense
# of more traditional engineering.
# Rapidly developing economies such as China and India, as well as other
# industrial countries in Europe and Asia, continue to encourage and advance
# the teaching of engineering. Both China and India, respectively, graduate
# six and eight times as many traditional engineers as does the United States.
# Other industrial countries at minimum maintain their output, while America
# suffers an increasingly serious decline in the number of engineering graduates
# and a lack of well-educated engineers.
# """
# )
# print(output)
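# The summary is a list with one dict per input; the text is under 'summary_text'.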
# from transformers import pipeline
# translator = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")
# output = translator("机器学习正在改变世界。")  # "Machine learning is changing the world."
# print(output)
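# Translation results are returned under the 'translation_text' key.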
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# sequence = "Using a Transformer network is simple"
# tokens = tokenizer.tokenize(sequence)
# print(tokens)
# ids = tokenizer.convert_tokens_to_ids(tokens)
# print(ids)
# decoded_string = tokenizer.decode(ids)
# print(decoded_string)
# print("----------------------")
# sequence = "Using a Transform network are simple"
# tokens = tokenizer.tokenize(sequence)
# print(tokens)
# ids = tokenizer.convert_tokens_to_ids(tokens)
# print(ids)
# decoded_string = tokenizer.decode(ids)
# print(decoded_string)
# import torch
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# sequence = "I’ve been waiting for a HuggingFace course my whole life."
# tokens = tokenizer.tokenize(sequence)
# print(tokens)
# sequence1_ids = tokenizer.convert_tokens_to_ids(tokens)
# print(sequence1_ids)
# sequence = "I hate this so much!"
# tokens = tokenizer.tokenize(sequence)
# print(tokens)
# sequence2_ids = tokenizer.convert_tokens_to_ids(tokens)
# print(sequence2_ids)
# Toy ids from the course's padding walkthrough (defined here but unused below):
# sequence1_ids = [[200, 200, 200]]
# sequence2_ids = [[200, 200]]
# batched_ids = [
#     [1045, 1521, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
#     [1045, 5223, 2023, 2061, 2172, 999, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id],
# ]
# The second row has 6 real tokens followed by 8 pads, so its mask needs six
# 1s and eight 0s (pad positions get 0 so the model ignores them).
# attention_mask = [
#     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
#     [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
# ]
# outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
# print(outputs.logits)
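# Without the attention mask, the model would attend to the padding tokens and
# the logits for the padded second row would differ from running it unpadded.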
# from transformers import AutoTokenizer
# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# sequence = "I've been waiting for a HuggingFace course my whole life."
# model_inputs = tokenizer(sequence)
# print(model_inputs)
# sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
# Will pad the sequences up to the maximum sequence length
# model_inputs = tokenizer(sequences, padding="longest")
# print(model_inputs)
# print("-------------------------")
# Will pad the sequences up to the specified max length
# model_inputs = tokenizer(sequences, padding="max_length", max_length=8)
# print(model_inputs)
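# Two more standard tokenizer options, sketched in the same style:
# model_inputs = tokenizer(sequences, truncation=True)  # cut to the model's max length
# model_inputs = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")  # PyTorch tensors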
# from transformers import AutoTokenizer
# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# sequence = "I've been waiting for a HuggingFace course my whole life."
# model_inputs = tokenizer(sequence)
# print("model_inputs = tokenizer(sequence)")
# print(model_inputs)
# print(model_inputs["input_ids"])
# tokens = tokenizer.tokenize(sequence)
# print("tokens = tokenizer.tokenize(sequence)")
# print(tokens)
# ids = tokenizer.convert_tokens_to_ids(tokens)
# print(sequence)
# print(ids)
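# tokenizer(sequence) adds the checkpoint's special tokens: input_ids is the
# manual ids wrapped in 101 ([CLS]) and 102 ([SEP]). Decoding both id lists
# with tokenizer.decode() makes the difference visible.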
# import torch
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
# tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
# output = model(**tokens)
# print(output)
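# The logits are raw scores; the standard post-processing step to get
# probabilities (torch is imported at the top of this block):
# predictions = torch.nn.functional.softmax(output.logits, dim=-1)
# print(predictions)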
# Active example: run the raw GPT-2 backbone (AutoModel has no task head) on a
# single prompt and inspect the hidden states it returns.
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModel.from_pretrained("gpt2")
encoded = tokenizer("Hey!", return_tensors="pt")
result = model(**encoded)
# result.last_hidden_state holds one vector per input token.
print(result)
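
# A more targeted look at the output (a sketch: "Hey!" should tokenize to two
# GPT-2 tokens, and GPT-2 small has hidden size 768, giving [1, 2, 768]).
print(result.last_hidden_state.shape)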