Translate
""" Deep learning models can `translate` text from one language to another.
One popular library for machine translation is transformers by `Hugging` Face.
It provides easy-to-use interfaces to various `pre-trained` models.
Install required libraries:
pip install transformers
pip install sentencepiece
pip install torch torchvision torchaudio -f \
https://download.pytorch.org/whl/torch_stable.html
pip install sacremoses
"""
from transformers import MarianMTModel, MarianTokenizer

# English -> Romanian translation with a pre-trained MarianMT checkpoint.
# The model weights are fetched from the Hugging Face hub on first use.
model_name = 'Helsinki-NLP/opus-mt-en-ro'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name, use_safetensors=True)

# Source sentence to translate
text = "Everything should be made as simple as possible, but no simpler."

# Encode to PyTorch tensors, translate, then decode back to a string.
encoded = tokenizer(text, return_tensors="pt")
output_ids = model.generate(**encoded)
translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Show the source sentence followed by its translation
print(text)
print(translated_text)
"""
Everything should be made as simple as possible, but no simpler.
Totul ar trebui să fie cât mai simplu posibil, dar nu mai simplu.
"""
Model
""" For loading models and tokenizers an `internet` connection is typically required.
The library `fetches` the necessary model files from the Hugging Face model hub.
Once you've loaded the model and tokenizer at least once and `cached` them locally,
subsequent uses of the same model or tokenizer can often work `offline`.
"""
from transformers import MarianMTModel, MarianTokenizer

# Romanian -> English translation; after the first download the model is
# cached locally, so later runs can work offline.
model_name = 'BlackKakapo/opus-mt-ro-en' # Look Here

# Tokenizer and model must come from the same checkpoint
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Source sentence in Romanian
sentence = "Aceasta este o propoziție de test pentru traducere."

# Encode, translate, and decode the result
encoded = tokenizer(sentence, return_tensors="pt")
output_ids = model.generate(**encoded)
translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Show the source sentence followed by its translation
print(sentence)
print(translated_text)
"""
Aceasta este o propoziție de test pentru traducere.
This is a test sentence for translation.
"""
Sentences
""" To perform batch processing for translation using transformers, you can gather `multiple` sentences.
You can process them `together` using the generate method of your translation model.
This method should provide `efficiency` improvements compared to translating sentences one by one.
"""
from transformers import MarianMTModel, MarianTokenizer

# Batch translation: encode several Romanian sentences at once and run a
# single generate() call, which is more efficient than one call per sentence.
model_name = 'BlackKakapo/opus-mt-ro-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Source sentences in Romanian
sentences = [
"Aceasta este o propoziție de test pentru traducere.",
"Acesta este un alt exemplu de propoziție.",
"Cât de rapid poate fi acest model?"
]

# Padding/truncation make the variable-length sentences a rectangular batch
batch = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
output_ids = model.generate(**batch)
translated_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

# Print each source sentence next to its translation
for idx, (src, dst) in enumerate(zip(sentences, translated_texts)):
    print(idx, src, dst)
"""
0 Aceasta este o propoziție de test pentru traducere. This is a test sentence for translation.
1 Acesta este un alt exemplu de propoziție. This is another example of a sentence.
2 Cât de rapid poate fi acest model? How fast can this model be?
"""
Last update: 2 days ago