LLM Course code errors

I’m following the LLM Course. I’m trying the tutorial code snippets in Google colab. I get the following errors in machine translation, text summarization and named entity recognition. Any help to resolve them will be much appreciated.



Machine Translation

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/tmp/ipykernel_365/531379416.py in <cell line: 0>()
      1 from transformers import pipeline
      2 
----> 3 translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en")
      4 
      5 translator("Ce cours est produit par Hugging Face.")


/usr/local/lib/python3.12/dist-packages/transformers/pipelines/base.py in check_task(self, task)
   1352                 task = "translation"
   1353                 return task, targeted_task, (tokens[1], tokens[3])
-> 1354             raise KeyError(f"Invalid translation task {task}, use 'translation_XX_to_YY' format")
   1355 
   1356         raise KeyError(

KeyError: "Invalid translation task translation, use 'translation_XX_to_YY' format"

Text Summarization

KeyError                                  Traceback (most recent call last)

/tmp/ipykernel_365/3730791013.py in <cell line: 0>()
      1 from transformers import pipeline
      2 
----> 3 summarizer = pipeline("summarization")
      4 summarizer(
      5     """

2 frames

/usr/local/lib/python3.12/dist-packages/transformers/pipelines/base.py in check_task(self, task)
   1354             raise KeyError(f"Invalid translation task {task}, use 'translation_XX_to_YY' format")
   1355 
-> 1356         raise KeyError(
   1357             f"Unknown task {task}, available tasks are {self.get_supported_tasks() + ['translation_XX_to_YY']}"
   1358         )

KeyError: "Unknown task summarization, available tasks are ['any-to-any', 'audio-classification', 'automatic-speech-recognition', 'depth-estimation', 'document-question-answering', 'feature-extraction', 'fill-mask', 'image-classification', 'image-feature-extraction', 'image-segmentation', 'image-text-to-text', 'image-to-image', 'keypoint-matching', 'mask-generation', 'ner', 'object-detection', 'question-answering', 'sentiment-analysis', 'table-question-answering', 'text-classification', 'text-generation', 'text-to-audio', 'text-to-speech', 'token-classification', 'video-classification', 'visual-question-answering', 'vqa', 'zero-shot-audio-classification', 'zero-shot-classification', 'zero-shot-image-classification', 'zero-shot-object-detection', 'translation_XX_to_YY']"

Named Entity Recognition

TypeError                                 Traceback (most recent call last)

/tmp/ipykernel_365/3280479951.py in <cell line: 0>()
      1 from transformers import pipeline
      2 
----> 3 ner = pipeline("ner", grouped_entities=True)
      4 
      5 ner("My name is Sylvain and I work at Hugging Face in Brooklyn.")

2 frames

/usr/local/lib/python3.12/dist-packages/transformers/pipelines/base.py in __init__(self, model, tokenizer, feature_extractor, image_processor, processor, task, device, binary_output, **kwargs)
    919         self._batch_size = kwargs.pop("batch_size", None)
    920         self._num_workers = kwargs.pop("num_workers", None)
--> 921         self._preprocess_params, self._forward_params, self._postprocess_params = self._sanitize_parameters(**kwargs)
    922 
    923         # In processor only mode, we can get the modality processors from the processor

TypeError: TokenClassificationPipeline._sanitize_parameters() got an unexpected keyword argument 'grouped_entities'

1 Like

The error is caused by those pipelines being deprecated in Transformers v5, so the simplest workaround is !pip install "transformers<5".

If you need to rewrite it for v5-compatible code, it involves more steps…

Hi @John6666, I tried that. It says "/bin/bash: line 1: 5: No such file or directory" (the shell treated the unquoted < as input redirection).

1 Like

Oh, if with bash, just try pip install "transformers<5" (without !) — and make sure the version spec is quoted.

@John6666 I tried that. It’s not working. Same message.

1 Like
# colab_notebook_cells.py

# =========================
# a) FIX FOR THE COURSE (Transformers v4)
# =========================
# In Colab, you MUST quote "<5" or bash will treat "<" as input redirection.
# After installing, Runtime -> Restart runtime (important).

# --- RUN THIS CELL, THEN RESTART RUNTIME ---
# !pip -q install -U "transformers<5" "accelerate" "sentencepiece"

# --- AFTER RESTART, RUN THIS ---
def run_v4_pipelines():
    """Run the three course snippets with the Transformers v4 pipeline API.

    Covers machine translation (FR -> EN), summarization, and NER.
    Requires ``transformers<5`` to be installed (see the install cell above).
    Prints each pipeline's raw output; returns nothing.
    """
    import sys
    import transformers
    from transformers import pipeline

    # Surface the environment so version mismatches are obvious in the output.
    print("python:", sys.version)
    print("transformers:", transformers.__version__)

    # Machine Translation (FR -> EN).
    # v4 accepts the "translation_XX_to_YY" task alias.
    fr_to_en = pipeline("translation_fr_to_en", model="Helsinki-NLP/opus-mt-fr-en")
    print(fr_to_en("Ce cours est produit par Hugging Face."))

    # Text Summarization, pinned to an explicit model rather than the task default.
    article = (
        "Hugging Face provides open-source tools for natural language processing. "
        "These tools make it easier to train and deploy transformer models. "
        "Pipelines offer a simple API for inference across tasks like summarization."
    )
    summarize = pipeline("summarization", model="facebook/bart-large-cnn")
    print(summarize(article, max_length=60, min_length=15, do_sample=False))

    # Named Entity Recognition.
    # aggregation_strategy="simple" is the replacement for grouped_entities=True.
    tag_entities = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
    print(tag_entities("My name is Sylvain and I work at Hugging Face in Brooklyn."))


# =========================
# b) STAY ON TRANSFORMERS v5 (use generate() for seq2seq tasks)
# =========================
def _device():
    import torch
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _seq2seq_generate(model_id: str, text: str, *, max_new_tokens: int = 64, num_beams: int = 4) -> str:
    """Generate text with an encoder-decoder model and return the decoded string.

    Loads *model_id* (tokenizer + seq2seq model) fresh on every call, which is
    fine for one-off demo cells but wasteful inside a loop.

    Args:
        model_id: Hub id of a seq2seq checkpoint (e.g. an opus-mt or BART model).
        max_new_tokens: cap on generated tokens beyond the prompt.
        num_beams: beam-search width.

    Returns:
        The generated sequence decoded without special tokens.
    """
    import torch
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    target = _device()
    tok = AutoTokenizer.from_pretrained(model_id)
    net = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(target)
    net.eval()  # inference mode: disables dropout etc.

    # Truncate over-long inputs to the model's limit and move tensors to the device.
    batch = tok(text, return_tensors="pt", truncation=True).to(target)

    with torch.no_grad():  # no autograd bookkeeping needed for generation
        generated = net.generate(
            **batch,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            early_stopping=True,
        )

    return tok.decode(generated[0], skip_special_tokens=True)


def run_v5_generate_and_ner():
    """Run the course snippets in a Transformers v5-compatible way.

    Translation and summarization go through ``generate()`` directly (the
    "translation"/"summarization" pipeline tasks were removed), while NER
    still uses the pipeline with ``aggregation_strategy`` in place of the
    removed ``grouped_entities`` flag. Prints results; returns nothing.
    """
    import sys
    import transformers
    from transformers import pipeline

    # Surface the environment so version mismatches are obvious in the output.
    print("python:", sys.version)
    print("transformers:", transformers.__version__)

    # Machine Translation (FR -> EN) via generate().
    translated = _seq2seq_generate(
        "Helsinki-NLP/opus-mt-fr-en",
        "Ce cours est produit par Hugging Face.",
        max_new_tokens=64,
    )
    print({"translation_text": translated})

    # Summarization via generate().
    article = (
        "Hugging Face provides open-source tools for natural language processing. "
        "These tools make it easier to train and deploy transformer models. "
        "Pipelines offer a simple API for inference across tasks like summarization."
    )
    condensed = _seq2seq_generate("facebook/bart-large-cnn", article, max_new_tokens=60)
    print({"summary_text": condensed})

    # Named Entity Recognition (grouped_entities is removed; use aggregation_strategy)
    tag_entities = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
    print(tag_entities("My name is Sylvain and I work at Hugging Face in Brooklyn."))


# =========================
# AUTO-RUN (pick based on installed major version)
# =========================
def run_everything():
    """Dispatch to the demo matching the installed Transformers major version.

    v5+ uses the generate()-based path; anything older uses the classic
    pipeline snippets from the course.
    """
    import transformers

    # partition(".") splits off the leading major-version component.
    major_text, _, _ = transformers.__version__.partition(".")
    if int(major_text) < 5:
        run_v4_pipelines()
    else:
        run_v5_generate_and_ner()


# Call this in a Colab cell:
# run_everything()
1 Like