# train/qw/open_r1/evaluate.py

# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Custom evaluation tasks for LightEval."""
from lighteval.metrics.dynamic_metrics import (
ExprExtractionConfig,
LatexExtractionConfig,
multilingual_extractive_match_metric,
)
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.utils.language import Language
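
# Extractive-match metric: the gold answer is parsed as LaTeX (the datasets
# store solutions containing \boxed{...}), while the prediction may be parsed
# as either a plain expression or LaTeX. precision=5 governs how strictly
# numeric values are compared, and aggregation_function=max keeps the best
# score across the extracted candidates.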
metric = multilingual_extractive_match_metric(
    language=Language.ENGLISH,
    fallback_mode="first_match",
    precision=5,
    gold_extraction_target=(LatexExtractionConfig(),),
    pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig()),
    aggregation_function=max,
)


def prompt_fn(line, task_name: str = None):
    """Assumes the model is either prompted to emit \\boxed{answer} or does so automatically"""
    return Doc(
        task_name=task_name,
        query=line["problem"],
        choices=[line["solution"]],
        gold_index=0,
    )
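
# Each dataset row maps to a single-"choice" Doc whose only choice is the
# reference solution, so gold_index is always 0; the metric above extracts the
# final answer from both sides. Hypothetical example row:
#   prompt_fn({"problem": "What is 2+2?", "solution": "\\boxed{4}"}, "custom|math_500")
#   -> Doc(task_name="custom|math_500", query="What is 2+2?",
#          choices=["\\boxed{4}"], gold_index=0)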


# Define tasks
aime24 = LightevalTaskConfig(
    name="aime24",
    suite=["custom"],
    prompt_function=prompt_fn,
    hf_repo="HuggingFaceH4/aime_2024",
    hf_subset="default",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=32768,
    metric=[metric],
    version=1,
)
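
# Note: HuggingFaceH4/aime_2024 ships its problems as a single `train` split,
# which is why the task evaluates on "train" rather than a "test" split.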

math_500 = LightevalTaskConfig(
    name="math_500",
    suite=["custom"],
    prompt_function=prompt_fn,
    hf_repo="HuggingFaceH4/MATH-500",
    hf_subset="default",
    hf_avail_splits=["test"],
    evaluation_splits=["test"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=32768,
    metric=[metric],
    version=1,
)
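
# generation_size=32768 leaves room for long chain-of-thought traces before the
# final \boxed{} answer; both tasks run zero-shot (no few-shot split/selection).
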
# Add tasks to the table
TASKS_TABLE = []
TASKS_TABLE.append(aime24)
TASKS_TABLE.append(math_500)

# MODULE LOGIC
if __name__ == "__main__":
    # LightevalTaskConfig is a dataclass, so use attribute access rather than
    # dict-style subscripting.
    print([t.name for t in TASKS_TABLE])
    print(len(TASKS_TABLE))
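
# Example invocation (a sketch: the model name and output dir are placeholders,
# and the exact flags depend on your lighteval version):
#
#   lighteval vllm "pretrained=<model>,dtype=bfloat16,max_model_length=32768" \
#       "custom|aime24|0|0" \
#       --custom-tasks train/qw/open_r1/evaluate.py \
#       --use-chat-template \
#       --output-dir data/evals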