#!/usr/bin/env python
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import re
from concurrent.futures import Future

from huggingface_hub import (
    create_branch,
    create_repo,
    get_safetensors_metadata,
    list_repo_commits,
    list_repo_files,
    list_repo_refs,
    repo_exists,
    upload_folder,
)
from transformers import AutoConfig
from trl import GRPOConfig, SFTConfig


logger = logging.getLogger(__name__)


def push_to_hub_revision(
    training_args: SFTConfig | GRPOConfig, extra_ignore_patterns: list[str] | None = None
) -> Future:
    """Pushes the model to a branch on a Hub repo and returns a `Future` for the upload."""
    # Use None as the default to avoid a shared mutable default argument
    extra_ignore_patterns = extra_ignore_patterns or []
    # Create a repo if it doesn't exist yet
    repo_url = create_repo(repo_id=training_args.hub_model_id, private=True, exist_ok=True)
    # Get initial commit to branch from
    initial_commit = list_repo_commits(training_args.hub_model_id)[-1]
    # Now create the branch we'll be pushing to
    create_branch(
        repo_id=training_args.hub_model_id,
        branch=training_args.hub_model_revision,
        revision=initial_commit.commit_id,
        exist_ok=True,
    )
    logger.info(f"Created target repo at {repo_url}")
    logger.info(f"Pushing to the Hub revision {training_args.hub_model_revision}...")
    ignore_patterns = ["checkpoint-*", "*.pth"]
    ignore_patterns.extend(extra_ignore_patterns)
    future = upload_folder(
        repo_id=training_args.hub_model_id,
        folder_path=training_args.output_dir,
        revision=training_args.hub_model_revision,
        commit_message=f"Add {training_args.hub_model_revision} checkpoint",
        ignore_patterns=ignore_patterns,
        run_as_future=True,
    )
    # With run_as_future=True the upload runs in the background, so at this point it
    # has only been scheduled, not completed
    logger.info(f"Scheduled push to {repo_url} revision {training_args.hub_model_revision}")

    return future

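
# Example usage (an illustrative sketch; assumes `training_args` provides the
# `hub_model_id`, `hub_model_revision`, and `output_dir` attributes used above):
#
#   future = push_to_hub_revision(training_args, extra_ignore_patterns=["*.bin"])
#   future.result()  # optionally block until the background upload completes
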
def check_hub_revision_exists(training_args: SFTConfig | GRPOConfig):
    """Raises an error if the target Hub revision already holds a trained model (i.e. has a
    README.md) and overwriting has not been requested."""
    if repo_exists(training_args.hub_model_id):
        if training_args.push_to_hub_revision:
            # First check if the revision exists
            revisions = [rev.name for rev in list_repo_refs(training_args.hub_model_id).branches]
            # If the revision exists, we next check whether it has a README file
            if training_args.hub_model_revision in revisions:
                repo_files = list_repo_files(
                    repo_id=training_args.hub_model_id, revision=training_args.hub_model_revision
                )
                if "README.md" in repo_files and not training_args.overwrite_hub_revision:
                    raise ValueError(
                        f"Revision {training_args.hub_model_revision} already exists. "
                        "Use --overwrite_hub_revision to overwrite it."
                    )

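
# Example usage (illustrative; typically called before training starts so the run
# fails fast instead of erroring at push time):
#
#   check_hub_revision_exists(training_args)
#   # raises ValueError if the revision already has a README.md and
#   # --overwrite_hub_revision was not passed
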
def get_param_count_from_repo_id(repo_id: str) -> int:
    """Gets a model's parameter count from its safetensors metadata, falling back to parsing
    patterns like 42m, 1.5b, 0.5m or products like 8x7b from the repo ID.

    Returns -1 if no count can be determined.
    """
    try:
        metadata = get_safetensors_metadata(repo_id)
        # parameter_count maps dtype -> count, so sum across dtypes for the total
        return sum(metadata.parameter_count.values())
    except Exception:
        # No safetensors metadata available, so fall back to parsing the repo ID.
        # Pattern to match products (like 8x7b) and single values (like 42m)
        pattern = r"((\d+(\.\d+)?)(x(\d+(\.\d+)?))?)([bm])"
        matches = re.findall(pattern, repo_id.lower())

        param_counts = []
        for _, number1, _, _, number2, _, unit in matches:
            if number2:  # If there's a second number, it's a product like 8x7b
                number = float(number1) * float(number2)
            else:  # Otherwise, it's a single value
                number = float(number1)

            if unit == "b":
                number *= 1_000_000_000  # Interpret as billions of parameters
            elif unit == "m":
                number *= 1_000_000  # Interpret as millions of parameters

            param_counts.append(number)

        if param_counts:
            # Return the largest match, since repo IDs can contain multiple sizes
            return int(max(param_counts))
        else:
            # Return -1 if no match found
            return -1

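
# Illustrative examples of the regex fallback (real repos may resolve through the
# safetensors metadata instead; these repo IDs are hypothetical):
#
#   get_param_count_from_repo_id("org/model-1.5b")  # -> 1_500_000_000
#   get_param_count_from_repo_id("org/model-8x7b")  # -> 56_000_000_000 (8 * 7b)
#   get_param_count_from_repo_id("org/my-model")    # -> -1 (no size in the name)
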
def get_gpu_count_for_vllm(model_name: str, revision: str = "main", num_gpus: int = 8) -> int:
    """vLLM enforces two constraints: the model's attention head count must be divisible by the
    number of GPUs, and 64 must be divisible by the number of GPUs.

    This function reduces `num_gpus` until both constraints are satisfied for the given model.
    """
    config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=True)
    # Get number of attention heads
    num_heads = config.num_attention_heads
    # Reduce num_gpus so that num_heads is divisible by num_gpus and 64 is divisible by num_gpus
    while num_heads % num_gpus != 0 or 64 % num_gpus != 0:
        logger.info(f"Reducing num_gpus from {num_gpus} to {num_gpus - 1} to satisfy both constraints")
        num_gpus -= 1
    return num_gpus
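
# Worked example (hypothetical model): for a config reporting 28 attention heads and
# num_gpus=8, the loop steps 8 -> 7 -> 6 -> 5 -> 4, since 4 is the largest value
# dividing both 28 and 64:
#
#   tp_size = get_gpu_count_for_vllm("org/my-28-head-model", num_gpus=8)  # -> 4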