test

by iamwyldecat - opened Jun 17, 2025

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+4837

-37800

This view is limited to 50 files because it contains too many changes. See the raw diff here.

Files changed (50) hide show

.github/actionlint.yaml +0 -3
.github/workflows/build-and-commit.yml +0 -120
.github/workflows/pre-commit.yml +0 -30
.github/workflows/push-to-hf.yml +0 -40
.gitignore +0 -21
.pre-commit-config.yaml +0 -33
CLAUDE.md +0 -108
README.md +4 -75
build.toml +14 -24
build/torch210-cxx11-cu126-x86_64-linux/adamw.py +0 -154
build/torch210-cxx11-cu126-x86_64-linux/async_utils.py +0 -77
build/torch210-cxx11-cu126-x86_64-linux/core.py +0 -116
build/torch210-cxx11-cu126-x86_64-linux/distributed/utils.py +0 -234
build/torch210-cxx11-cu126-x86_64-linux/matmul_transpose_triton.py +0 -121
build/torch210-cxx11-cu126-x86_64-linux/metadata.json +0 -3
build/torch210-cxx11-cu126-x86_64-linux/muon.py +0 -594
build/torch210-cxx11-cu126-x86_64-linux/newton_schulz.py +0 -50
build/torch210-cxx11-cu126-x86_64-linux/optimizer/__init__.py +0 -26
build/torch210-cxx11-cu126-x86_64-linux/pipeline.py +0 -390
build/torch210-cxx11-cu126-x86_64-linux/qk_clip.py +0 -129
build/torch210-cxx11-cu128-x86_64-linux/adamw.py +0 -154
build/torch210-cxx11-cu128-x86_64-linux/async_utils.py +0 -77
build/torch210-cxx11-cu128-x86_64-linux/core.py +0 -116
build/torch210-cxx11-cu128-x86_64-linux/distributed/utils.py +0 -234
build/torch210-cxx11-cu128-x86_64-linux/matmul_transpose_triton.py +0 -121
build/torch210-cxx11-cu128-x86_64-linux/metadata.json +0 -3
build/torch210-cxx11-cu128-x86_64-linux/muon.py +0 -594
build/torch210-cxx11-cu128-x86_64-linux/newton_schulz.py +0 -50
build/torch210-cxx11-cu128-x86_64-linux/optimizer/__init__.py +0 -26
build/torch210-cxx11-cu128-x86_64-linux/pipeline.py +0 -390
build/torch210-cxx11-cu128-x86_64-linux/qk_clip.py +0 -129
build/torch210-cxx11-cu130-x86_64-linux/adamw.py +0 -154
build/torch210-cxx11-cu130-x86_64-linux/async_utils.py +0 -77
build/torch210-cxx11-cu130-x86_64-linux/core.py +0 -116
build/torch210-cxx11-cu130-x86_64-linux/distributed/utils.py +0 -234
build/torch210-cxx11-cu130-x86_64-linux/matmul_transpose_triton.py +0 -121
build/torch210-cxx11-cu130-x86_64-linux/metadata.json +0 -3
build/torch210-cxx11-cu130-x86_64-linux/muon.py +0 -594
build/torch210-cxx11-cu130-x86_64-linux/newton_schulz.py +0 -50
build/torch210-cxx11-cu130-x86_64-linux/optimizer/__init__.py +0 -26
build/torch210-cxx11-cu130-x86_64-linux/pipeline.py +0 -390
build/torch210-cxx11-cu130-x86_64-linux/qk_clip.py +0 -129
build/torch210-cxx11-rocm70-x86_64-linux/adamw.py +0 -154
build/torch210-cxx11-rocm70-x86_64-linux/async_utils.py +0 -77
build/torch210-cxx11-rocm70-x86_64-linux/core.py +0 -116
build/torch210-cxx11-rocm70-x86_64-linux/distributed/utils.py +0 -234
build/torch210-cxx11-rocm70-x86_64-linux/matmul_transpose_triton.py +0 -121
build/torch210-cxx11-rocm70-x86_64-linux/metadata.json +0 -3
build/torch210-cxx11-rocm70-x86_64-linux/muon.py +0 -594
build/torch210-cxx11-rocm70-x86_64-linux/newton_schulz.py +0 -50

.github/actionlint.yaml DELETED Viewed

@@ -1,3 +0,0 @@
-self-hosted-runner:
-  labels:
-    - docker-builder-01

.github/workflows/build-and-commit.yml DELETED Viewed

@@ -1,120 +0,0 @@
-name: Nix build and commit
-on:
-  pull_request:
-    types: [opened, synchronize, reopened]
-  workflow_dispatch:
-permissions:
-  contents: write
-jobs:
-  check-commit:
-    runs-on: ubuntu-latest
-    outputs:
-      skip: ${{ steps.check.outputs.skip }}
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-      - id: check
-        run: |
-          if [ "${{ github.event_name }}" = "pull_request" ]; then
-            msg=$(git log -1 --pretty=%B "${{ github.event.pull_request.head.sha }}")
-          else
-            msg="manual dispatch"
-          fi
-          echo "Commit message: $msg"
-          if echo "$msg" | grep -q '\[skip-build\]'; then
-            echo "skip=true" >> "$GITHUB_OUTPUT"
-          else
-            echo "skip=false" >> "$GITHUB_OUTPUT"
-          fi
-  build_and_commit:
-    needs: check-commit
-    if: needs.check-commit.outputs.skip == 'false'
-    runs-on: docker-builder-01
-    steps:
-      - name: Show disk usage
-        run: df -h
-      - name: Notify build start on Slack
-        id: slack_start
-        run: |
-          msg="*Build started* for \`${{ github.repository }}\`\nBranch: \`${{ github.ref_name }}\`\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Workflow>"
-          response=$(curl -s -X POST \
-            -H "Authorization: Bearer ${{ secrets.SLACK_TOKEN }}" \
-            -H "Content-type: application/json; charset=utf-8" \
-            --data "{\"channel\":\"${{ secrets.SLACK_CHANNEL_ID }}\",\"text\":\"$msg\"}" \
-            https://slack.com/api/chat.postMessage)
-          ts=$(echo "$response" | jq -r '.ts')
-          echo "thread_ts=$ts" >> "$GITHUB_OUTPUT"
-          echo "$response"
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          lfs: true
-          ref: ${{ github.head_ref || github.ref }}
-      - name: Install Nix
-        uses: cachix/install-nix-action@v31
-      - name: Setup huggingface cachix
-        uses: cachix/cachix-action@v15
-        with:
-          name: huggingface
-      - name: Clean build directory
-        run: |
-          rm -rf build
-      - name: Build with Nix
-        run: |
-            nix run .#build-and-copy \
-                --override-input kernel-builder github:huggingface/kernel-builder \
-                --max-jobs 8 \
-                -j 8 \
-                -L
-      - name: List built binaries
-        run: |
-          ls build
-      - name: Commit build artifact
-        run: |
-          git config user.name "github-actions[bot]"
-          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
-          git add build/*
-          git commit -m "Add built binary [skip-build]"
-      - name: Push changes
-        run: |
-          git push origin HEAD:"$HEAD_REF"
-        env:
-          HEAD_REF: ${{ github.head_ref || github.ref }}
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      - name: Notify success on Slack (thread)
-        if: success()
-        run: |
-          ts="${{ steps.slack_start.outputs.thread_ts }}"
-          msg="*Build succeeded* for \`${{ github.repository }}\`\nBranch: \`${{ github.ref_name }}\`\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Workflow>"
-          curl -s -X POST \
-            -H "Authorization: Bearer ${{ secrets.SLACK_TOKEN }}" \
-            -H "Content-type: application/json; charset=utf-8" \
-            --data "{\"channel\":\"${{ secrets.SLACK_CHANNEL_ID }}\",\"text\":\"$msg\",\"thread_ts\":\"$ts\"}" \
-            https://slack.com/api/chat.postMessage
-      - name: Notify failure on Slack (thread)
-        if: failure()
-        run: |
-          ts="${{ steps.slack_start.outputs.thread_ts }}"
-          msg="*Build failed* for \`${{ github.repository }}\`\nBranch: \`${{ github.ref_name }}\`\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Workflow>"
-          curl -s -X POST \
-            -H "Authorization: Bearer ${{ secrets.SLACK_TOKEN }}" \
-            -H "Content-type: application/json; charset=utf-8" \
-            --data "{\"channel\":\"${{ secrets.SLACK_CHANNEL_ID }}\",\"text\":\"$msg\",\"thread_ts\":\"$ts\"}" \
-            https://slack.com/api/chat.postMessage

.github/workflows/pre-commit.yml DELETED Viewed

@@ -1,30 +0,0 @@
-name: pre-commit
-on:
-  pull_request:
-  push:
-    branches: [ main, master ]
-jobs:
-  run-pre-commit:
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      pull-requests: read
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Cache pre-commit
-        uses: actions/cache@v4
-        with:
-          path: ~/.cache/pre-commit
-          key: pre-commit-${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }}
-          restore-keys: |
-            pre-commit-${{ runner.os }}-
-      - name: Run pre-commit
-        uses: pre-commit/action@v3.0.1

.github/workflows/push-to-hf.yml DELETED Viewed

@@ -1,40 +0,0 @@
-name: Push to HF Repo
-on:
-  push:
-    branches:
-      - main
-  workflow_dispatch:
-jobs:
-  push_to_hf:
-    runs-on: ubuntu-latest
-    steps:
-      # 1. Checkout the repo
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-      - name: Install Git LFS
-        run: |
-          git lfs install
-          git lfs fetch --all
-          git lfs pull
-      # 2. Set up Git
-      - name: Configure Git
-        run: |
-          git config user.name "MotifTech"
-          git config user.email "huggingface@motiftech.io"
-      # 3. Add HF remote
-      - name: Add Hugging Face remote
-        run: |
-          git remote add hf https://huggingface.co/Motif-Technologies/optimizer
-          git fetch hf || true
-      # 4. Push to HF repo
-      - name: Push to Hugging Face
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          git push "https://hf_token:${HF_TOKEN}@huggingface.co/Motif-Technologies/optimizer" HEAD:main

.gitignore DELETED Viewed

@@ -1,21 +0,0 @@
-__pycache__
-.idea
-.DS_Store
-*.egg-info
-outputs
-dist/*
-.vscode
-# data
-data
-out
-wandb
-torchtitan/datasets/**/*.model
-torchtitan/experiments/flux/assets/*
-# temp files
-*.log
-error.json
-_remote_module_non_scriptable.py
-.git_disabled/

.pre-commit-config.yaml DELETED Viewed

@@ -1,33 +0,0 @@
-default_install_hook_types:
-  - pre-commit
-  - commit-msg
-default_stages:
-  - pre-commit # Run locally
-  - manual # Run in CI
-exclude: '(build|result)/.*|__pycache__/.*|.*\.(png|html)$'
-repos:
-- repo: https://github.com/google/yapf
-  rev: v0.43.0
-  hooks:
-  - id: yapf
-    args: [--in-place, --verbose]
-- repo: https://github.com/crate-ci/typos
-  rev: v1.34.0
-  hooks:
-  - id: typos
-    exclude: '.gitattributes'
-- repo: https://github.com/PyCQA/isort
-  rev: 6.0.1
-  hooks:
-  - id: isort
-- repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v20.1.3
-  hooks:
-  - id: clang-format
-    types_or: [c++, cuda]
-    args: [--style=file, --verbose]
-- repo: https://github.com/jackdewinter/pymarkdown
-  rev: v0.9.29
-  hooks:
-  - id: pymarkdown
-    args: [fix]

CLAUDE.md DELETED Viewed

@@ -1,108 +0,0 @@
-# CLAUDE.md
-This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
-## Project Overview
-Optimizer is a PyTorch package implementing the **Muon optimizer** with support for N-D sharding parallelism for large-scale distributed training. Based on the paper at https://arxiv.org/abs/2511.07464. It supports general N-D sharding configurations (FSDP2 through hybrid setups like 2 TP + 2 DP-Replicate + 2 DP-Shard).
-## Commands
-### Lint & Format
-```bash
-pre-commit run --all-files          # Run all pre-commit hooks
-pre-commit run isort --all-files    # Run a specific hook (e.g., isort)
-```
-Hooks: yapf (Python formatter), isort (import sorter), typos (spell checker), clang-format (C++/CUDA), pymarkdown (Markdown linter), actionlint (GitHub Actions).
-### Tests
-Tests require **8 GPUs**, access to `Motif-Technologies/Motif-2.6B-4layer-random` on HuggingFace (`HF_TOKEN` env var), and PyTorch >= 2.8.0.
-```bash
-cd test && ./run_test.sh
-# Equivalent to:
-cd test && torchrun --nproc-per-node=8 --local-ranks-filter=0 -m pytest test_muon.py
-```
-Useful pytest flags: `--measure-perf` (timing/memory), `--do-profile` (profiling, requires `--measure-perf`), `--skip-verify` (skip correctness check against sequential implementation).
-### Build
-Uses kernel-builder infrastructure (`build.toml`, `flake.nix`). Pre-built binaries for various PyTorch/CUDA/ROCm combinations are stored in `build/`.
-### Commit Convention
-**Always append `[skip-build]` to every commit message.** This prevents CI from triggering unnecessary build jobs on development branches.
-## Architecture
-### Source Layout
-```
-torch-ext/optimizer/
-├── __init__.py                    # Public API: exports Muon
-├── muon.py                        # Muon optimizer class (~430 lines)
-├── newton_schulz.py               # Newton-Schulz iteration (~50 lines)
-├── qk_clip.py                     # QK clipping for attention heads (~130 lines)
-├── core.py                        # Shared state, helpers, param grouping (~110 lines)
-├── pipeline.py                    # Async generator pipeline for parallel mode (~290 lines)
-├── async_utils.py                 # AsyncTask / AsyncRuntime scheduling (~75 lines)
-├── adamw.py                       # Fused AdamW for non-Muon parameters (~160 lines)
-├── matmul_transpose_triton.py     # Triton kernel for X @ X.T (~130 lines)
-└── distributed/
-    └── utils.py                   # Shard mesh construction, DTensor slicing (~175 lines)
-```
-### Optimizer Modes
-The `Muon` optimizer has three execution paths selected per-parameter based on its tensor type and mesh structure:
-1. **Base mode** (`base()`) — Single-device / non-sharded tensors. Standard Muon with Newton-Schulz orthogonalization.
-2. **Distributed mode** (`distributed_muon()`) — Gathers full tensors via all-gather, computes updates, redistributes. Used for small parameters or fallback.
-3. **Parallel mode** (`parallel()`) — Pipelined all2all communication overlapped with compute. Uses an async generator pipeline scheduled by `run_pipeline()`. This is the main advanced feature.
-### Parallel Mode Pipeline
-The parallel pipeline is implemented as a single generator function `muon_chunk_pipeline()` in `pipeline.py`. Parameters are split into chunks, and each chunk flows through:
-```
-build bufs + async all2all_gather → yield → wait + Newton-Schulz compute + async all2all_scatter → yield → wait + update_param
-```
-The generator yields 2 times (after launching async gather and async scatter via `async_op=True`), allowing `run_pipeline()` to interleave multiple chunks for communication overlap. `work.wait()` completes each async operation after the yield.
-`warmup_step` maps to `max_concurrent_tasks = warmup_step + 1` in `run_pipeline()`.
-For detailed implementation documentation (pipeline internals, distributed utilities, QK clipping with strided sharding, etc.), see [`docs/implementation.md`](docs/implementation.md).
-### Key Abstractions
-- **`get_default_muon_param_groups(model, is_muon_func)`** (`core.py`) — Separates parameters into Muon-optimizable (2D+) and AdamW groups. Skips embeddings and output layers by default.
-- **`_muon_state` dataclass** (`core.py`) — Per-parameter config: rank ownership (`worker_rank`), process group, precomputed shard indices (`rank_indices`, `rank_numels`), and optional QK clip state. Config-only; no transient pipeline state.
-- **`muon_chunk_pipeline()` generator** (`pipeline.py`) — Processes one chunk through the full gather→compute→scatter→update pipeline. Uses `async_op=True` for non-blocking all-to-all and yields to allow chunk interleaving. All intermediate buffers are generator-local variables.
-- **`run_pipeline()`** (`async_utils.py`) — Generator-based pipeline scheduling with bounded concurrency. Interleaves multiple chunk pipelines at yield points.
-- **`construct_shard_mesh()` / `get_slices_of_dtensor()`** (`distributed/utils.py`) — Utilities for building shard meshes from DTensor placements and computing per-rank local slices. Handles both `Shard` and `_StridedShard` (PyTorch 2.10+).
-- **Newton-Schulz iteration** (`newton_schulz.py`) — `_zeropower_via_newtonschulz5()`: 5 quintic iterations in bfloat16 with pre-optimized coefficients for gradient orthogonalization. Uses Triton kernel `matmul_transpose_assign` for efficient X @ X.T.
-- **QK Clipping** (`qk_clip.py`) — Optional dynamic clipping of attention head projections when QK logits exceed a threshold. Configured via `q_indices`, `k_indices`, `head_dim`, `threshold`.
-- **Fused AdamW** (`adamw.py`) — Uses PyTorch's `torch._fused_adamw_` for non-Muon parameters, grouping tensors by device/dtype and DTensor placement.
-### Dependency Graph
-```
-matmul_transpose_triton.py       (leaf)
-         │
-    newton_schulz.py              (leaf + triton)
-         │
-      core.py ──── qk_clip.py    (leaf, distributed/utils)
-       │    │         │
-       │  pipeline.py ─── async_utils.py
-       │       │
-       │   adamw.py
-       │       │
-      muon.py                     (all above)
-         │
-    __init__.py
-```

README.md CHANGED Viewed

@@ -1,7 +1,6 @@
 ---
 tags:
-- kernels
-license: apache-2.0
 ---
 # Optimizer
@@ -10,14 +9,8 @@ Optimizer is a python package that provides:
 - PyTorch implementation of recent optimizer algorithms
 - with support for parallelism techniques for efficient large-scale training.
-## Currently implemented
-- Parallel Muon with N-D sharding
-  - [arxiv URL](https://arxiv.org/abs/2511.07464)
-  - Supports **general N-D sharding configurations**
-    - The implementation is not tied to any specific parallel strategy.
-    - Verified from basic FSDP2 setups up to hybrid configurations such as
-      **(2 TP + 2 DP-Replicate + 2 DP-Shard)**.
-    - Verified configurations can be found in [test_muon.py](./test/test_muon.py)
 ## Usage
@@ -27,78 +20,14 @@ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from kernels import get_kernel
 optimizer = get_kernel("motif-technologies/optimizer")
-get_default_muon_param_groups = optimizer.muon.get_default_muon_param_groups
 model = None # your model here
 fsdp_model = FSDP(model)
-# Muon, by its nature, cannot be applied to 1-D tensors.
-# We provide a helper function to group such tensors;
-# you can use your own function instead, if necessary.
-params = get_default_muon_param_groups(model) # user can write own is_muon_func, if necessary
 optim = optimizer.Muon(
-    params,
     lr=0.01,
     momentum=0.9,
     weight_decay=1e-4,
 )
 ```
-## Documentation
-- [Implementation Guide](./docs/implementation.md) — Detailed walkthrough of the internal architecture, parallel pipeline, distributed utilities, and QK clipping. Recommended for code reviewers and new contributors.
-- [PyTorch 2.10 TP Fix](./docs/pytorch-2.10-tp-fix.md) — Root cause analysis and fixes for `_StridedShard` compatibility with PyTorch 2.10+.
-## Test
-- Check [test/README.md](./test/README.md) for how to run the tests.
-## Pre-commit Hooks
-This project uses [pre-commit](https://pre-commit.com/) to automatically check and format code before commits.
-### Setup
-1. Install pre-commit:
-   ```bash
-   pip install pre-commit
-   ```
-2. Install the git hooks:
-   ```bash
-   pre-commit install
-   ```
-Once installed, the configured hooks will run automatically on each commit.
-### Included Hooks
-The following tools are run via pre-commit:
-- **[yapf](https://github.com/google/yapf)** – Python code formatter
-- **[typos](https://github.com/crate-ci/typos)** – Spell checker for common typos
-- **[isort](https://github.com/PyCQA/isort)** – Organizes and sorts Python imports
-- **[clang-format](https://clang.llvm.org/docs/ClangFormat.html)** – Formats C++/CUDA code (`--style=file`)
-- **[pymarkdown](https://github.com/jackdewinter/pymarkdown)** – Lints and auto-fixes Markdown files
-- **[actionlint](https://github.com/rhysd/actionlint)** – Validates GitHub Actions workflows
-### Usage
-- Run all checks on the entire codebase:
-   ```bash
-   pre-commit run --all-files
-   ```
-- Run a specific hook (example: isort):
-   ```bash
-   pre-commit run isort --all-files
-   ```
-### Test
-- There is a [simple unit test for Parallel Muon](./test/test_muon/README.md)

 ---
 tags:
+- kernel
 ---
 # Optimizer
 - PyTorch implementation of recent optimizer algorithms
 - with support for parallelism techniques for efficient large-scale training.
+### Currently implemented
+- [Parallel Muon with FSDP2](./docs/muon/parallel_muon.pdf)
 ## Usage
 from kernels import get_kernel
 optimizer = get_kernel("motif-technologies/optimizer")
 model = None # your model here
 fsdp_model = FSDP(model)
 optim = optimizer.Muon(
+    fsdp_model.parameters(),
     lr=0.01,
     momentum=0.9,
     weight_decay=1e-4,
 )
 ```

build.toml CHANGED Viewed

@@ -1,33 +1,23 @@
 [general]
 name = "optimizer"
-backends = [
-    "cuda",
-    "rocm",
-]
 [torch]
 src = [
-    "torch-ext/torch_binding.cpp",
-    "torch-ext/torch_binding.h",
 ]
-[kernel.optimizer]
-backend = "cuda"
-depends = ["torch"]
-src = ["optimizer/dummy.cu"]
-[kernel.optimizer_rocm]
 backend = "rocm"
-rocm-archs = [
-    "gfx906",
-    "gfx908",
-    "gfx90a",
-    "gfx940",
-    "gfx941",
-    "gfx942",
-    "gfx1030",
-    "gfx1100",
-    "gfx1101",
 ]
-depends = ["torch"]
-src = ["optimizer/dummy.cu"]

 [general]
 name = "optimizer"
+universal = false
 [torch]
 src = [
+  "torch-ext/torch_binding.cpp",
+  "torch-ext/torch_binding.h",
 ]
+[kernel.activation]
 backend = "rocm"
+src = [
+  "optimizer/dummy.cu",
+]
+depends = [ "torch" ]
+[kernel.activation_cuda]
+backend = "cuda"
+src = [
+  "optimizer/dummy.cu",
 ]
+depends = [ "torch" ]

build/torch210-cxx11-cu126-x86_64-linux/adamw.py DELETED Viewed

@@ -1,154 +0,0 @@
-from collections import defaultdict
-from typing import cast
-import torch
-from torch.distributed.tensor import DTensor
-def fused_adamw(
-    params: list[torch.Tensor],
-    grads: list[torch.Tensor],
-    exp_avgs: list[torch.Tensor],
-    exp_avg_sqs: list[torch.Tensor],
-    max_exp_avg_sqs: list[torch.Tensor],
-    state_steps: list[torch.Tensor],
-    amsgrad: bool,
-    beta1: float,
-    beta2: float,
-    lr: float | torch.Tensor,
-    weight_decay: float,
-    eps: float,
-    maximize: bool,
-) -> None:
-    if not params:
-        return
-    # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
-    # treating it as a scalar.
-    lr_dict: dict | None = ({
-        lr.device: lr
-    } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else None)
-    grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
-        [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
-         state_steps]  # type: ignore[list-item]
-    )
-    for (device, _), (
-        (
-            device_params_,
-            device_grads_,
-            device_exp_avgs_,
-            device_exp_avg_sqs_,
-            device_max_exp_avg_sqs,
-            device_state_steps_,
-        ),
-            _,
-    ) in grouped_tensors.items():
-        device_params = cast(list[torch.Tensor], device_params_)
-        device_grads = cast(list[torch.Tensor], device_grads_)
-        device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
-        device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
-        device_state_steps = cast(list[torch.Tensor], device_state_steps_)
-        if lr_dict is not None and device not in lr_dict:
-            lr_dict[device] = lr.to(
-                device=device, non_blocking=True)  # type: ignore[union-attr]
-            lr = lr_dict[device]
-        torch._foreach_add_(device_state_steps, 1)
-        func = torch._fused_adamw_
-        func(
-            device_params,
-            device_grads,
-            device_exp_avgs,
-            device_exp_avg_sqs,
-            device_max_exp_avg_sqs,  # type: ignore[arg-type]
-            device_state_steps,
-            amsgrad=amsgrad,
-            lr=lr,  # type: ignore[arg-type]
-            beta1=beta1,
-            beta2=beta2,
-            weight_decay=weight_decay,
-            eps=eps,
-            maximize=maximize,
-        )
-def step_adamw_params(optimizer_state, params, group):
-    """Run fused AdamW on a list of parameters sharing the same placement.
-    Args:
-        optimizer_state: The optimizer's state dict (self.state in Muon).
-        params: List of parameters to update.
-        group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
-    """
-    params_with_grads = []
-    grads = []
-    moment1 = []
-    moment2 = []
-    max_exp_avg_sqs = []
-    state_steps = []
-    lr = group["lr"]
-    beta1, beta2 = group["adamw_betas"]
-    eps = group["adamw_eps"]
-    weight_decay = group["weight_decay"]
-    for p in params:
-        g = p.grad
-        if g is None:
-            continue
-        state = optimizer_state[p]
-        params_with_grads.append(p)
-        grads.append(g)
-        if "step" not in state:
-            state["step"] = (torch.zeros((),
-                                         dtype=torch.float32,
-                                         device=p.device))
-            state["moment1"] = torch.zeros_like(g)
-            state["moment2"] = torch.zeros_like(g)
-        moment1.append(state["moment1"])
-        moment2.append(state["moment2"])
-        if not isinstance(state["step"], torch.Tensor):
-            step_tensor = torch.tensor(state["step"],
-                                       dtype=torch.float32,
-                                       device=p.device)
-        else:
-            step_tensor = state["step"]
-        state_steps.append(step_tensor)
-    fused_adamw(
-        params_with_grads,
-        grads,
-        moment1,
-        moment2,
-        max_exp_avg_sqs,
-        state_steps,
-        amsgrad=False,
-        beta1=beta1,
-        beta2=beta2,
-        lr=lr,
-        weight_decay=weight_decay,
-        eps=eps,
-        maximize=False,
-    )
-def step_adamw(optimizer_state, group):
-    """Dispatch AdamW step, grouping parameters by type and placement.
-    Args:
-        optimizer_state: The optimizer's state dict (self.state in Muon).
-        group: Parameter group dict.
-    """
-    params = group["params"]
-    # group params with its type and placement
-    placement_to_params: dict[tuple, list[torch.Tensor]] = defaultdict(list)
-    for p in params:
-        match p:
-            case DTensor():
-                placement_to_params[tuple([p.placements,
-                                           p.device_mesh])].append(p)
-            case torch.Tensor():
-                placement_to_params[tuple([torch.Tensor, None])].append(p)
-    for group_params in placement_to_params.values():
-        step_adamw_params(optimizer_state, group_params, group)

build/torch210-cxx11-cu126-x86_64-linux/async_utils.py DELETED Viewed

@@ -1,77 +0,0 @@
-import logging
-from typing import Generator
-logger = logging.getLogger(__name__)
-class _Task:
-    """Internal: wraps a generator, advances one yield at a time."""
-    def __init__(self, generator: Generator[None, None, None], index: int):
-        self._generator = generator
-        self._index = index
-        self._steps_completed = 0
-        self.step()  # run to first yield
-    def step(self) -> bool:
-        try:
-            next(self._generator)
-            self._steps_completed += 1
-            logger.debug("pipeline[%d] completed stage %d", self._index,
-                         self._steps_completed)
-            return True
-        except StopIteration:
-            logger.debug("pipeline[%d] finished after %d stages", self._index,
-                         self._steps_completed)
-            return False
-    def close(self):
-        self._generator.close()
-def run_pipeline(
-    pipelines: Generator[Generator[None, None, None], None, None],
-    max_concurrent: int,
-) -> None:
-    """Run generator-based pipelines with bounded concurrency.
-    Each pipeline is a generator that yields at stage boundaries.
-    The runtime interleaves pipelines so communication and computation
-    overlap across chunks.
-    """
-    if max_concurrent <= 0:
-        raise ValueError(f"max_concurrent must be > 0, got {max_concurrent}")
-    have_new = True
-    task_index = 0
-    previous_tasks: list[_Task] = []
-    try:
-        while have_new or previous_tasks:
-            running_tasks: list[_Task] = []
-            # Admit one new pipeline per iteration (staggered admission).
-            # Admitting one at a time ensures that while chunk N does NS
-            # compute on the default stream, chunk N+1's NCCL all-to-all
-            # runs concurrently on the NCCL stream — creating real
-            # communication/computation overlap on the GPU.
-            if have_new and len(previous_tasks) < max_concurrent:
-                try:
-                    gen = next(pipelines)
-                    task = _Task(gen, task_index)
-                    task_index += 1
-                    running_tasks.append(task)
-                except StopIteration:
-                    have_new = False
-            # Advance every previously-yielded task by one step.
-            for task in previous_tasks:
-                if task.step():
-                    running_tasks.append(task)
-            previous_tasks = running_tasks
-    except BaseException:
-        # Clean up all in-flight generators to release GPU resources.
-        for task in previous_tasks:
-            task.close()
-        raise

build/torch210-cxx11-cu126-x86_64-linux/core.py DELETED Viewed

@@ -1,116 +0,0 @@
-import math
-from dataclasses import dataclass
-import torch
-import torch.distributed as dist
-from torch.distributed import ProcessGroup
-from torch.distributed.tensor import DTensor
-@dataclass
-class _muon_state:
-    worker_rank: int
-    process_group: ProcessGroup
-    rank_indices: dict[int, tuple]  # local_rank -> per-dim indices
-    rank_numels: dict[int, int]  # local_rank -> numel
-    name: str
-    qk_clip_state: torch.Tensor | None = None
-def update_g(optimizer_state, p, g, group, momentum):
-    """Apply momentum update to gradient.
-    Args:
-        optimizer_state: The optimizer's state dict (self.state in Muon).
-        p: Parameter tensor.
-        g: Gradient tensor.
-        group: Parameter group dict.
-        momentum: Momentum coefficient.
-    Returns:
-        Momentum-updated gradient tensor.
-    """
-    state = optimizer_state[p]
-    buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
-    torch.add(g, buf, alpha=momentum, out=buf)
-    if group["nesterov"]:
-        g.add_(buf, alpha=momentum)
-        return g
-    return buf
-def update_p(p, u, lr, adjusted_lr, weight_decay):
-    """Apply weight decay and orthogonalized update to parameter.
-    Args:
-        p: Parameter (torch.nn.Parameter or DTensor).
-        u: Orthogonalized update tensor.
-        lr: Base learning rate.
-        adjusted_lr: Size-adjusted learning rate.
-        weight_decay: Weight decay coefficient.
-    """
-    if isinstance(p, torch.nn.Parameter):
-        # apply weight decay
-        p.data.mul_(1 - lr * weight_decay)
-        # apply update
-        p.data.add_(u, alpha=-adjusted_lr)
-    else:
-        p.mul_(1 - lr * weight_decay)
-        p.add_(u, alpha=-adjusted_lr)
-def adjust_lr_for_muon(lr, param_shape):
-    """Scale learning rate based on parameter matrix dimensions.
-    Args:
-        lr: Base learning rate.
-        param_shape: Shape of the parameter tensor.
-    Returns:
-        Adjusted learning rate.
-    """
-    A, B = param_shape[:2]
-    # We adjust the learning rate and weight decay based on the size of the parameter matrix
-    # as described in the paper
-    adjusted_ratio = 0.2 * math.sqrt(max(A, B))
-    adjusted_lr = lr * adjusted_ratio
-    return adjusted_lr
-def default_is_muon(name, x, expert_keys=None):
-    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
-    if any(key in name for key in skip_keys):
-        return False
-    effective_ndim = x.ndim
-    if expert_keys and any(key in name for key in expert_keys):
-        effective_ndim -= 1
-    return effective_ndim >= 2
-def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
-    if is_muon_func is None:
-        is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
-    muon_params, muon_names = [], []
-    non_muon_params = []
-    for n, p in model.named_parameters():
-        if not p.requires_grad:
-            continue
-        if is_muon_func(n, p):
-            muon_params.append(p)
-            muon_names.append(n)
-        else:
-            non_muon_params.append(p)
-    return [
-        {
-            "params": muon_params,
-            "names": muon_names,
-            "use_muon": True,
-        },
-        {
-            "params": non_muon_params,
-            "use_muon": False,
-        },
-    ]

build/torch210-cxx11-cu126-x86_64-linux/distributed/utils.py DELETED Viewed

@@ -1,234 +0,0 @@
-import torch
-import torch.distributed as dist
-from torch.distributed import ProcessGroup
-from torch.distributed.device_mesh import DeviceMesh
-from torch.distributed.tensor import DTensor
-from torch.distributed.tensor.placement_types import (Placement, Shard,
-                                                      _StridedShard)
-def _is_shard(placement: Placement) -> bool:
-    """Return True when ``placement`` is a ``Shard`` or a ``_StridedShard``.
-    Both classes are tested explicitly so the result does not depend on
-    whether ``_StridedShard`` subclasses ``Shard``.  (The original author's
-    note states that in PyTorch 2.10+ it no longer does, which makes
-    ``placement.is_shard()`` return False for ``_StridedShard``.)
-    """
-    shard_types = (Shard, _StridedShard)
-    return isinstance(placement, shard_types)
-def get_slices_of_dtensor(
-    target: DTensor | torch.Tensor,
-    local_rank: int,
-    shard_mesh: DeviceMesh,
-    shard_placements: tuple[Placement],
-) -> tuple[slice | torch.Tensor, ...]:
-    """
-    Get per-dimension indices for a given rank's shard of the target tensor.
-    Uses ``Shard.local_shard_size_and_offset`` and
-    ``_StridedShard.local_shard_size_and_offset`` for correct handling of
-    both contiguous and strided (non-contiguous) sharding.
-    Args:
-        target (DTensor | torch.Tensor): The target tensor (for its shape).
-        local_rank (int): The local rank within the shard group; used as an
-            index into the ascending-sorted list of ranks in ``shard_mesh``.
-        shard_mesh (DeviceMesh): The shard mesh (only shard dimensions).
-        shard_placements (tuple[Placement]): The shard placements, one per
-            dimension of ``shard_mesh``.
-    Returns:
-        A tuple of indices (one per tensor dim).  Each element is either:
-        - A ``slice`` (for contiguous or unsharded dims)
-        - A 1-D ``torch.LongTensor`` of indices (for strided sharding)
-    Raises:
-        NotImplementedError: If the current size of a sharded tensor dim is
-            not divisible by the size of the mesh dim that shards it.
-    """
-    # find the global rank of the local rank in the shard mesh
-    rank = sorted(shard_mesh.mesh.flatten().tolist())[local_rank]
-    # Locate that rank inside the mesh tensor to get its per-mesh-dim coords.
-    rank_coords = (shard_mesh.mesh == rank).nonzero()
-    assert len(rank_coords) == 1
-    rank_coords = tuple(rank_coords[0].tolist())
-    assert len(rank_coords) == len(shard_placements)
-    # Track per-shard-dim indices.
-    # A tensor dim that is absent from this dict has not been sharded yet.
-    dim_indices: dict[int, torch.Tensor] = {}
-    # Caution: Assuming replicate-to-shard of the shard mesh goes with
-    # left-to-right sharding. This is ensured by the sorting logic of
-    # construct_shard_mesh function.
-    for mesh_dim_idx, (rank_coord, placement) in enumerate(
-            zip(rank_coords, shard_placements)):
-        assert _is_shard(placement)
-        num_chunks = shard_mesh.mesh.shape[mesh_dim_idx]
-        shard_dim = placement.dim
-        # Current effective size on this dim (may already be sub-sharded)
-        if shard_dim in dim_indices:
-            curr_size = len(dim_indices[shard_dim])
-        else:
-            curr_size = target.size()[shard_dim]
-        # Uneven sharding is rejected explicitly rather than handled.
-        if curr_size % num_chunks != 0:
-            raise NotImplementedError(
-                f"Dimension size {curr_size} is not divisible "
-                f"by number of ranks {num_chunks} for shard "
-                f"placement on dim {shard_dim}. (shape: {target.shape})")
-        # Compute indices for this level of sharding
-        if isinstance(placement, _StridedShard):
-            # With return_first_offset=False the second return value is used
-            # as the full list of offsets owned by this rank.
-            _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
-                placement,
-                curr_size,
-                num_chunks,
-                rank_coord,
-                return_first_offset=False)
-            new_indices = torch.tensor(offsets, dtype=torch.long)
-        else:
-            shard_size, offset = Shard.local_shard_size_and_offset(
-                curr_size, num_chunks, rank_coord)
-            new_indices = torch.arange(offset,
-                                       offset + shard_size,
-                                       dtype=torch.long)
-        # Compose with previous indices on this dim
-        # (new_indices are positions within the previously selected indices).
-        if shard_dim in dim_indices:
-            dim_indices[shard_dim] = dim_indices[shard_dim][new_indices]
-        else:
-            dim_indices[shard_dim] = new_indices
-    # Build result tuple
-    result: list[slice | torch.Tensor] = []
-    for d in range(len(target.size())):
-        if d not in dim_indices:
-            # Unsharded dim: take everything.
-            result.append(slice(None))
-        else:
-            indices = dim_indices[d]
-            # Convert contiguous indices to slice for efficiency
-            if len(indices) > 0:
-                start = indices[0].item()
-                expected = torch.arange(start,
-                                        start + len(indices),
-                                        dtype=torch.long)
-                if torch.equal(indices, expected):
-                    result.append(slice(start, start + len(indices)))
-                else:
-                    result.append(indices)
-            else:
-                # No indices on this dim: represent it as an empty slice.
-                result.append(slice(0, 0))
-    return tuple(result)
-# Module-level cache: (mesh shape + flattened ranks) -> (DeviceMesh, ProcessGroup).
-# Populated by construct_shard_mesh so each shard group is created only once.
-_ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
-                                                  ProcessGroup]] = dict()
-def construct_shard_mesh(
-    placements: tuple[Placement],
-    mesh: DeviceMesh,
-) -> tuple[DeviceMesh, ProcessGroup, tuple[Placement, ...]]:
-    """Construct shard sub-mesh and ProcessGroup for all-to-all communication.
-    Given a DTensor's placements and device mesh, extracts the "shard group"
-    — the set of ranks that together hold all shards of the same replica —
-    and creates a ProcessGroup for all-to-all among them.
-    Steps:
-        1. Sort placements: Replicate first, then Shard by (dim, granularity).
-        2. Permute the mesh tensor to match the sorted order.
-        3. Collapse Replicate dims → list of shard sub-meshes (one per replica).
-        4. Create/retrieve a cached ProcessGroup for the current rank's sub-mesh.
-    Example — 8 GPUs, mesh shape (2, 2, 2),
-              placements ``[Shard(0), Replicate, _StridedShard(0)]``::
-        Step 1 — Sort: [Replicate, _StridedShard(0), Shard(0)]
-                 Permutation: [1, 2, 0]
-        Step 2 — Permute mesh dims by [1, 2, 0]:
-                 Original:                Permuted:
-                 [[[0,1],[2,3]],          [[[0,4],[1,5]],
-                  [[4,5],[6,7]]]           [[2,6],[3,7]]]
-        Step 3 — Unbind replicate dim (dim 0), giving 2 shard sub-meshes:
-                 sub-mesh 0 = [[0,4],[1,5]]  (replica group 0)
-                 sub-mesh 1 = [[2,6],[3,7]]  (replica group 1)
-                 shard_placements = (_StridedShard(0), Shard(0))
-        Step 4 — Rank 0 → ProcessGroup([0,4,1,5])
-                 Rank 2 → ProcessGroup([2,6,3,7])
-    Args:
-        placements: Placements of the DTensor, one per mesh dimension.
-        mesh: Device mesh of the DTensor; its mesh tensor must live on CPU.
-    Returns:
-        ``(shard_mesh, process_group, shard_placements)``
-    """
-    my_rank = dist.get_rank()
-    assert mesh.mesh.device.type == 'cpu'
-    # -- Fast path: 1D all-shard mesh → reuse existing PG. ----------------
-    # This avoids a non-collective dist.new_group() call, which would
-    # deadlock when only a subset of ranks call this function (e.g. expert
-    # DTensors on a TP submesh where ranks 0-3 and 4-7 call separately).
-    if mesh.ndim == 1 and len(placements) == 1 and _is_shard(placements[0]):
-        key = (*mesh.mesh.shape, *mesh.mesh.flatten().tolist())
-        if key not in _ranks_to_dist_cache:
-            _ranks_to_dist_cache[key] = (mesh, mesh.get_group())
-        return (*_ranks_to_dist_cache[key], tuple(placements))
-    mesh_tensor = mesh.mesh.clone()
-    # -- Step 1: Sort placements (Replicate first, then Shard by dim). ------
-    # _StridedShard comes BEFORE regular Shard on the same dim so that
-    # get_slices_of_dtensor applies the outer sharding first, matching
-    # DTensor's left-to-right (outer-to-inner) composition order.
-    def _sort_key(item):
-        index, placement = item
-        assert not placement.is_partial(), "Partial placement not supported"
-        if placement.is_replicate():
-            return (-1, 0, index)
-        assert _is_shard(placement), f"Unsupported: {type(placement)}"
-        # Negative value for _StridedShard sorts it ahead of Shard (0) on
-        # the same tensor dim.
-        split = (-1 / placement.split_factor if isinstance(
-            placement, _StridedShard) else 0)
-        return (placement.dim, split, index)
-    indexed = sorted(enumerate(placements), key=_sort_key)
-    perm, sorted_placements = zip(*indexed)
-    # -- Step 2: Permute mesh to match sorted placement order. --------------
-    sorted_mesh = mesh_tensor.permute(perm)
-    # -- Step 3: Collapse replicate dims → list of shard sub-meshes. --------
-    # E.g. mesh (2, 3, 4, 4) with [R, R, S(0), S(1)] → 6 sub-meshes of (4, 4)
-    num_rep = sum(1 for p in sorted_placements if p.is_replicate())
-    if num_rep > 0:
-        if num_rep > 1:
-            sorted_mesh = sorted_mesh.flatten(0, num_rep - 1)
-        shard_meshes = list(torch.unbind(sorted_mesh, dim=0))
-    else:
-        shard_meshes = [sorted_mesh]
-    shard_placements = sorted_placements[num_rep:]
-    # Identical shard placements on two mesh dims are not supported.
-    assert len(shard_placements) == len(set(shard_placements))
-    # -- Step 4: Create/retrieve ProcessGroup for current rank's sub-mesh. --
-    # All ranks must call dist.new_group in the same order, even though each
-    # rank only joins one group.
-    def _cache_key(t: torch.Tensor) -> tuple:
-        return (*t.shape, *t.flatten().tolist())
-    my_key = None
-    for sm in shard_meshes:
-        key = _cache_key(sm)
-        if (my_rank == sm).any().item():
-            assert my_key is None, "Rank appears in multiple shard groups"
-            my_key = key
-        if key not in _ranks_to_dist_cache:
-            pg = dist.new_group(sm.flatten().tolist())
-            # NOTE(review): device_type is hard-coded to "cuda" here —
-            # confirm non-CUDA meshes never reach this path.
-            _ranks_to_dist_cache[key] = (
-                DeviceMesh(device_type="cuda", mesh=sm),
-                pg,
-            )
-    # NOTE(review): if the calling rank is in none of the sub-meshes, my_key
-    # stays None and this lookup raises KeyError — confirm that cannot happen.
-    return (*_ranks_to_dist_cache[my_key], shard_placements)

build/torch210-cxx11-cu126-x86_64-linux/matmul_transpose_triton.py DELETED Viewed

@@ -1,121 +0,0 @@
-# MIT License
-#
-# Copyright (c) 2025 Tianyang Lin
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-import torch
-import triton
-import triton.language as tl
-def get_autotune_config():
-    """Build the list of ``triton.Config`` candidates used for autotuning.
-    Enumerates every combination of block sizes, group size, pipeline stage
-    count and warp count below, with ``BLOCK_SIZE_M`` varying slowest and
-    ``num_warps`` varying fastest.
-    """
-    configs = []
-    for blk_m in (32, 64, 128):
-        for blk_k in (32, 64):
-            for grp_sz in (8, ):
-                for n_stages in (3, 4, 5):
-                    for n_warps in (4, 8):
-                        meta = {
-                            'BLOCK_SIZE_M': blk_m,
-                            'BLOCK_SIZE_K': blk_k,
-                            'GROUP_SIZE_M': grp_sz
-                        }
-                        configs.append(
-                            triton.Config(meta,
-                                          num_stages=n_stages,
-                                          num_warps=n_warps))
-    return configs
-# Autotuned over the candidates from get_autotune_config(), keyed on (M, K).
-@triton.autotune(
-    configs=get_autotune_config(),
-    key=['M', 'K'],
-)
-@triton.jit
-def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
-               BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-               GROUP_SIZE_M: tl.constexpr):
-    """
-    Core kernel jit function of matmul_transpose that computes y = x @ x.T
-    The code is a simple adaptation from the triton `matmul` tutorial:
-    https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
-    Each program handles one BLOCK_SIZE_M x BLOCK_SIZE_M tile of ``y``.
-    Tiles with ``pid_m > pid_n`` exit immediately; tiles with
-    ``pid_m < pid_n`` are also stored transposed into the mirrored tile.
-    """
-    # Map the 1-D program id to a (pid_m, pid_n) tile using grouped ordering.
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    # The output is M x M, so both tile counts are derived from M.
-    num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # Tiles below the diagonal are skipped; they are written by the mirrored
-    # tile's transposed store at the end of the kernel.
-    if pid_m > pid_n:
-        return
-    # Row offsets are wrapped modulo M; the final stores are masked instead.
-    offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
-    offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    # we use a & b ptrs to denote different rows of x.
-    a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
-    b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
-    # Accumulate in float32 regardless of the input dtype.
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        # Mask out columns past K in the last K-block; masked lanes load 0.
-        a = tl.load(a_ptrs,
-                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
-                    other=0.0)
-        b = tl.load(b_ptrs,
-                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
-                    other=0.0)
-        accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
-        # Advance both pointers to the next K-block.
-        a_ptrs += BLOCK_SIZE_K * stride_xk
-        b_ptrs += BLOCK_SIZE_K * stride_xk
-    # use dtype.element_ty to accommodate different input datatypes as in cpp templates
-    # https://github.com/triton-lang/triton/issues/2252
-    c = accumulator.to(x.dtype.element_ty)
-    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
-    # Only positions inside the M x M output are written.
-    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
-    tl.store(c_ptrs, c, mask=c_mask)
-    # transpose and copy
-    # (off-diagonal tiles only; diagonal tiles were fully written above)
-    if pid_m < pid_n:
-        ct_ptrs = y + stride_ym * offs_cn[:,
-                                          None] + stride_yn * offs_cm[None, :]
-        ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
-        tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
-def matmul_transpose_assign(d_in, d_out):
-    """Compute ``d_in @ d_in.T`` on the GPU and write the result into ``d_out``.
-    Args:
-        d_in: 2D CUDA tensor of shape ``(M, K)``.
-        d_out: 2D CUDA tensor of shape ``(M, M)`` on the same device and with
-            the same dtype as ``d_in``; it is written in place.
-    Raises:
-        AssertionError: If any of the device, dtype, rank or shape
-            requirements above is violated.
-    """
-    assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
-    assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
-    assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
-    assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
-    assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
-    assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
-    # The kernel writes an M x M result, so BOTH output dims must equal M.
-    # (Previously size(0) was compared twice and size(1) was never checked.)
-    assert d_in.size(0) == d_out.size(0) == d_out.size(1), \
-            "First dimension of `d_in` must match first and second dimension of `d_out`"
-    # The kernel is given d_in's strides, so make the layout contiguous first.
-    d_in = d_in.contiguous()
-    M, K = d_in.shape
-    # One program per BLOCK_SIZE_M x BLOCK_SIZE_M output tile.
-    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
-        M, META['BLOCK_SIZE_M']), )
-    # Launch on the device that holds the input.
-    with torch.cuda.device(d_in.device.index):
-        mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
-                         d_out.stride(0), d_out.stride(1))

build/torch210-cxx11-cu126-x86_64-linux/metadata.json DELETED Viewed

@@ -1,3 +0,0 @@
-{
-  "python-depends": []
-}

build/torch210-cxx11-cu126-x86_64-linux/muon.py DELETED Viewed

@@ -1,594 +0,0 @@
-import logging
-import types
-from collections import defaultdict
-from typing import Any
-import torch
-import torch.distributed as dist
-from torch.distributed.tensor import DTensor, Replicate, Shard
-from torch.profiler import record_function
-from .adamw import step_adamw
-from .async_utils import run_pipeline
-from .core import (_muon_state, adjust_lr_for_muon,
-                   get_default_muon_param_groups, update_g, update_p)
-from .distributed.utils import (_is_shard, construct_shard_mesh,
-                                get_slices_of_dtensor)
-from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
-                            _zeropower_via_newtonschulz5)
-from .pipeline import muon_chunk_pipeline
-from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
-logger = logging.getLogger(__name__)
-def _expand_expert_params(names, params, expert_keys):
-    """Expand expert params by splitting on dim 0 (expert dimension).
-    Params whose name matches any key in ``expert_keys`` are treated as
-    expert-parallel tensors.  Their outermost dimension is the expert
-    dimension: an ``(E, out, in)`` tensor becomes ``E`` separate 2D
-    ``nn.Parameter`` views so that in-place updates propagate back to
-    the original storage.
-    Non-expert params with ``ndim > 2`` trigger an ``AssertionError`` —
-    if they are expert params, their key must be added to ``expert_keys``.
-    The grad must already be set on each expert param (e.g. after momentum).
-    For DTensor expert params, placements that shard on dim 0 (expert dim)
-    are consumed by the split.  Non-dim-0 shard placements (e.g. TP) are
-    preserved: each 2D slice is wrapped as a DTensor on the corresponding
-    submesh so the parallel pipeline handles the TP communication.
-    Args:
-        names: Parameter names, aligned with ``params``.
-        params: Parameters to expand.
-        expert_keys: Substrings identifying expert params (may be ``None``).
-    Returns:
-        ``(expanded_names, expanded_params)``.  Non-expert entries are passed
-        through unchanged; each expert entry is replaced by one entry per
-        local expert, named ``"{name}[{i}]"``.
-    Side effects:
-        Sets ``p.grad = None`` on every expert param after expanding it.
-    """
-    expanded_names = []
-    expanded_params = []
-    for n, p in zip(names, params):
-        is_expert = expert_keys and any(key in n for key in expert_keys)
-        is_dtensor = isinstance(p.data, DTensor)
-        if not is_expert:
-            assert p.data.ndim <= 2, (
-                f"Param {n} has ndim={p.data.ndim} but does not match "
-                f"expert_keys={expert_keys}. If this is an expert param, "
-                f"add its key to expert_keys.")
-            expanded_names.append(n)
-            expanded_params.append(p)
-            continue
-        g = p.grad
-        assert g is not None, (
-            f"Expert param {n} must have grad set before expansion")
-        # Stay None unless a non-dim-0 (e.g. TP) shard placement is found.
-        tp_mesh = None
-        tp_placements_2d = None
-        if is_dtensor:
-            # Work on this rank's local shard of the data and grad.
-            local_data = p.to_local()
-            local_grad = g.to_local() if isinstance(g, DTensor) else g
-            # Find non-dim-0 shard placements (e.g. TP sharding).
-            # After splitting on dim 0, Shard(k) becomes Shard(k-1).
-            tp_dim_indices = []
-            tp_placements_2d = []
-            for i, pl in enumerate(p.placements):
-                if _is_shard(pl) and pl.dim != 0:
-                    tp_dim_indices.append(i)
-                    tp_placements_2d.append(Shard(pl.dim - 1))
-            if tp_dim_indices:
-                # Select the submesh by mesh-dim name(s); a single name is
-                # indexed directly, several names as a tuple.
-                tp_dim_names = tuple(p.device_mesh.mesh_dim_names[i]
-                                     for i in tp_dim_indices)
-                if len(tp_dim_names) == 1:
-                    tp_mesh = p.device_mesh[tp_dim_names[0]]
-                else:
-                    tp_mesh = p.device_mesh[tp_dim_names]
-        else:
-            local_data = p.data
-            local_grad = g
-        # Expand: split dim 0, reshape each slice to 2D.
-        num_local_experts = local_data.shape[0]
-        for i in range(num_local_experts):
-            # Indexing dim 0 yields the i-th expert's slice of data and grad.
-            slice_data = local_data[i]
-            slice_grad = local_grad[i]
-            if tp_mesh is not None:
-                # Wrap as DTensor on TP submesh so the pipeline handles
-                # TP communication (gather/scatter across TP ranks).
-                dt_data = DTensor.from_local(slice_data,
-                                             device_mesh=tp_mesh,
-                                             placements=tp_placements_2d)
-                dt_grad = DTensor.from_local(slice_grad,
-                                             device_mesh=tp_mesh,
-                                             placements=tp_placements_2d)
-                expert_param = torch.nn.Parameter(dt_data, requires_grad=False)
-                expert_param.grad = dt_grad
-            else:
-                expert_param = torch.nn.Parameter(slice_data,
-                                                  requires_grad=False)
-                expert_param.grad = slice_grad
-            expanded_names.append(f"{n}[{i}]")
-            expanded_params.append(expert_param)
-        p.grad = None  # allow expert grad storage to be freed after pipeline
-    return expanded_names, expanded_params
-class Muon(torch.optim.Optimizer):
-    """
-    Muon - MomentUm Orthogonalized by Newton-schulz
-    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
-    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
-    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
-    the advantage that it can be stably run in bfloat16 on the GPU.
-    Some warnings:
-    - We believe this optimizer is unlikely to work well for training with small batch size.
-    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
-    Arguments:
-        model: The model to be optimized by Muon.
-        is_muon_func: A function that takes a parameter and its name, and returns whether the parameter should be optimized by Muon.
-        lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
-        momentum: The momentum used by the internal SGD. (0.95 is a good default)
-        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
-        ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
-        weight_decay: The weight decay for Muon and AdamW.
-            Parameters that are {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW instead.
-        adamw_lr: The learning rate for the internal AdamW.
-        adamw_betas: The betas for the internal AdamW.
-        adamw_eps: The epsilon for the internal AdamW.
-        none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
-        debug: Whether to print debug information.
-        clip_config : Configuration for QK clipping. Expected keys:
-            - "q_indices" (list[int]): Indices of query heads to consider.
-            - "k_indices" (list[int]): Indices of key heads to consider.
-            - "head_dim" (int): Dimensionality of each attention head.
-            - "threshold" (float): Threshold value; heads whose QK logits exceed
-            this value will be scaled down.
-            Default is:
-                {
-                    "q_indices": [],
-                    "k_indices": [],
-                    "head_dim": 128,
-                    "threshold": 100
-                }
-        warmup_step : How many all2all gather, compute operations are launched in advance
-                      before the corresponding all2all scatter steps begin.
-                      A higher warmup_step increases memory usage but can improve
-                      performance by overlapping communication.
-                      Parallel muon only.
-        chunk_size : Batch size of parameters to process in each
-                     all2all gather/compute/scatter step.
-                     Use shard ranks * DEFAULT_CHUNK_SIZE_RATIO when -1 is specified.
-        use_distributed_muon: Use distributed muon by Liu et al. (2024).
-                              For testing purpose only.
-        small_param_numel_threshold: Threshold for classifying parameters as small and falling back to distributed Muon
-        expert_keys: List of strings to identify expert-parallel parameters.
-                     If any key appears in a parameter's name, its outermost
-                     dimension is treated as the expert dimension and expanded
-                     into per-expert 2D params for Muon.  For example,
-                     ``expert_keys=["experts"]`` matches any param whose name
-                     contains "experts".  3D+ params not matched by any key
-                     will raise an error.
-    """
-    def __init__(self,
-                 params,
-                 lr=1e-3,
-                 momentum=0.95,
-                 nesterov=True,
-                 ns_steps=5,
-                 weight_decay=0.1,
-                 adamw_betas=(0.9, 0.95),
-                 adamw_eps=1e-8,
-                 none_grad=True,
-                 debug=False,
-                 clip_config=None,
-                 warmup_step=5,
-                 chunk_size=-1,
-                 use_distributed_muon=False,
-                 small_param_numel_threshold=65536,
-                 expert_keys=None):
-        """Validate the parameter groups and store the optimizer settings.
-        ``params`` must be an iterable of param-group dicts in which every
-        group carries an explicit ``"use_muon"`` entry.
-        Raises:
-            ValueError: If ``params`` is a generator, or if a group has no
-                ``"use_muon"`` entry (or has it set to ``None``).
-        """
-        error_message = "The key 'use_muon' is not set in parameter group {idx}. Assuming all parameters in the group will use muon optimization, which may lead to unexpected behavior."
-        instruction_code = "\n\n please follow this code snippet \n```optimizer = get_kernel('motif-technologies/optimizer')\n\n\nparams = optimizer.muon.get_default_muon_param_groups(model)\n\noptim = optimizer.Muon(params, ...)```"
-        # A generator cannot be a list of group dicts, so it is rejected
-        # before iterating.
-        if isinstance(params, types.GeneratorType):
-            raise ValueError(error_message.format(idx=0) + instruction_code)
-        for group_idx, candidate in enumerate(params):
-            if candidate.get("use_muon", None) is None:
-                raise ValueError(
-                    error_message.format(idx=group_idx) + instruction_code)
-        # Per-group defaults handed to the base Optimizer.
-        super().__init__(
-            params, {
-                "lr": lr,
-                "weight_decay": weight_decay,
-                "momentum": momentum,
-                "nesterov": nesterov,
-                "ns_steps": ns_steps,
-                "adamw_betas": adamw_betas,
-                "adamw_eps": adamw_eps,
-                "none_grad": none_grad,
-                "use_muon": True,
-            })
-        # Fall back to the default QK-clip configuration when none is given.
-        if clip_config is None:
-            clip_config = {
-                "q_indices": [],
-                "k_indices": [],
-                "head_dim": 128,
-                "threshold": 100,
-            }
-        self.clip_config = clip_config
-        self.debug = debug
-        self.warmup_step = warmup_step
-        self.chunk_size = chunk_size
-        self.use_distributed_muon = use_distributed_muon
-        self.small_param_numel_threshold = small_param_numel_threshold
-        self.expert_keys = expert_keys
-    def _calc_flops(self, G, steps):
-        """Estimate the FLOP count of ``steps`` iterations on 2D matrix ``G``.
-        The estimate is symmetric in the two dimensions: the smaller one is
-        always used as ``M`` in the per-step cost formula below.
-        """
-        assert len(G.shape) == 2
-        dim_a, dim_b = G.shape
-        small = dim_b if dim_a > dim_b else dim_a
-        large = dim_a if dim_a > dim_b else dim_b
-        per_step = ((small**3) * 2 + (small**2 * large) * 4 +
-                    small * large * 2 + small**2 * 3)
-        return steps * per_step
-    def get_shard_mesh(self, p):
-        """
-        Return ``(shard_mesh, shard_pg, shard_placements)`` for DTensor ``p``,
-        as computed by ``construct_shard_mesh`` from its placements and mesh.
-        """
-        is_dtensor = isinstance(p, DTensor)
-        assert is_dtensor, "Parallel Muon only supports DTensor parameters."
-        mesh_info = construct_shard_mesh(p.placements, p.device_mesh)
-        shard_mesh, shard_pg, shard_placements = mesh_info
-        return shard_mesh, shard_pg, shard_placements
-    def init_state_and_assign_params(self, names, params, group, qk_logits):
-        """Order params by estimated FLOPs and build their per-param state.
-        Params are sorted by descending ``_calc_flops`` estimate, assigned a
-        worker rank in round-robin fashion over the shard mesh, and given a
-        ``_muon_state`` holding the shard process group plus, for every rank
-        of that group, the index tuple and element count of its shard.
-        Args:
-            names: Parameter names, aligned with ``params``.
-            params: DTensor parameters; all must share one mesh and one set
-                of placements.
-            group: Param-group dict; ``group["ns_steps"]`` is read.
-            qk_logits: Forwarded to ``get_qk_clip_info``.
-        Returns:
-            ``(param_to_state, ordered_params)`` where ``param_to_state`` is
-            keyed by ``id(param)``.
-        Raises:
-            ValueError: If params differ in device mesh or placements.
-        """
-        param_to_state = {}
-        param_to_flops = {}
-        total_flops = 0
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            assert g.ndim == 2, "Muon only supports 2D parameters."
-            flops = self._calc_flops(g, group["ns_steps"])
-            param_to_flops[id(p)] = flops
-            total_flops += flops
-        if self.debug:
-            logger.debug("Total TFLOPs for Muon: %.2f TFLOPs",
-                         total_flops / 1e12)
-        # NOTE(review): params skipped above (grad is None) have no entry in
-        # param_to_flops, so the sort key below would raise KeyError for them
-        # — confirm callers only pass params that have a grad.
-        paired = list(zip(names, params))
-        paired_sorted = sorted(paired,
-                               key=lambda x: param_to_flops[id(x[1])],
-                               reverse=True)
-        # NOTE(review): an empty ``params`` fails at this unpacking — confirm
-        # callers never pass an empty list.
-        names_sorted, params_sorted = zip(*paired_sorted)
-        ordered_names = list(names_sorted)
-        ordered_params = list(params_sorted)
-        round_robin = 0
-        # The first param is the reference for mesh, placements and shard
-        # group; every other param is checked against it in the loop below.
-        mesh = ordered_params[0].device_mesh
-        placements = ordered_params[0].placements
-        shard_mesh, shard_pg, shard_placements = self.get_shard_mesh(
-            ordered_params[0])
-        shard_mesh_flattened = shard_mesh.mesh.flatten()
-        num_ranks = dist.get_world_size(group=shard_pg)
-        for n, p in zip(ordered_names, ordered_params):
-            if mesh != p.device_mesh:
-                raise ValueError("All parameters must be on the same mesh.")
-            if placements != p.placements:
-                raise ValueError("All parameters must have same placements.")
-            # NOTE(review): takes a rank value from the shard mesh modulo the
-            # group size — presumably to obtain a group-local rank; verify.
-            worker_rank = shard_mesh_flattened[round_robin].item() % num_ranks
-            round_robin = (round_robin + 1) % len(shard_mesh_flattened)
-            qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
-            # Precompute per-rank indices and numels for all-to-all.
-            rank_indices: dict[int, tuple] = {}
-            rank_numels: dict[int, int] = {}
-            for r in range(num_ranks):
-                indices = get_slices_of_dtensor(p, r, shard_mesh,
-                                                shard_placements)
-                rank_indices[r] = indices
-                # Element count of rank r's shard: product over dims of the
-                # number of selected positions.
-                numel = 1
-                for idx, dim_size in zip(indices, p.shape):
-                    if isinstance(idx, slice):
-                        # Resolve the slice against the dim size, then count
-                        # its elements with a ceiling division by the step.
-                        start, stop, step = idx.indices(dim_size)
-                        numel *= max(0, (stop - start + (step - 1)) // step)
-                    else:
-                        numel *= len(idx)
-                rank_numels[r] = numel
-            param_to_state[id(p)] = _muon_state(
-                worker_rank=worker_rank,
-                process_group=shard_pg,
-                rank_indices=rank_indices,
-                rank_numels=rank_numels,
-                name=n,
-                qk_clip_state=qk_clip_state,
-            )
-        return param_to_state, ordered_params
-    def base(self, names, params, group, lr, weight_decay, qk_logits):
-        """Apply the Muon update to each param locally, one param at a time.
-        For every param with a grad: orthogonalize the grad with
-        ``_zeropower_via_newtonschulz5``, apply the update through
-        ``update_p`` with the shape-adjusted learning rate, then apply QK
-        clipping when ``get_qk_clip_info`` returns a state for that param.
-        """
-        # Momentum is already applied by _step_muon before this method.
-        for name, param in zip(names, params):
-            grad = param.grad
-            if grad is None:
-                continue
-            ortho_update = _zeropower_via_newtonschulz5(
-                grad.to(COMM_DTYPE), steps=group["ns_steps"])
-            update_p(param, ortho_update, lr,
-                     adjust_lr_for_muon(lr, param.shape), weight_decay)
-            clip_state = get_qk_clip_info(self.clip_config, name, qk_logits)
-            if clip_state is None:
-                continue
-            scales = compute_scales(param, clip_state)
-            if scales is not None:
-                qk_clip(param, scales, clip_state.head_dim)
-    def distributed_muon(
-        self,
-        names: list[str],
-        params: list[torch.nn.Parameter],
-        group: dict[str, Any],
-        lr: float,
-        weight_decay: float,
-        qk_logits: list[torch.Tensor | DTensor] | None,
-    ):
-        """ Implementation of Distributed Muon by Liu et al.
-        For each param with a grad, the full grad and full param are
-        materialized (via ``full_tensor()`` for DTensors), the update and
-        optional QK clipping are applied to the full param, and for DTensor
-        params the result is redistributed to the original placements and
-        copied back into ``p``.
-        Args:
-            names: Parameter names, aligned with ``params``.
-            params: Parameters to update (plain tensors or DTensors).
-            group: Param-group dict; ``group["ns_steps"]`` is read.
-            lr: Base learning rate.
-            weight_decay: Weight decay coefficient passed to ``update_p``.
-            qk_logits: Forwarded to ``get_qk_clip_info``.
-        """
-        # Momentum is already applied by _step_muon before this method.
-        for n, p in zip(names, params):
-            g = p.grad
-            if g is None:
-                continue
-            # Gather G
-            if isinstance(p.data, DTensor):
-                g_full = g.full_tensor()
-                p_full = p.data.full_tensor()
-            else:
-                # Plain tensors are updated in place directly on ``p``.
-                g_full = g
-                p_full = p
-            u_full = _zeropower_via_newtonschulz5(g_full.to(COMM_DTYPE),
-                                                  steps=group["ns_steps"])
-            # The learning-rate adjustment uses the FULL (unsharded) shape.
-            adjusted_lr = adjust_lr_for_muon(lr, p_full.shape)
-            update_p(p_full, u_full, lr, adjusted_lr, weight_decay)
-            qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
-            scales_full = compute_scales(
-                p_full, qk_clip_state) if qk_clip_state is not None else None
-            if scales_full is not None:
-                qk_clip(p_full, scales_full, qk_clip_state.head_dim)
-            if isinstance(p.data, DTensor):
-                # Wrap the updated full tensor as replicated on every mesh
-                # dim, then redistribute to p's placements and copy back.
-                ndims = len(p.device_mesh.mesh.shape)
-                p_replicate = DTensor.from_local(
-                    p_full,
-                    device_mesh=p.device_mesh,
-                    placements=[Replicate() for _ in range(ndims)],
-                )
-                p_sharded = p_replicate.redistribute(
-                    device_mesh=p.device_mesh,
-                    placements=p.placements,
-                )
-                p.copy_(p_sharded)
-    def parallel(self, names, params, group, lr, weight_decay, qk_logits):
-        """
-        Perform a parallel optimization step using Muon.
-        Parameters are chunked and each chunk is processed by a
-        :func:`muon_chunk_pipeline` generator.  :func:`run_pipeline`
-        interleaves multiple chunks so that communication and computation
-        overlap across chunks (the same overlap previously achieved by the
-        warmup + main-loop index scheduling).
-        """
-        # Momentum is already applied by _step_muon before this method.
-        param_to_state, ordered_params = self.init_state_and_assign_params(
-            names, params, group, qk_logits)
-        # Compute local rank for this group's shard process group.
-        shard_pg = param_to_state[id(ordered_params[0])].process_group
-        rank = dist.get_rank(group=shard_pg)
-        if self.chunk_size == -1:
-            shard_ranks = dist.get_world_size(param_to_state[id(
-                ordered_params[0])].process_group)
-            chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
-        elif self.chunk_size > 0:
-            chunk_size = self.chunk_size
-        else:
-            raise ValueError("chunk_size must be -1 or a positive integer.")
-        def pipelines():
-            for start in range(0, len(ordered_params), chunk_size):
-                chunk = ordered_params[start:start + chunk_size]
-                if chunk:
-                    yield muon_chunk_pipeline(
-                        params=chunk,
-                        param_to_state=param_to_state,
-                        rank=rank,
-                        ns_steps=group["ns_steps"],
-                        lr=lr,
-                        weight_decay=weight_decay,
-                        none_grad=group["none_grad"],
-                    )
-        with record_function("muon::barrier"):
-            dist.barrier()
-        with record_function("muon::pipeline"):
-            run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
-    def _step_muon(self, group, qk_logits=None):
-        params = group["params"]
-        lr = group["lr"]
-        weight_decay = group["weight_decay"]
-        momentum = group["momentum"]
-        names = group["names"]
-        # Apply momentum to all params before routing/expansion.
-        with record_function("muon::momentum"):
-            for n, p in zip(names, params):
-                g = p.grad
-                if g is None:
-                    continue
-                g = update_g(self.state, p, g, group, momentum)
-                p.grad = g
-        # Expand expert params by splitting on dim 0.
-        names, params = _expand_expert_params(names, params, self.expert_keys)
-        param_dtensors = []
-        name_dtensors = []
-        param_tensors = []
-        name_tensors = []
-        param_dtensors_small = []
-        name_dtensors_small = []
-        if self.use_distributed_muon:
-            self.distributed_muon(names=names,
-                                  params=params,
-                                  group=group,
-                                  lr=lr,
-                                  weight_decay=weight_decay,
-                                  qk_logits=qk_logits)
-            return
-        # For simplicity, we use distributed Muon for small parameters
-        # whose number of elements is below a threshold.
-        for n, p in zip(names, params):
-            if p is None or p.grad is None:
-                continue
-            if isinstance(p.data, DTensor):
-                if all(
-                        isinstance(placement, Replicate)
-                        for placement in p.placements):
-                    param_tensors.append(p)
-                    name_tensors.append(n)
-                elif p.data.numel() <= self.small_param_numel_threshold:
-                    param_dtensors_small.append(p)
-                    name_dtensors_small.append(n)
-                else:
-                    param_dtensors.append(p)
-                    name_dtensors.append(n)
-            elif isinstance(p.data, torch.Tensor):
-                param_tensors.append(p)
-                name_tensors.append(n)
-            else:
-                raise TypeError(f"Unsupported parameter type: {type(p.data)}")
-        logger.debug(
-            f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors, "
-            f"{len(param_dtensors_small)} Small DTensors")
-        def group_dtensors(dtensors, names):
-            # To support different placements, we group parameters by placements
-            # and run parallel Muon on each group.
-            placement_to_params = defaultdict(lambda: ([], []))
-            assert len(dtensors) == len(names)
-            for p, n in zip(dtensors, names):
-                placement_to_params[tuple([p.placements,
-                                           p.device_mesh])][0].append(n)
-                placement_to_params[tuple([p.placements,
-                                           p.device_mesh])][1].append(p)
-            return placement_to_params
-        if len(param_dtensors_small) > 0:
-            if not dist.is_initialized():
-                raise RuntimeError(
-                    "Parallel Muon requires torch.distributed to be initialized."
-                )
-            self.distributed_muon(
-                params=param_dtensors_small,
-                names=name_dtensors_small,
-                group=group,
-                lr=lr,
-                weight_decay=weight_decay,
-                qk_logits=qk_logits,
-            )
-        if len(param_dtensors) > 0:
-            if not dist.is_initialized():
-                raise RuntimeError(
-                    "Parallel Muon requires torch.distributed to be initialized."
-                )
-            dtensor_group = group_dtensors(param_dtensors, name_dtensors)
-            for _, (names, params) in dtensor_group.items():
-                self.parallel(
-                    names,
-                    params,
-                    group,
-                    lr=lr,
-                    weight_decay=weight_decay,
-                    qk_logits=qk_logits,
-                )
-        if len(param_tensors) > 0:
-            self.base(
-                name_tensors,
-                param_tensors,
-                group,
-                lr=lr,
-                weight_decay=weight_decay,
-                qk_logits=qk_logits,
-            )
-    @torch.no_grad
-    def step(self, closure=None, qk_logits=None):
-        """Perform a single optimization step.
-        Args:
-            closure (Callable, optional): A closure that reevaluates the model
-                and returns the loss.
-            qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
-                to 1D tensors of shape (num_heads,), representing the maximum
-                QK logits across all tokens, computed as
-                (1 / sqrt(head_dim)) * (Q @ K^T).
-        """
-        loss = None
-        if closure is not None:
-            with torch.enable_grad():
-                loss = closure()
-        for group in self.param_groups:
-            if group["use_muon"]:
-                self._step_muon(group, qk_logits=qk_logits)
-            else:
-                step_adamw(self.state, group)
-        return loss

build/torch210-cxx11-cu126-x86_64-linux/newton_schulz.py DELETED Viewed

@@ -1,50 +0,0 @@
-import torch
-from .matmul_transpose_triton import matmul_transpose_assign
-COMM_DTYPE = torch.bfloat16
-DEFAULT_CHUNK_SIZE_RATIO = 4
-# This code snippet is a modified version adapted from the following GitHub repositories:
-# https://github.com/KellerJordan/Muon/blob/master/muon.py
-# Muon's Newton–Schulz iteration causes high variance in singular values
-# Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
-@torch.no_grad()
-# matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
-def _zeropower_via_newtonschulz5(G, steps):
-    """
-    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
-    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
-    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
-    zero even beyond the point where the iteration no longer converges all the way to one everywhere
-    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
-    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
-    performance at all relative to UV^T, where USV^T = G is the SVD.
-    """
-    assert len(G.shape) == 2
-    assert G.dtype == COMM_DTYPE
-    X = G  # no manual typecast
-    if G.size(0) > G.size(1):
-        X = X.T
-    # Ensure spectral norm is at most 1
-    X = X / (X.norm() + 1e-7)
-    buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
-    buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
-    # Perform the NS iterations
-    for a, b, c in [
-        (4.0848, -6.8946, 2.9270),
-        (3.9505, -6.3029, 2.6377),
-        (3.7418, -5.5913, 2.3037),
-        (2.8769, -3.1427, 1.2046),
-        (2.8366, -3.0525, 1.2012),
-    ]:
-        matmul_transpose_assign(X, buf1)
-        matmul_transpose_assign(buf1, buf2)
-        buf1.mul_(b).add_(buf2, alpha=c)
-        X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
-    if G.size(0) > G.size(1):
-        X = X.T
-    return X

build/torch210-cxx11-cu126-x86_64-linux/optimizer/__init__.py DELETED Viewed

@@ -1,26 +0,0 @@
-import ctypes
-import sys
-import importlib
-from pathlib import Path
-from types import ModuleType
-def _import_from_path(file_path: Path) -> ModuleType:
-    # We cannot use the module name as-is, after adding it to `sys.modules`,
-    # it would also be used for other imports. So, we make a module name that
-    # depends on the path for it to be unique using the hex-encoded hash of
-    # the path.
-    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
-    module_name = path_hash
-    spec = importlib.util.spec_from_file_location(module_name, file_path)
-    if spec is None:
-        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
-    module = importlib.util.module_from_spec(spec)
-    if module is None:
-        raise ImportError(f"Cannot load module {module_name} from spec")
-    sys.modules[module_name] = module
-    spec.loader.exec_module(module)  # type: ignore
-    return module
-globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch210-cxx11-cu126-x86_64-linux/pipeline.py DELETED Viewed

@@ -1,390 +0,0 @@
-import logging
-from typing import Generator
-import torch
-import torch.distributed as dist
-from torch.distributed.tensor import DTensor
-from torch.profiler import record_function
-from .core import _muon_state, adjust_lr_for_muon, update_p
-from .newton_schulz import COMM_DTYPE, _zeropower_via_newtonschulz5
-from .qk_clip import compute_scales
-logger = logging.getLogger(__name__)
-# ======================================================================
-# Stage helpers
-# ======================================================================
-def _launch_gather(
-    params: list[DTensor],
-    owned_params: list[DTensor],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-    num_ranks: int,
-    process_group: dist.ProcessGroup,
-) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
-    """Allocate gather buffers, build send/recv, and launch async all-to-all.
-    Returns:
-        work: Async operation handle.
-        recv_buf: Flat receive buffer (needed by ``_complete_gather``).
-        gathered_grads: ``{id(p): empty_tensor}`` for owned params,
-            ``None`` for non-owned.
-        recv_counts: Per-source-rank element counts.
-    """
-    # Allocate gathered-grad buffers
-    gathered_grads: dict[int, torch.Tensor | None] = {}
-    for p in params:
-        state = param_to_state[id(p)]
-        if rank == state.worker_rank:
-            gathered_grads[id(p)] = torch.empty(p.shape,
-                                                dtype=COMM_DTYPE,
-                                                device="cuda")
-        else:
-            gathered_grads[id(p)] = None
-    # Build send buffer
-    per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
-    send_counts = [0] * num_ranks
-    for p in params:
-        state = param_to_state[id(p)]
-        dst = state.worker_rank
-        assert dst < num_ranks
-        shard_elems = state.rank_numels[rank]
-        g = p.grad
-        g = g.to_local().to(COMM_DTYPE).contiguous()
-        assert g.numel() == shard_elems
-        per_dst[dst].append(g.view(-1))
-        send_counts[dst] += shard_elems
-    assert any(
-        len(v) > 0 for v in
-        per_dst), "At least one destination rank must receive a sharded tensor"
-    per_dst_flat = [t for dst in per_dst for t in dst]
-    send_buf = torch.cat(per_dst_flat, dim=0)
-    # Build recv buffer
-    recv_counts = [0] * num_ranks
-    for src in range(num_ranks):
-        total = 0
-        for p in owned_params:
-            state = param_to_state[id(p)]
-            assert state.worker_rank == rank
-            total += state.rank_numels[src]
-        recv_counts[src] = total
-    recv_buf = torch.empty(sum(recv_counts), dtype=COMM_DTYPE, device="cuda")
-    # Launch async all-to-all
-    logger.debug(f"send_buf size: {send_buf.numel()}, "
-                 f"recv_buf size: {recv_buf.numel()}, "
-                 f"recv_counts: {recv_counts}, "
-                 f"send_counts: {send_counts}, "
-                 f"process_group: {str(process_group)}")
-    work = dist.all_to_all_single(
-        recv_buf,
-        send_buf,
-        output_split_sizes=recv_counts,
-        input_split_sizes=send_counts,
-        group=process_group,
-        async_op=True,
-    )
-    return work, recv_buf, gathered_grads, recv_counts
-def _complete_gather(
-    recv_buf: torch.Tensor,
-    recv_counts: list[int],
-    owned_params: list[DTensor],
-    gathered_grads: dict[int, torch.Tensor | None],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-) -> None:
-    """Reconstruct gathered grads from the recv buffer (in-place)."""
-    off = 0
-    for src in range(len(recv_counts)):
-        if recv_counts[src] == 0:
-            continue
-        block = recv_counts[src]
-        inner_off = 0
-        for p in owned_params:
-            state = param_to_state[id(p)]
-            assert state.worker_rank == rank
-            indices = state.rank_indices[src]
-            shard_view = gathered_grads[id(p)][indices]
-            n = shard_view.numel()
-            assert n > 0
-            sg = recv_buf.narrow(0, off + inner_off, n)
-            sg = sg.reshape(shard_view.shape)
-            gathered_grads[id(p)][indices] = sg
-            inner_off += n
-        assert inner_off == block
-        off += block
-def _compute_ns(
-    owned_params: list[DTensor],
-    gathered_grads: dict[int, torch.Tensor | None],
-    ns_steps: int,
-) -> dict[int, torch.Tensor | None]:
-    """Run Newton-Schulz orthogonalization on owned parameters.
-    Returns:
-        computed_us: ``{id(p): orthogonalized_update}`` for owned params.
-    """
-    computed_us: dict[int, torch.Tensor | None] = {}
-    for p in owned_params:
-        u = _zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
-        gathered_grads[id(p)] = None  # free gathered grad
-        computed_us[id(p)] = u
-    return computed_us
-def _launch_scatter(
-    params: list[DTensor],
-    owned_params: list[DTensor],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-    num_ranks: int,
-    process_group: dist.ProcessGroup,
-    computed_us: dict[int, torch.Tensor | None],
-) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor], list[int]]:
-    """Allocate scatter buffers, build send/recv, and launch async all-to-all.
-    Returns:
-        work: Async operation handle.
-        recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
-        scattered_us: ``{id(p): empty_local_tensor}`` for all params.
-        recv_counts: Per-source-rank element counts.
-    """
-    # Allocate scattered-u buffers
-    scattered_us: dict[int, torch.Tensor] = {}
-    for p in params:
-        scattered_us[id(p)] = torch.empty_like(p.to_local(), dtype=COMM_DTYPE)
-    # Build send buffer (from computed_us on owner ranks)
-    per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
-    send_counts = [0] * num_ranks
-    if owned_params:
-        for p in owned_params:
-            state = param_to_state[id(p)]
-            assert computed_us[id(p)] is not None
-            u_full = computed_us[id(p)].to(COMM_DTYPE).contiguous()
-            total_sent = 0
-            for dst_rank in range(num_ranks):
-                indices = state.rank_indices[dst_rank]
-                su = u_full[indices].flatten()
-                n = su.numel()
-                assert n > 0
-                per_dst[dst_rank].append(su)
-                send_counts[dst_rank] += n
-                total_sent += n
-            assert total_sent == u_full.numel()
-    lengths = [len(v) for v in per_dst]
-    if all(l > 0 for l in lengths):
-        assert all(
-            l == lengths[0] for l in lengths
-        ), "All destination ranks must have the same number of sharded tensor"
-        per_dst_flat = [t for dst in per_dst for t in dst]
-        send_buf = torch.cat(per_dst_flat, dim=0)
-    else:
-        send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
-    # Build recv buffer
-    recv_counts = [0] * num_ranks
-    for src in range(num_ranks):
-        total = 0
-        for p in params:
-            state = param_to_state[id(p)]
-            if state.worker_rank != src:
-                continue
-            total += state.rank_numels[rank]
-        recv_counts[src] = total
-    recv_total = sum(recv_counts)
-    assert recv_total > 0
-    recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
-    # Launch async all-to-all
-    work = dist.all_to_all_single(
-        recv_buf,
-        send_buf,
-        output_split_sizes=recv_counts,
-        input_split_sizes=send_counts,
-        group=process_group,
-        async_op=True,
-    )
-    return work, recv_buf, scattered_us, recv_counts
-def _complete_scatter(
-    recv_buf: torch.Tensor,
-    recv_counts: list[int],
-    params: list[DTensor],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-    scattered_us: dict[int, torch.Tensor],
-) -> None:
-    """Copy recv buffer into scattered_us (in-place)."""
-    off = 0
-    for src in range(len(recv_counts)):
-        block = recv_counts[src]
-        if block == 0:
-            continue
-        inner_off = 0
-        for p in params:
-            state = param_to_state[id(p)]
-            if state.worker_rank != src:
-                continue
-            n = state.rank_numels[rank]
-            assert n > 0
-            flat_local = recv_buf.narrow(0, off + inner_off,
-                                         n).view_as(p.to_local())
-            scattered_us[id(p)].copy_(flat_local)
-            inner_off += n
-        assert inner_off == block
-        off += block
-def _update_params(
-    params: list[DTensor],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-    scattered_us: dict[int, torch.Tensor],
-    lr: float,
-    weight_decay: float,
-) -> None:
-    """Apply weight decay, Muon update, and optional QK clipping."""
-    for p in params:
-        state = param_to_state[id(p)]
-        u_dtensor = DTensor.from_local(
-            scattered_us[id(p)],
-            placements=p.placements,
-            device_mesh=p.device_mesh,
-        )
-        adjusted_lr = adjust_lr_for_muon(lr, p.shape)
-        update_p(p, u_dtensor, lr, adjusted_lr, weight_decay)
-        # QK clipping – applied directly on the local tensor to
-        # avoid DTensor sharding-propagation issues with _StridedShard.
-        scales_full = compute_scales(
-            p,
-            state.qk_clip_state) if state.qk_clip_state is not None else None
-        if scales_full is not None:
-            ratio = p.shape[0] // scales_full.shape[0]
-            idx0 = state.rank_indices[rank][0]
-            if isinstance(idx0, slice):
-                start = idx0.start or 0
-                idx0 = torch.arange(start,
-                                    idx0.stop,
-                                    device=scales_full.device)
-            row_scales = scales_full[idx0 // ratio]
-            p._local_tensor.mul_(row_scales.view(-1, 1))
-# ======================================================================
-# Main generator – thin orchestrator that wires stages together.
-# ======================================================================
-@torch.no_grad()
-def muon_chunk_pipeline(
-    params: list[DTensor],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-    ns_steps: int,
-    lr: float,
-    weight_decay: float,
-    none_grad: bool,
-) -> Generator[None, None, None]:
-    """Process one chunk of parameters through the full Muon pipeline.
-    Stages: gather -> compute (Newton-Schulz) -> scatter -> update.
-    Each ``yield`` lets :func:`run_pipeline` interleave other chunks so
-    that communication and computation overlap across chunks.  Async
-    communication is launched via ``async_op=True`` and completed after
-    the yield with ``work.wait()``.
-    Overlap happens because :func:`run_pipeline` admits one new chunk
-    per iteration (staggered admission).  While chunk *N* does NS
-    compute on the default CUDA stream, chunk *N+1*'s async all-to-all
-    runs concurrently on the NCCL stream — no separate ``comm_stream``
-    is required.
-    Yields exactly **2** times:
-    1. After launching async all-to-all gather.
-    2. After launching async all-to-all scatter.
-    """
-    process_group = param_to_state[id(params[0])].process_group
-    num_ranks = dist.get_world_size(group=process_group)
-    owned_params = [
-        p for p in params if param_to_state[id(p)].worker_rank == rank
-    ]
-    # Stages 1-2: launch async gather.
-    with record_function("muon::launch_gather"):
-        work, recv_buf, gathered_grads, recv_counts = _launch_gather(
-            params, owned_params, param_to_state, rank, num_ranks,
-            process_group)
-        if none_grad:
-            for p in params:
-                p.grad = None
-    yield  # --- YIELD 1: other chunks can launch their gather ---
-    with record_function("muon::wait_gather"):
-        work.wait()
-        _complete_gather(recv_buf, recv_counts, owned_params, gathered_grads,
-                         param_to_state, rank)
-        del recv_buf
-    # Stage 3: Newton-Schulz orthogonalization.
-    with record_function("muon::newton_schulz"):
-        computed_us = _compute_ns(owned_params, gathered_grads, ns_steps)
-        gathered_grads.clear()
-    # Stages 4-5: launch async scatter.
-    with record_function("muon::launch_scatter"):
-        work, recv_buf, scattered_us, recv_counts = _launch_scatter(
-            params, owned_params, param_to_state, rank, num_ranks,
-            process_group, computed_us)
-        computed_us.clear()
-    yield  # --- YIELD 2: other chunks can launch their scatter ---
-    with record_function("muon::wait_scatter"):
-        work.wait()
-        _complete_scatter(recv_buf, recv_counts, params, param_to_state, rank,
-                          scattered_us)
-        del recv_buf
-    # Stage 6: apply parameter updates.
-    with record_function("muon::update_params"):
-        _update_params(params, param_to_state, rank, scattered_us, lr,
-                       weight_decay)
-        scattered_us.clear()

build/torch210-cxx11-cu126-x86_64-linux/qk_clip.py DELETED Viewed

@@ -1,129 +0,0 @@
-import logging
-import math
-from dataclasses import dataclass
-import torch
-from torch.distributed.tensor import DTensor
-logger = logging.getLogger(__name__)
-def parse_qk_layer(name: str) -> tuple[str | None, int]:
-    """
-    Parse a parameter name to check if it is a query/key projection layer
-    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
-    Returns:
-        (kind, layer_idx) or (None, -1) if not matched.
-    Example:
-        'model.3.attn.wq.weight'      -> ('wq', 3)
-        'model.5.attn.wk.weight'      -> ('wk', 5)
-        'model.2.attn.q_proj.weight'  -> ('q_proj', 2)
-        'model.7.attn.k_proj.weight'  -> ('k_proj', 7)
-        'model.4.attn.v_proj.weight'  -> (None, -1)
-    """
-    parts = name.split('.')
-    if len(parts) < 3:
-        return None, -1
-    kind = parts[-2]
-    layer_idx = -1
-    for part in reversed(parts):
-        if part.isdigit():
-            layer_idx = int(part)
-            break
-    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
-        return kind, layer_idx
-    return None, -1
-@dataclass
-class QKClipInfo:
-    """Per-parameter dynamic info computed from config + runtime logits."""
-    kind: str | None  # 'wq'/'q_proj' or 'wk'/'k_proj' or None
-    indices: list[int]  # which heads to consider for clipping
-    head_dim: int  # from config
-    threshold: float  # from config
-    logit: torch.Tensor | None
-def get_qk_clip_info(clip_config, n, qk_logits):
-    """Extract QK clipping info for a named parameter.
-    Args:
-        clip_config: QK clipping configuration dict (or None).
-        n: Parameter name string.
-        qk_logits: Dict mapping layer indices to logit tensors (or None).
-    Returns:
-        QKClipInfo instance with clipping configuration for this parameter.
-    """
-    if clip_config is None:
-        return None
-    head_dim = clip_config.get('head_dim')
-    threshold = clip_config.get('threshold')
-    kind, layer_idx = parse_qk_layer(n)
-    logit, indices = None, []
-    if qk_logits is not None and kind is not None:
-        logit = qk_logits[layer_idx]
-        indices_key = 'q_indices' if 'q' in kind else 'k_indices'
-        indices = clip_config.get(indices_key, []) or []
-        if isinstance(logit, DTensor):
-            # In TP settings, qk_logits may be DTensor
-            # We convert it to full tensor here for simplicity
-            logit = logit.full_tensor()
-    return QKClipInfo(
-        kind=kind,
-        indices=indices,
-        head_dim=head_dim,
-        threshold=threshold,
-        logit=logit,
-    )
-def compute_scales(p, qk_clip_state):
-    """Compute per-head scaling factors for QK clipping.
-    Returns scales tensor if any head exceeds threshold, else None.
-    """
-    kind = qk_clip_state.kind
-    indices = qk_clip_state.indices
-    head_dim = qk_clip_state.head_dim
-    threshold = qk_clip_state.threshold
-    logit = qk_clip_state.logit
-    H_global = p.shape[0] // head_dim
-    scales_full = torch.ones(H_global, device=p.data.device)
-    scaling = 0
-    for logit_idx, head_idx in enumerate(indices):
-        v_ele = float(logit[logit_idx])
-        if v_ele > threshold:
-            new_scale = math.sqrt(threshold / v_ele)
-            if new_scale < scales_full[head_idx]:
-                scales_full[head_idx] = new_scale
-                logger.info(
-                    f"[{kind}] Head {head_idx} exceeded threshold "
-                    f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
-                )
-                scaling += 1
-    return scales_full if scaling > 0 else None
-def qk_clip(p, scales, head_dim):
-    """Apply per-head scaling to a Q/K projection weight matrix."""
-    if isinstance(p, torch.nn.Parameter):
-        W = p.data.view(-1, head_dim, p.data.shape[1])
-        W.mul_(scales.view(-1, 1, 1))
-    else:
-        W = p.view(-1, head_dim, p.shape[1])
-        W.mul_(scales.view(-1, 1, 1))

build/torch210-cxx11-cu128-x86_64-linux/adamw.py DELETED Viewed

@@ -1,154 +0,0 @@
-from collections import defaultdict
-from typing import cast
-import torch
-from torch.distributed.tensor import DTensor
-def fused_adamw(
-    params: list[torch.Tensor],
-    grads: list[torch.Tensor],
-    exp_avgs: list[torch.Tensor],
-    exp_avg_sqs: list[torch.Tensor],
-    max_exp_avg_sqs: list[torch.Tensor],
-    state_steps: list[torch.Tensor],
-    amsgrad: bool,
-    beta1: float,
-    beta2: float,
-    lr: float | torch.Tensor,
-    weight_decay: float,
-    eps: float,
-    maximize: bool,
-) -> None:
-    if not params:
-        return
-    # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
-    # treating it as a scalar.
-    lr_dict: dict | None = ({
-        lr.device: lr
-    } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else None)
-    grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
-        [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
-         state_steps]  # type: ignore[list-item]
-    )
-    for (device, _), (
-        (
-            device_params_,
-            device_grads_,
-            device_exp_avgs_,
-            device_exp_avg_sqs_,
-            device_max_exp_avg_sqs,
-            device_state_steps_,
-        ),
-            _,
-    ) in grouped_tensors.items():
-        device_params = cast(list[torch.Tensor], device_params_)
-        device_grads = cast(list[torch.Tensor], device_grads_)
-        device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
-        device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
-        device_state_steps = cast(list[torch.Tensor], device_state_steps_)
-        if lr_dict is not None and device not in lr_dict:
-            lr_dict[device] = lr.to(
-                device=device, non_blocking=True)  # type: ignore[union-attr]
-            lr = lr_dict[device]
-        torch._foreach_add_(device_state_steps, 1)
-        func = torch._fused_adamw_
-        func(
-            device_params,
-            device_grads,
-            device_exp_avgs,
-            device_exp_avg_sqs,
-            device_max_exp_avg_sqs,  # type: ignore[arg-type]
-            device_state_steps,
-            amsgrad=amsgrad,
-            lr=lr,  # type: ignore[arg-type]
-            beta1=beta1,
-            beta2=beta2,
-            weight_decay=weight_decay,
-            eps=eps,
-            maximize=maximize,
-        )
-def step_adamw_params(optimizer_state, params, group):
-    """Run fused AdamW on a list of parameters sharing the same placement.
-    Args:
-        optimizer_state: The optimizer's state dict (self.state in Muon).
-        params: List of parameters to update.
-        group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
-    """
-    params_with_grads = []
-    grads = []
-    moment1 = []
-    moment2 = []
-    max_exp_avg_sqs = []
-    state_steps = []
-    lr = group["lr"]
-    beta1, beta2 = group["adamw_betas"]
-    eps = group["adamw_eps"]
-    weight_decay = group["weight_decay"]
-    for p in params:
-        g = p.grad
-        if g is None:
-            continue
-        state = optimizer_state[p]
-        params_with_grads.append(p)
-        grads.append(g)
-        if "step" not in state:
-            state["step"] = (torch.zeros((),
-                                         dtype=torch.float32,
-                                         device=p.device))
-            state["moment1"] = torch.zeros_like(g)
-            state["moment2"] = torch.zeros_like(g)
-        moment1.append(state["moment1"])
-        moment2.append(state["moment2"])
-        if not isinstance(state["step"], torch.Tensor):
-            step_tensor = torch.tensor(state["step"],
-                                       dtype=torch.float32,
-                                       device=p.device)
-        else:
-            step_tensor = state["step"]
-        state_steps.append(step_tensor)
-    fused_adamw(
-        params_with_grads,
-        grads,
-        moment1,
-        moment2,
-        max_exp_avg_sqs,
-        state_steps,
-        amsgrad=False,
-        beta1=beta1,
-        beta2=beta2,
-        lr=lr,
-        weight_decay=weight_decay,
-        eps=eps,
-        maximize=False,
-    )
-def step_adamw(optimizer_state, group):
-    """Dispatch AdamW step, grouping parameters by type and placement.
-    Args:
-        optimizer_state: The optimizer's state dict (self.state in Muon).
-        group: Parameter group dict.
-    """
-    params = group["params"]
-    # group params with its type and placement
-    placement_to_params: dict[tuple, list[torch.Tensor]] = defaultdict(list)
-    for p in params:
-        match p:
-            case DTensor():
-                placement_to_params[tuple([p.placements,
-                                           p.device_mesh])].append(p)
-            case torch.Tensor():
-                placement_to_params[tuple([torch.Tensor, None])].append(p)
-    for group_params in placement_to_params.values():
-        step_adamw_params(optimizer_state, group_params, group)

build/torch210-cxx11-cu128-x86_64-linux/async_utils.py DELETED Viewed

@@ -1,77 +0,0 @@
-import logging
-from typing import Generator
logger = logging.getLogger(__name__)
class _Task:
    """Internal: wraps a generator, advances one yield at a time."""
    def __init__(self, generator: Generator[None, None, None], index: int):
        self._generator = generator
        self._index = index  # only used to label log messages
        self._steps_completed = 0
        self.step()  # run to first yield
    def step(self) -> bool:
        """Advance the generator by one stage.
        Returns:
            True if the generator yielded again, False if it finished.
            Any other exception raised by the generator propagates.
        """
        try:
            next(self._generator)
        except StopIteration:
            logger.debug("pipeline[%d] finished after %d stages", self._index,
                         self._steps_completed)
            return False
        self._steps_completed += 1
        logger.debug("pipeline[%d] completed stage %d", self._index,
                     self._steps_completed)
        return True
    def close(self):
        """Close the wrapped generator (no-op if it already finished)."""
        self._generator.close()
def run_pipeline(
    pipelines: Generator[Generator[None, None, None], None, None],
    max_concurrent: int,
) -> None:
    """Run generator-based pipelines with bounded concurrency.
    Each pipeline is a generator that yields at stage boundaries.
    The runtime interleaves pipelines so communication and computation
    overlap across chunks.
    Args:
        pipelines: Iterator producing one generator per pipeline.
        max_concurrent: Upper bound on the number of in-flight pipelines
            carried over between iterations; must be positive.
    Raises:
        ValueError: If ``max_concurrent`` is not positive.
        BaseException: Whatever a pipeline raises is re-raised after every
            in-flight pipeline has been closed.
    """
    if max_concurrent <= 0:
        raise ValueError(f"max_concurrent must be > 0, got {max_concurrent}")
    have_new = True
    task_index = 0
    previous_tasks: list[_Task] = []
    # Declared outside the loop so the error handler below can also see
    # tasks that were admitted or advanced in the failing iteration.
    running_tasks: list[_Task] = []
    try:
        while have_new or previous_tasks:
            running_tasks = []
            # Admit one new pipeline per iteration (staggered admission).
            # Admitting one at a time ensures that while chunk N does NS
            # compute on the default stream, chunk N+1's NCCL all-to-all
            # runs concurrently on the NCCL stream — creating real
            # communication/computation overlap on the GPU.
            if have_new and len(previous_tasks) < max_concurrent:
                try:
                    gen = next(pipelines)
                except StopIteration:
                    have_new = False
                else:
                    running_tasks.append(_Task(gen, task_index))
                    task_index += 1
            # Advance every previously-yielded task by one step.
            for task in previous_tasks:
                if task.step():
                    running_tasks.append(task)
            previous_tasks = running_tasks
    except BaseException:
        # Clean up all in-flight generators to release GPU resources.
        # A pipeline admitted in the failing iteration lives only in
        # running_tasks, so both lists must be closed; closing a generator
        # twice (or one that already finished) is harmless.
        for task in (*running_tasks, *previous_tasks):
            task.close()
        raise

build/torch210-cxx11-cu128-x86_64-linux/core.py DELETED Viewed

@@ -1,116 +0,0 @@
-import math
-from dataclasses import dataclass
-import torch
-import torch.distributed as dist
-from torch.distributed import ProcessGroup
-from torch.distributed.tensor import DTensor
@dataclass
class _muon_state:
    """Plain record of per-parameter fields used by the Muon code.
    Only the field layout is defined here; the code that fills and reads
    these records lives outside this block.
    """
    worker_rank: int
    process_group: ProcessGroup
    rank_indices: dict[int, tuple]  # local_rank -> per-dim indices
    rank_numels: dict[int, int]  # local_rank -> numel
    name: str
    # Optional tensor; defaults to None when not supplied.
    qk_clip_state: torch.Tensor | None = None
def update_g(optimizer_state, p, g, group, momentum):
    """Apply momentum update to gradient.
    The momentum buffer stored under ``optimizer_state[p]["momentum_buffer"]``
    is updated in place to ``g + momentum * buffer``.  With Nesterov enabled
    the gradient itself is then updated in place to ``g + momentum * buffer``
    (using the refreshed buffer) and returned.
    Args:
        optimizer_state: The optimizer's state dict (self.state in Muon).
        p: Parameter tensor.
        g: Gradient tensor (modified in place when ``group["nesterov"]``).
        group: Parameter group dict; ``group["nesterov"]`` is read.
        momentum: Momentum coefficient.
    Returns:
        Momentum-updated gradient tensor: ``g`` for Nesterov momentum,
        otherwise the momentum buffer itself.
    """
    state = optimizer_state[p]
    # Allocate the zero buffer only on first use; ``setdefault`` would build
    # a gradient-sized tensor on every call just to throw it away.
    if "momentum_buffer" not in state:
        state["momentum_buffer"] = torch.zeros_like(g)
    buf = state["momentum_buffer"]
    torch.add(g, buf, alpha=momentum, out=buf)
    if group["nesterov"]:
        g.add_(buf, alpha=momentum)
        return g
    return buf
def update_p(p, u, lr, adjusted_lr, weight_decay):
    """Decay the parameter in place, then subtract the scaled update.
    Args:
        p: Parameter (torch.nn.Parameter or DTensor).
        u: Orthogonalized update tensor.
        lr: Base learning rate (scales the weight decay).
        adjusted_lr: Size-adjusted learning rate (scales ``u``).
        weight_decay: Weight decay coefficient.
    """
    # nn.Parameter instances are modified through ``.data``; anything else
    # is modified directly.
    target = p.data if isinstance(p, torch.nn.Parameter) else p
    target.mul_(1 - lr * weight_decay)
    target.add_(u, alpha=-adjusted_lr)
def adjust_lr_for_muon(lr, param_shape):
    """Scale the learning rate by ``0.2 * sqrt(max(first two dims))``.
    Args:
        lr: Base learning rate.
        param_shape: Shape of the parameter tensor; only its first two
            entries are used.
    Returns:
        Adjusted learning rate.
    """
    rows, cols = param_shape[:2]
    # The ratio depends only on the larger of the two leading dimensions.
    scale = 0.2 * math.sqrt(max(rows, cols))
    return lr * scale
def default_is_muon(name, x, expert_keys=None):
    """Decide whether a named parameter should go to the Muon group.
    Args:
        name: Parameter name.
        x: Object exposing ``ndim`` (the parameter).
        expert_keys: Optional substrings; when one occurs in ``name`` the
            dimension count is reduced by one before the check.
    Returns:
        False when ``name`` contains one of the skip substrings, otherwise
        whether the (possibly reduced) dimension count is at least 2.
    """
    for skipped in ("embed_tokens", "lm_head", "tok_embeddings", "output"):
        if skipped in name:
            return False
    ndim = x.ndim
    if expert_keys and any(key in name for key in expert_keys):
        ndim -= 1
    return ndim >= 2
def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
    """Split a model's trainable parameters into Muon and non-Muon groups.
    Args:
        model: Object exposing ``named_parameters()``.
        is_muon_func: Optional ``(name, param) -> bool`` selector; when
            omitted, ``default_is_muon`` is used with ``expert_keys``.
        expert_keys: Forwarded to ``default_is_muon`` by the default selector.
    Returns:
        Two param-group dicts: the first holds the selected parameters with
        their names and ``use_muon=True``, the second holds the remaining
        trainable parameters with ``use_muon=False``.
    """
    selector = is_muon_func
    if selector is None:
        selector = lambda n, x: default_is_muon(n, x, expert_keys)
    muon_group = {"params": [], "names": [], "use_muon": True}
    other_group = {"params": [], "use_muon": False}
    for param_name, param in model.named_parameters():
        # Frozen parameters belong to neither group.
        if not param.requires_grad:
            continue
        if selector(param_name, param):
            muon_group["params"].append(param)
            muon_group["names"].append(param_name)
        else:
            other_group["params"].append(param)
    return [muon_group, other_group]

build/torch210-cxx11-cu128-x86_64-linux/distributed/utils.py DELETED Viewed

@@ -1,234 +0,0 @@
-import torch
-import torch.distributed as dist
-from torch.distributed import ProcessGroup
-from torch.distributed.device_mesh import DeviceMesh
-from torch.distributed.tensor import DTensor
-from torch.distributed.tensor.placement_types import (Placement, Shard,
-                                                      _StridedShard)
-def _is_shard(placement: Placement) -> bool:
-    """Check if a placement is a shard type (Shard or _StridedShard).
-    In PyTorch 2.10+, _StridedShard no longer inherits from Shard, so
-    ``placement.is_shard()`` returns False for _StridedShard.  This helper
-    handles both old and new hierarchies.
-    """
-    return isinstance(placement, (Shard, _StridedShard))
def get_slices_of_dtensor(
    target: DTensor | torch.Tensor,
    local_rank: int,
    shard_mesh: DeviceMesh,
    shard_placements: tuple[Placement],
) -> tuple[slice | torch.Tensor, ...]:
    """
    Get per-dimension indices for a given rank's shard of the target tensor.
    Uses ``Shard.local_shard_size_and_offset`` and
    ``_StridedShard.local_shard_size_and_offset`` for correct handling of
    both contiguous and strided (non-contiguous) sharding.
    Args:
        target (DTensor | torch.Tensor): The target tensor (for its shape).
        local_rank (int): The local rank within the shard group.
        shard_mesh (DeviceMesh): The shard mesh (only shard dimensions).
        shard_placements (tuple[Placement]): The shard placements.
    Returns:
        A tuple of indices (one per tensor dim).  Each element is either:
        - A ``slice`` (for contiguous or unsharded dims)
        - A 1-D ``torch.LongTensor`` of indices (for strided sharding)
    Raises:
        NotImplementedError: If the current (possibly already sub-sharded)
            size of a sharded dim is not divisible by the mesh dim size.
    """
    # find the global rank of the local rank in the shard mesh
    # (local_rank indexes the ascending-sorted list of ranks in the mesh)
    rank = sorted(shard_mesh.mesh.flatten().tolist())[local_rank]
    # Coordinates of that rank inside the mesh tensor, one per mesh dim.
    rank_coords = (shard_mesh.mesh == rank).nonzero()
    assert len(rank_coords) == 1
    rank_coords = tuple(rank_coords[0].tolist())
    assert len(rank_coords) == len(shard_placements)
    # Track per-shard-dim indices.
    # A tensor dim that is absent from this dict has not been sharded yet.
    dim_indices: dict[int, torch.Tensor] = {}
    # Caution: Assuming replicate-to-shard of the shard mesh goes with
    # left-to-right sharding. This is ensured by the sorting logic of
    # construct_shard_mesh function.
    for mesh_dim_idx, (rank_coord, placement) in enumerate(
            zip(rank_coords, shard_placements)):
        assert _is_shard(placement)
        num_chunks = shard_mesh.mesh.shape[mesh_dim_idx]
        shard_dim = placement.dim
        # Current effective size on this dim (may already be sub-sharded)
        if shard_dim in dim_indices:
            curr_size = len(dim_indices[shard_dim])
        else:
            curr_size = target.size()[shard_dim]
        if curr_size % num_chunks != 0:
            raise NotImplementedError(
                f"Dimension size {curr_size} is not divisible "
                f"by number of ranks {num_chunks} for shard "
                f"placement on dim {shard_dim}. (shape: {target.shape})")
        # Compute indices for this level of sharding
        if isinstance(placement, _StridedShard):
            # NOTE(review): with return_first_offset=False the second return
            # value is used as a full list of offsets — confirm against the
            # installed PyTorch version, this is a private API.
            _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
                placement,
                curr_size,
                num_chunks,
                rank_coord,
                return_first_offset=False)
            new_indices = torch.tensor(offsets, dtype=torch.long)
        else:
            shard_size, offset = Shard.local_shard_size_and_offset(
                curr_size, num_chunks, rank_coord)
            new_indices = torch.arange(offset,
                                       offset + shard_size,
                                       dtype=torch.long)
        # Compose with previous indices on this dim
        # (new_indices are positions within the already-selected indices)
        if shard_dim in dim_indices:
            dim_indices[shard_dim] = dim_indices[shard_dim][new_indices]
        else:
            dim_indices[shard_dim] = new_indices
    # Build result tuple
    result: list[slice | torch.Tensor] = []
    for d in range(len(target.size())):
        if d not in dim_indices:
            # Unsharded dim: take everything.
            result.append(slice(None))
        else:
            indices = dim_indices[d]
            # Convert contiguous indices to slice for efficiency
            if len(indices) > 0:
                start = indices[0].item()
                expected = torch.arange(start,
                                        start + len(indices),
                                        dtype=torch.long)
                if torch.equal(indices, expected):
                    result.append(slice(start, start + len(indices)))
                else:
                    result.append(indices)
            else:
                # Empty shard on this dim.
                result.append(slice(0, 0))
    return tuple(result)
# Process-wide cache keyed by (*mesh_shape, *flattened_ranks) so each shard
# sub-mesh gets its DeviceMesh/ProcessGroup pair created only once.
_ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
                                                  ProcessGroup]] = dict()
def construct_shard_mesh(
    placements: tuple[Placement],
    mesh: DeviceMesh,
) -> tuple[DeviceMesh, ProcessGroup, tuple[Placement, ...]]:
    """Construct shard sub-mesh and ProcessGroup for all-to-all communication.
    Given a DTensor's placements and device mesh, extracts the "shard group"
    — the set of ranks that together hold all shards of the same replica —
    and creates a ProcessGroup for all-to-all among them.
    Steps:
        1. Sort placements: Replicate first, then Shard by (dim, granularity).
        2. Permute the mesh tensor to match the sorted order.
        3. Collapse Replicate dims → list of shard sub-meshes (one per replica).
        4. Create/retrieve a cached ProcessGroup for the current rank's sub-mesh.
    Example — 8 GPUs, mesh shape (2, 2, 2),
              placements ``[Shard(0), Replicate, _StridedShard(0)]``::
        Step 1 — Sort: [Replicate, _StridedShard(0), Shard(0)]
                 Permutation: [1, 2, 0]
        Step 2 — Permute mesh dims by [1, 2, 0]:
                 Original:                Permuted:
                 [[[0,1],[2,3]],          [[[0,4],[1,5]],
                  [[4,5],[6,7]]]           [[2,6],[3,7]]]
        Step 3 — Unbind replicate dim (dim 0), giving 2 shard sub-meshes:
                 sub-mesh 0 = [[0,4],[1,5]]  (replica group 0)
                 sub-mesh 1 = [[2,6],[3,7]]  (replica group 1)
                 shard_placements = (_StridedShard(0), Shard(0))
        Step 4 — Rank 0 → ProcessGroup([0,4,1,5])
                 Rank 2 → ProcessGroup([2,6,3,7])
    Args:
        placements: Placements of the DTensor, one per mesh dimension.
        mesh: Device mesh of the DTensor; its mesh tensor must live on CPU.
    Returns:
        ``(shard_mesh, process_group, shard_placements)``
    """
    my_rank = dist.get_rank()
    assert mesh.mesh.device.type == 'cpu'
    # -- Fast path: 1D all-shard mesh → reuse existing PG. ----------------
    # This avoids a non-collective dist.new_group() call, which would
    # deadlock when only a subset of ranks call this function (e.g. expert
    # DTensors on a TP submesh where ranks 0-3 and 4-7 call separately).
    if mesh.ndim == 1 and len(placements) == 1 and _is_shard(placements[0]):
        key = (*mesh.mesh.shape, *mesh.mesh.flatten().tolist())
        if key not in _ranks_to_dist_cache:
            _ranks_to_dist_cache[key] = (mesh, mesh.get_group())
        return (*_ranks_to_dist_cache[key], tuple(placements))
    mesh_tensor = mesh.mesh.clone()
    # -- Step 1: Sort placements (Replicate first, then Shard by dim). ------
    # _StridedShard comes BEFORE regular Shard on the same dim so that
    # get_slices_of_dtensor applies the outer sharding first, matching
    # DTensor's left-to-right (outer-to-inner) composition order.
    def _sort_key(item):
        index, placement = item
        assert not placement.is_partial(), "Partial placement not supported"
        if placement.is_replicate():
            return (-1, 0, index)
        assert _is_shard(placement), f"Unsupported: {type(placement)}"
        # Negative for _StridedShard, 0 for Shard → strided sorts first; the
        # original mesh-dim index breaks remaining ties.
        split = (-1 / placement.split_factor if isinstance(
            placement, _StridedShard) else 0)
        return (placement.dim, split, index)
    indexed = sorted(enumerate(placements), key=_sort_key)
    perm, sorted_placements = zip(*indexed)
    # -- Step 2: Permute mesh to match sorted placement order. --------------
    sorted_mesh = mesh_tensor.permute(perm)
    # -- Step 3: Collapse replicate dims → list of shard sub-meshes. --------
    # E.g. mesh (2, 3, 4, 4) with [R, R, S(0), S(1)] → 6 sub-meshes of (4, 4)
    num_rep = sum(1 for p in sorted_placements if p.is_replicate())
    if num_rep > 0:
        if num_rep > 1:
            sorted_mesh = sorted_mesh.flatten(0, num_rep - 1)
        shard_meshes = list(torch.unbind(sorted_mesh, dim=0))
    else:
        shard_meshes = [sorted_mesh]
    shard_placements = sorted_placements[num_rep:]
    # Identical shard placements on two mesh dims are not supported.
    assert len(shard_placements) == len(set(shard_placements))
    # -- Step 4: Create/retrieve ProcessGroup for current rank's sub-mesh. --
    # All ranks must call dist.new_group in the same order, even though each
    # rank only joins one group.
    def _cache_key(t: torch.Tensor) -> tuple:
        return (*t.shape, *t.flatten().tolist())
    my_key = None
    for sm in shard_meshes:
        key = _cache_key(sm)
        if (my_rank == sm).any().item():
            assert my_key is None, "Rank appears in multiple shard groups"
            my_key = key
        if key not in _ranks_to_dist_cache:
            pg = dist.new_group(sm.flatten().tolist())
            # NOTE(review): device_type is hard-coded to "cuda" here.
            _ranks_to_dist_cache[key] = (
                DeviceMesh(device_type="cuda", mesh=sm),
                pg,
            )
    return (*_ranks_to_dist_cache[my_key], shard_placements)

build/torch210-cxx11-cu128-x86_64-linux/matmul_transpose_triton.py DELETED Viewed

@@ -1,121 +0,0 @@
-# MIT License
-#
-# Copyright (c) 2025 Tianyang Lin
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-import torch
-import triton
-import triton.language as tl
def get_autotune_config():
    """Build the list of Triton autotune configurations for ``mmt_kernel``.
    Returns:
        One ``triton.Config`` per combination of block sizes, group size,
        pipeline stages and warp count, with the block-M size varying slowest
        and the warp count varying fastest.
    """
    configs = []
    for blk_m in [32, 64, 128]:
        for blk_k in [32, 64]:
            for grp_sz in [8]:
                for n_stages in [3, 4, 5]:
                    for n_warps in [4, 8]:
                        meta = {
                            'BLOCK_SIZE_M': blk_m,
                            'BLOCK_SIZE_K': blk_k,
                            'GROUP_SIZE_M': grp_sz
                        }
                        configs.append(
                            triton.Config(meta,
                                          num_stages=n_stages,
                                          num_warps=n_warps))
    return configs
# Autotuning is re-run whenever the (M, K) problem size changes.
@triton.autotune(
    configs=get_autotune_config(),
    key=['M', 'K'],
)
@triton.jit
def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
               BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
               GROUP_SIZE_M: tl.constexpr):
    """
    Core kernel jit function of matmul_transpose that computes y = x @ x.T
    The code is a simple adaptation from the triton `matmul` tutorial:
    https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
    Each program handles one BLOCK_SIZE_M x BLOCK_SIZE_M output tile.  Only
    tiles with ``pid_m <= pid_n`` are computed; off-diagonal tiles are also
    written transposed to the mirrored position.
    Args:
        x: Pointer to the (M, K) input.
        y: Pointer to the (M, M) output.
        M, K: Input dimensions.
        stride_xm, stride_xk: Strides of ``x`` along rows / columns.
        stride_ym, stride_yn: Strides of ``y`` along rows / columns.
        BLOCK_SIZE_M, BLOCK_SIZE_K, GROUP_SIZE_M: Compile-time tile parameters.
    """
    # Map the 1-D program id to a (pid_m, pid_n) tile, in groups of
    # GROUP_SIZE_M tile rows.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m
    # Tiles below the diagonal are produced by the mirrored program.
    if pid_m > pid_n:
        return
    # Row offsets are wrapped with % M so loads stay inside x; rows past M
    # are dropped later by the store mask.
    offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # we use a & b ptrs to denote different rows of x.
    a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
    b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Columns past K in the last block are loaded as 0.0.
        a = tl.load(a_ptrs,
                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
                    other=0.0)
        b = tl.load(b_ptrs,
                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
                    other=0.0)
        accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
        a_ptrs += BLOCK_SIZE_K * stride_xk
        b_ptrs += BLOCK_SIZE_K * stride_xk
    # use dtype.element_ty to accommodate different input datatypes as in cpp templates
    # https://github.com/triton-lang/triton/issues/2252
    c = accumulator.to(x.dtype.element_ty)
    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
    tl.store(c_ptrs, c, mask=c_mask)
    # transpose and copy
    # (diagonal tiles, pid_m == pid_n, need no mirrored store)
    if pid_m < pid_n:
        ct_ptrs = y + stride_ym * offs_cn[:,
                                          None] + stride_yn * offs_cm[None, :]
        ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
        tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
def matmul_transpose_assign(d_in, d_out):
    """Write ``d_in @ d_in.T`` into ``d_out`` using ``mmt_kernel``.
    Args:
        d_in: 2D CUDA tensor of shape (M, K); made contiguous before launch.
        d_out: 2D CUDA tensor of shape (M, M) on the same device and with the
            same dtype as ``d_in``; it is overwritten in place.
    Raises:
        AssertionError: If any of the device, dtype, rank or shape
            requirements above is violated.
    """
    assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
    assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
    assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
    assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
    assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
    assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
    # d_out must be square (M, M); the second comparison has to look at
    # d_out.size(1), otherwise a non-square d_out passes unchecked.
    assert d_in.size(0) == d_out.size(0) == d_out.size(1), \
            "First dimension of `d_in` must match first and second dimension of `d_out`"
    d_in = d_in.contiguous()
    M, K = d_in.shape
    # One program per (BLOCK_SIZE_M x BLOCK_SIZE_M) output tile.
    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
        M, META['BLOCK_SIZE_M']), )
    # Launch on the device that holds the inputs.
    with torch.cuda.device(d_in.device.index):
        mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
                         d_out.stride(0), d_out.stride(1))

build/torch210-cxx11-cu128-x86_64-linux/metadata.json DELETED Viewed

@@ -1,3 +0,0 @@
-{
-  "python-depends": []
-}

build/torch210-cxx11-cu128-x86_64-linux/muon.py DELETED Viewed

@@ -1,594 +0,0 @@
-import logging
-import types
-from collections import defaultdict
-from typing import Any
-import torch
-import torch.distributed as dist
-from torch.distributed.tensor import DTensor, Replicate, Shard
-from torch.profiler import record_function
-from .adamw import step_adamw
-from .async_utils import run_pipeline
-from .core import (_muon_state, adjust_lr_for_muon,
-                   get_default_muon_param_groups, update_g, update_p)
-from .distributed.utils import (_is_shard, construct_shard_mesh,
-                                get_slices_of_dtensor)
-from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
-                            _zeropower_via_newtonschulz5)
-from .pipeline import muon_chunk_pipeline
-from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
-logger = logging.getLogger(__name__)
def _expand_expert_params(names, params, expert_keys):
    """Expand expert params by splitting on dim 0 (expert dimension).
    Params whose name matches any key in ``expert_keys`` are treated as
    expert-parallel tensors.  Their outermost dimension is the expert
    dimension: an ``(E, out, in)`` tensor becomes ``E`` separate 2D
    ``nn.Parameter`` views so that in-place updates propagate back to
    the original storage.
    Non-expert params with ``ndim > 2`` trigger an ``AssertionError`` —
    if they are expert params, their key must be added to ``expert_keys``.
    The grad must already be set on each expert param (e.g. after momentum).
    For DTensor expert params, placements that shard on dim 0 (expert dim)
    are consumed by the split.  Non-dim-0 shard placements (e.g. TP) are
    preserved: each 2D slice is wrapped as a DTensor on the corresponding
    submesh so the parallel pipeline handles the TP communication.
    Args:
        names: Parameter names, aligned with ``params``.
        params: Parameters to expand.
        expert_keys: Substrings identifying expert params; may be None/empty,
            in which case no param is treated as an expert param.
    Returns:
        ``(expanded_names, expanded_params)``.  Non-expert entries are passed
        through unchanged; each expert entry is replaced by one entry per
        local expert, named ``"{name}[{i}]"``.
    Side effects:
        ``p.grad`` of every expanded expert param is set to None.
    """
    expanded_names = []
    expanded_params = []
    for n, p in zip(names, params):
        is_expert = expert_keys and any(key in n for key in expert_keys)
        is_dtensor = isinstance(p.data, DTensor)
        if not is_expert:
            assert p.data.ndim <= 2, (
                f"Param {n} has ndim={p.data.ndim} but does not match "
                f"expert_keys={expert_keys}. If this is an expert param, "
                f"add its key to expert_keys.")
            expanded_names.append(n)
            expanded_params.append(p)
            continue
        g = p.grad
        assert g is not None, (
            f"Expert param {n} must have grad set before expansion")
        # Stay None unless a DTensor param has non-dim-0 shard placements.
        tp_mesh = None
        tp_placements_2d = None
        if is_dtensor:
            local_data = p.to_local()
            local_grad = g.to_local() if isinstance(g, DTensor) else g
            # Find non-dim-0 shard placements (e.g. TP sharding).
            # After splitting on dim 0, Shard(k) becomes Shard(k-1).
            tp_dim_indices = []
            tp_placements_2d = []
            for i, pl in enumerate(p.placements):
                if _is_shard(pl) and pl.dim != 0:
                    tp_dim_indices.append(i)
                    tp_placements_2d.append(Shard(pl.dim - 1))
            if tp_dim_indices:
                # Select the submesh by mesh-dim name(s); this requires the
                # device mesh to have mesh_dim_names set.
                tp_dim_names = tuple(p.device_mesh.mesh_dim_names[i]
                                     for i in tp_dim_indices)
                if len(tp_dim_names) == 1:
                    tp_mesh = p.device_mesh[tp_dim_names[0]]
                else:
                    tp_mesh = p.device_mesh[tp_dim_names]
        else:
            local_data = p.data
            local_grad = g
        # Expand: split dim 0, reshape each slice to 2D.
        num_local_experts = local_data.shape[0]
        for i in range(num_local_experts):
            # Indexing yields views, so updates reach the original storage.
            slice_data = local_data[i]
            slice_grad = local_grad[i]
            if tp_mesh is not None:
                # Wrap as DTensor on TP submesh so the pipeline handles
                # TP communication (gather/scatter across TP ranks).
                dt_data = DTensor.from_local(slice_data,
                                             device_mesh=tp_mesh,
                                             placements=tp_placements_2d)
                dt_grad = DTensor.from_local(slice_grad,
                                             device_mesh=tp_mesh,
                                             placements=tp_placements_2d)
                expert_param = torch.nn.Parameter(dt_data, requires_grad=False)
                expert_param.grad = dt_grad
            else:
                expert_param = torch.nn.Parameter(slice_data,
                                                  requires_grad=False)
                expert_param.grad = slice_grad
            expanded_names.append(f"{n}[{i}]")
            expanded_params.append(expert_param)
        p.grad = None  # allow expert grad storage to be freed after pipeline
    return expanded_names, expanded_params
-class Muon(torch.optim.Optimizer):
-    """
-    Muon - MomentUm Orthogonalized by Newton-schulz
-    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
-    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
-    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
-    the advantage that it can be stably run in bfloat16 on the GPU.
-    Some warnings:
-    - We believe this optimizer is unlikely to work well for training with small batch size.
-    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
-    Arguments:
-        model: The model to be optimized by Muon.
-        is_muon_func: A function that takes a parameter and its name, and returns whether the parameter should be optimized by Muon.
-        lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
-        momentum: The momentum used by the internal SGD. (0.95 is a good default)
-        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
-        ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
-        weight_decay: The weight decay for Muon and AdamW.
-            Parameters that are {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW instead.
-        adamw_lr: The learning rate for the internal AdamW.
-        adamw_betas: The betas for the internal AdamW.
-        adamw_eps: The epsilon for the internal AdamW.
-        none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
-        debug: Whether to print debug information.
-        clip_info : Configuration for QK clipping. Expected keys:
-            - "q_indices" (list[int]): Indices of query heads to consider.
-            - "k_indices" (list[int]): Indices of key heads to consider.
-            - "head_dim" (int): Dimensionality of each attention head.
-            - "threshold" (float): Threshold value; heads whose QK logits exceed
-            this value will be scaled down.
-            Default is:
-                {
-                    "q_indices": [],
-                    "k_indices": [],
-                    "head_dim": 128,
-                    "threshold": 100
-                }
-        warmup_step : How many all2all gather, compute operations are launched in advance
-                      before the corresponding all2all scatter steps begin.
-                      A higher warmup_step increases memory usage but can improve
-                      performance by overlapping communication.
-                      Parallel muon only.
-        chunk_size : Batch size of parameters to process in each
-                     all2all gather/compute/scatter step.
-                     Use shard ranks * DEFAULT_CHUNK_SIZE_RATIO when -1 is specified.
-        use_distributed_muon: Use distributed muon by Liu et al. (2024).
-                              For testing purpose only.
-        small_param_numel_threshold: Threshold for classifying parameters as small and falling back to distributed Muon
-        expert_keys: List of strings to identify expert-parallel parameters.
-                     If any key appears in a parameter's name, its outermost
-                     dimension is treated as the expert dimension and expanded
-                     into per-expert 2D params for Muon.  For example,
-                     ``expert_keys=["experts"]`` matches any param whose name
-                     contains "experts".  3D+ params not matched by any key
-                     will raise an error.
-    """
-    def __init__(self,
-                 params,
-                 lr=1e-3,
-                 momentum=0.95,
-                 nesterov=True,
-                 ns_steps=5,
-                 weight_decay=0.1,
-                 adamw_betas=(0.9, 0.95),
-                 adamw_eps=1e-8,
-                 none_grad=True,
-                 debug=False,
-                 clip_config=None,
-                 warmup_step=5,
-                 chunk_size=-1,
-                 use_distributed_muon=False,
-                 small_param_numel_threshold=65536,
-                 expert_keys=None):
-        defaults = dict(
-            lr=lr,
-            weight_decay=weight_decay,
-            momentum=momentum,
-            nesterov=nesterov,
-            ns_steps=ns_steps,
-            adamw_betas=adamw_betas,
-            adamw_eps=adamw_eps,
-            none_grad=none_grad,
-            use_muon=True,
-        )
-        error_message = "The key 'use_muon' is not set in parameter group {idx}. Assuming all parameters in the group will use muon optimization, which may lead to unexpected behavior."
-        instruction_code = "\n\n please follow this code snippet \n```optimizer = get_kernel('motif-technologies/optimizer')\n\n\nparams = optimizer.muon.get_default_muon_param_groups(model)\n\noptim = optimizer.Muon(params, ...)```"
-        if isinstance(params, types.GeneratorType):
-            raise ValueError(error_message.format(idx=0) + instruction_code)
-        for _idx, param_group in enumerate(params):
-            if param_group.get("use_muon", None) is None:
-                raise ValueError(
-                    error_message.format(idx=_idx) + instruction_code)
-        super().__init__(params, defaults)
-        self.debug = debug
-        self.clip_config = clip_config if clip_config is not None else {
-            "q_indices": [],
-            "k_indices": [],
-            "head_dim": 128,
-            "threshold": 100,
-        }
-        self.warmup_step = warmup_step
-        self.chunk_size = chunk_size
-        self.use_distributed_muon = use_distributed_muon
-        self.small_param_numel_threshold = small_param_numel_threshold
-        self.expert_keys = expert_keys
-    def _calc_flops(self, G, steps):
-        assert len(G.shape) == 2
-        M, N = G.shape
-        if M > N:
-            M, N = N, M
-        return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
-    def get_shard_mesh(self, p):
-        """
-        Get the shard mesh for a parameter p on the given rank.
-        """
-        assert isinstance(
-            p, DTensor), "Parallel Muon only supports DTensor parameters."
-        shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
-            p.placements, p.device_mesh)
-        return shard_mesh, shard_pg, shard_placements
-    def init_state_and_assign_params(self, names, params, group, qk_logits):
-        """Order parameters by estimated cost and build their per-parameter state.
-        Parameters are sorted by their Newton-Schulz FLOP estimate (largest
-        first) and assigned a worker rank round-robin over the flattened shard
-        mesh.  For every parameter, the indices and element counts that each
-        rank holds are precomputed.
-        Args:
-            names: Parameter names, aligned with ``params``.
-            params: Parameters to process; ``device_mesh`` and ``placements``
-                are read from each of them.
-            group: Parameter group; only ``group["ns_steps"]`` is read here.
-            qk_logits: Forwarded unchanged to ``get_qk_clip_info``.
-        Returns:
-            ``(param_to_state, ordered_params)``: a dict mapping ``id(p)`` to a
-            ``_muon_state`` and the FLOP-sorted list of parameters.
-        Raises:
-            ValueError: If a parameter's mesh or placements differ from those
-                of the first parameter in the sorted order.
-        """
-        param_to_state = {}
-        param_to_flops = {}
-        total_flops = 0
-        for p in params:
-            g = p.grad
-            if g is None:
-                # NOTE(review): a parameter skipped here has no entry in
-                # ``param_to_flops``, so the sort key below would raise
-                # KeyError for it; callers appear to pre-filter such params.
-                continue
-            assert g.ndim == 2, "Muon only supports 2D parameters."
-            flops = self._calc_flops(g, group["ns_steps"])
-            param_to_flops[id(p)] = flops
-            total_flops += flops
-        if self.debug:
-            logger.debug("Total TFLOPs for Muon: %.2f TFLOPs",
-                         total_flops / 1e12)
-        # Sort (name, param) pairs together so names stay aligned with params.
-        paired = list(zip(names, params))
-        paired_sorted = sorted(paired,
-                               key=lambda x: param_to_flops[id(x[1])],
-                               reverse=True)
-        names_sorted, params_sorted = zip(*paired_sorted)
-        ordered_names = list(names_sorted)
-        ordered_params = list(params_sorted)
-        round_robin = 0
-        # The first sorted parameter serves as the reference for mesh and
-        # placements; its shard mesh / process group is reused for all others.
-        mesh = ordered_params[0].device_mesh
-        placements = ordered_params[0].placements
-        shard_mesh, shard_pg, shard_placements = self.get_shard_mesh(
-            ordered_params[0])
-        shard_mesh_flattened = shard_mesh.mesh.flatten()
-        num_ranks = dist.get_world_size(group=shard_pg)
-        for n, p in zip(ordered_names, ordered_params):
-            if mesh != p.device_mesh:
-                raise ValueError("All parameters must be on the same mesh.")
-            if placements != p.placements:
-                raise ValueError("All parameters must have same placements.")
-            # Walk the flattened shard mesh cyclically to pick the worker rank.
-            worker_rank = shard_mesh_flattened[round_robin].item() % num_ranks
-            round_robin = (round_robin + 1) % len(shard_mesh_flattened)
-            qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
-            # Precompute per-rank indices and numels for all-to-all.
-            rank_indices: dict[int, tuple] = {}
-            rank_numels: dict[int, int] = {}
-            for r in range(num_ranks):
-                indices = get_slices_of_dtensor(p, r, shard_mesh,
-                                                shard_placements)
-                rank_indices[r] = indices
-                # Element count is the product of the selected extent per dim.
-                numel = 1
-                for idx, dim_size in zip(indices, p.shape):
-                    if isinstance(idx, slice):
-                        # Resolve the slice against the dim size, then count
-                        # its elements with a ceiling division by the step.
-                        start, stop, step = idx.indices(dim_size)
-                        numel *= max(0, (stop - start + (step - 1)) // step)
-                    else:
-                        # Non-slice entries are sized with ``len``.
-                        numel *= len(idx)
-                rank_numels[r] = numel
-            param_to_state[id(p)] = _muon_state(
-                worker_rank=worker_rank,
-                process_group=shard_pg,
-                rank_indices=rank_indices,
-                rank_numels=rank_numels,
-                name=n,
-                qk_clip_state=qk_clip_state,
-            )
-        return param_to_state, ordered_params
-    def base(self, names, params, group, lr, weight_decay, qk_logits):
-        # Momentum is already applied by _step_muon before this method.
-        for n, p in zip(names, params):
-            g = p.grad
-            if g is None:
-                continue
-            u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
-                                             steps=group["ns_steps"])
-            adjusted_lr = adjust_lr_for_muon(lr, p.shape)
-            update_p(p, u, lr, adjusted_lr, weight_decay)
-            qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
-            scales_full = compute_scales(
-                p, qk_clip_state) if qk_clip_state is not None else None
-            if scales_full is not None:
-                qk_clip(p, scales_full, qk_clip_state.head_dim)
-    def distributed_muon(
-        self,
-        names: list[str],
-        params: list[torch.nn.Parameter],
-        group: dict[str, Any],
-        lr: float,
-        weight_decay: float,
-        qk_logits: list[torch.Tensor | DTensor] | None,
-    ):
-        """ Implementation of Distributed Muon by Liu et al. """
-        # Momentum is already applied by _step_muon before this method.
-        for n, p in zip(names, params):
-            g = p.grad
-            if g is None:
-                continue
-            # Gather G
-            if isinstance(p.data, DTensor):
-                g_full = g.full_tensor()
-                p_full = p.data.full_tensor()
-            else:
-                g_full = g
-                p_full = p
-            u_full = _zeropower_via_newtonschulz5(g_full.to(COMM_DTYPE),
-                                                  steps=group["ns_steps"])
-            adjusted_lr = adjust_lr_for_muon(lr, p_full.shape)
-            update_p(p_full, u_full, lr, adjusted_lr, weight_decay)
-            qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
-            scales_full = compute_scales(
-                p_full, qk_clip_state) if qk_clip_state is not None else None
-            if scales_full is not None:
-                qk_clip(p_full, scales_full, qk_clip_state.head_dim)
-            if isinstance(p.data, DTensor):
-                ndims = len(p.device_mesh.mesh.shape)
-                p_replicate = DTensor.from_local(
-                    p_full,
-                    device_mesh=p.device_mesh,
-                    placements=[Replicate() for _ in range(ndims)],
-                )
-                p_sharded = p_replicate.redistribute(
-                    device_mesh=p.device_mesh,
-                    placements=p.placements,
-                )
-                p.copy_(p_sharded)
-    def parallel(self, names, params, group, lr, weight_decay, qk_logits):
-        """
-        Perform a parallel optimization step using Muon.
-        Parameters are chunked and each chunk is processed by a
-        :func:`muon_chunk_pipeline` generator.  :func:`run_pipeline`
-        interleaves multiple chunks so that communication and computation
-        overlap across chunks (the same overlap previously achieved by the
-        warmup + main-loop index scheduling).
-        """
-        # Momentum is already applied by _step_muon before this method.
-        param_to_state, ordered_params = self.init_state_and_assign_params(
-            names, params, group, qk_logits)
-        # Compute local rank for this group's shard process group.
-        shard_pg = param_to_state[id(ordered_params[0])].process_group
-        rank = dist.get_rank(group=shard_pg)
-        if self.chunk_size == -1:
-            shard_ranks = dist.get_world_size(param_to_state[id(
-                ordered_params[0])].process_group)
-            chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
-        elif self.chunk_size > 0:
-            chunk_size = self.chunk_size
-        else:
-            raise ValueError("chunk_size must be -1 or a positive integer.")
-        def pipelines():
-            for start in range(0, len(ordered_params), chunk_size):
-                chunk = ordered_params[start:start + chunk_size]
-                if chunk:
-                    yield muon_chunk_pipeline(
-                        params=chunk,
-                        param_to_state=param_to_state,
-                        rank=rank,
-                        ns_steps=group["ns_steps"],
-                        lr=lr,
-                        weight_decay=weight_decay,
-                        none_grad=group["none_grad"],
-                    )
-        with record_function("muon::barrier"):
-            dist.barrier()
-        with record_function("muon::pipeline"):
-            run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
-    def _step_muon(self, group, qk_logits=None):
-        """Run the Muon update for one parameter group.
-        Steps: apply momentum to every gradient, expand expert parameters,
-        then route each parameter to ``distributed_muon``, ``parallel`` or
-        ``base`` depending on its type, placements and size.
-        Args:
-            group: Parameter group; reads ``params``, ``lr``,
-                ``weight_decay``, ``momentum`` and ``names``.
-            qk_logits: Forwarded unchanged to the update routines.
-        Raises:
-            TypeError: If a parameter's ``data`` is neither a DTensor nor a
-                ``torch.Tensor``.
-            RuntimeError: If sharded DTensor parameters are present but
-                ``torch.distributed`` is not initialized.
-        """
-        params = group["params"]
-        lr = group["lr"]
-        weight_decay = group["weight_decay"]
-        momentum = group["momentum"]
-        names = group["names"]
-        # Apply momentum to all params before routing/expansion.
-        with record_function("muon::momentum"):
-            for n, p in zip(names, params):
-                g = p.grad
-                if g is None:
-                    continue
-                g = update_g(self.state, p, g, group, momentum)
-                # Store the result back so later stages read it from p.grad.
-                p.grad = g
-        # Expand expert params by splitting on dim 0.
-        names, params = _expand_expert_params(names, params, self.expert_keys)
-        # Routing buckets: sharded DTensors (parallel path), plain or fully
-        # replicated tensors (base path), and small sharded DTensors
-        # (distributed path).
-        param_dtensors = []
-        name_dtensors = []
-        param_tensors = []
-        name_tensors = []
-        param_dtensors_small = []
-        name_dtensors_small = []
-        if self.use_distributed_muon:
-            # Forced mode: every parameter goes through distributed_muon.
-            self.distributed_muon(names=names,
-                                  params=params,
-                                  group=group,
-                                  lr=lr,
-                                  weight_decay=weight_decay,
-                                  qk_logits=qk_logits)
-            return
-        # For simplicity, we use distributed Muon for small parameters
-        # whose number of elements is below a threshold.
-        for n, p in zip(names, params):
-            if p is None or p.grad is None:
-                continue
-            if isinstance(p.data, DTensor):
-                if all(
-                        isinstance(placement, Replicate)
-                        for placement in p.placements):
-                    # Replicated on every mesh dim: handled like a plain tensor.
-                    param_tensors.append(p)
-                    name_tensors.append(n)
-                elif p.data.numel() <= self.small_param_numel_threshold:
-                    param_dtensors_small.append(p)
-                    name_dtensors_small.append(n)
-                else:
-                    param_dtensors.append(p)
-                    name_dtensors.append(n)
-            elif isinstance(p.data, torch.Tensor):
-                param_tensors.append(p)
-                name_tensors.append(n)
-            else:
-                raise TypeError(f"Unsupported parameter type: {type(p.data)}")
-        logger.debug(
-            f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors, "
-            f"{len(param_dtensors_small)} Small DTensors")
-        def group_dtensors(dtensors, names):
-            # To support different placements, we group parameters by placements
-            # and run parallel Muon on each group.
-            # Each value is a pair of lists: (names, params).
-            placement_to_params = defaultdict(lambda: ([], []))
-            assert len(dtensors) == len(names)
-            for p, n in zip(dtensors, names):
-                placement_to_params[tuple([p.placements,
-                                           p.device_mesh])][0].append(n)
-                placement_to_params[tuple([p.placements,
-                                           p.device_mesh])][1].append(p)
-            return placement_to_params
-        if len(param_dtensors_small) > 0:
-            if not dist.is_initialized():
-                raise RuntimeError(
-                    "Parallel Muon requires torch.distributed to be initialized."
-                )
-            self.distributed_muon(
-                params=param_dtensors_small,
-                names=name_dtensors_small,
-                group=group,
-                lr=lr,
-                weight_decay=weight_decay,
-                qk_logits=qk_logits,
-            )
-        if len(param_dtensors) > 0:
-            if not dist.is_initialized():
-                raise RuntimeError(
-                    "Parallel Muon requires torch.distributed to be initialized."
-                )
-            dtensor_group = group_dtensors(param_dtensors, name_dtensors)
-            # NOTE: the loop variables below rebind the outer ``names`` and
-            # ``params``; neither is read again after this loop.
-            for _, (names, params) in dtensor_group.items():
-                self.parallel(
-                    names,
-                    params,
-                    group,
-                    lr=lr,
-                    weight_decay=weight_decay,
-                    qk_logits=qk_logits,
-                )
-        if len(param_tensors) > 0:
-            self.base(
-                name_tensors,
-                param_tensors,
-                group,
-                lr=lr,
-                weight_decay=weight_decay,
-                qk_logits=qk_logits,
-            )
-    @torch.no_grad
-    def step(self, closure=None, qk_logits=None):
-        """Perform a single optimization step.
-        Args:
-            closure (Callable, optional): A closure that reevaluates the model
-                and returns the loss.
-            qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
-                to 1D tensors of shape (num_heads,), representing the maximum
-                QK logits across all tokens, computed as
-                (1 / sqrt(head_dim)) * (Q @ K^T).
-        """
-        loss = None
-        if closure is not None:
-            with torch.enable_grad():
-                loss = closure()
-        for group in self.param_groups:
-            if group["use_muon"]:
-                self._step_muon(group, qk_logits=qk_logits)
-            else:
-                step_adamw(self.state, group)
-        return loss

build/torch210-cxx11-cu128-x86_64-linux/newton_schulz.py DELETED Viewed

@@ -1,50 +0,0 @@
-import torch
-from .matmul_transpose_triton import matmul_transpose_assign
-COMM_DTYPE = torch.bfloat16
-DEFAULT_CHUNK_SIZE_RATIO = 4
-# This code is adapted, with modifications, from the following GitHub repository:
-# https://github.com/KellerJordan/Muon/blob/master/muon.py
-# Muon's Newton–Schulz iteration causes high variance in the singular values.
-# Idea: give each iteration its own three coefficients and optimize them via gradient descent.
-@torch.no_grad()
-# matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
-def _zeropower_via_newtonschulz5(G, steps):
-    """
-    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
-    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
-    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
-    zero even beyond the point where the iteration no longer converges all the way to one everywhere
-    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
-    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
-    performance at all relative to UV^T, where USV^T = G is the SVD.
-    """
-    assert len(G.shape) == 2
-    assert G.dtype == COMM_DTYPE
-    X = G  # no manual typecast
-    if G.size(0) > G.size(1):
-        X = X.T
-    # Ensure spectral norm is at most 1
-    X = X / (X.norm() + 1e-7)
-    buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
-    buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
-    # Perform the NS iterations
-    for a, b, c in [
-        (4.0848, -6.8946, 2.9270),
-        (3.9505, -6.3029, 2.6377),
-        (3.7418, -5.5913, 2.3037),
-        (2.8769, -3.1427, 1.2046),
-        (2.8366, -3.0525, 1.2012),
-    ]:
-        matmul_transpose_assign(X, buf1)
-        matmul_transpose_assign(buf1, buf2)
-        buf1.mul_(b).add_(buf2, alpha=c)
-        X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
-    if G.size(0) > G.size(1):
-        X = X.T
-    return X

build/torch210-cxx11-cu128-x86_64-linux/optimizer/__init__.py DELETED Viewed

@@ -1,26 +0,0 @@
-import ctypes
-import sys
-import importlib
-from pathlib import Path
-from types import ModuleType
-def _import_from_path(file_path: Path) -> ModuleType:
-    # We cannot use the module name as-is, after adding it to `sys.modules`,
-    # it would also be used for other imports. So, we make a module name that
-    # depends on the path for it to be unique using the hex-encoded hash of
-    # the path.
-    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
-    module_name = path_hash
-    spec = importlib.util.spec_from_file_location(module_name, file_path)
-    if spec is None:
-        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
-    module = importlib.util.module_from_spec(spec)
-    if module is None:
-        raise ImportError(f"Cannot load module {module_name} from spec")
-    sys.modules[module_name] = module
-    spec.loader.exec_module(module)  # type: ignore
-    return module
-globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch210-cxx11-cu128-x86_64-linux/pipeline.py DELETED Viewed

@@ -1,390 +0,0 @@
-import logging
-from typing import Generator
-import torch
-import torch.distributed as dist
-from torch.distributed.tensor import DTensor
-from torch.profiler import record_function
-from .core import _muon_state, adjust_lr_for_muon, update_p
-from .newton_schulz import COMM_DTYPE, _zeropower_via_newtonschulz5
-from .qk_clip import compute_scales
-logger = logging.getLogger(__name__)
-# ======================================================================
-# Stage helpers
-# ======================================================================
-def _launch_gather(
-    params: list[DTensor],
-    owned_params: list[DTensor],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-    num_ranks: int,
-    process_group: dist.ProcessGroup,
-) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
-    """Allocate gather buffers, build send/recv, and launch async all-to-all.
-    Every parameter's local gradient shard is sent to that parameter's worker
-    rank; this rank receives the shards of the parameters it owns.
-    Args:
-        params: All parameters of the chunk.
-        owned_params: The subset of ``params`` whose worker rank is ``rank``.
-        param_to_state: Maps ``id(p)`` to the parameter's ``_muon_state``.
-        rank: This process's rank within ``process_group``.
-        num_ranks: Number of ranks in ``process_group``.
-        process_group: Group on which the all-to-all is issued.
-    Returns:
-        work: Async operation handle.
-        recv_buf: Flat receive buffer (needed by ``_complete_gather``).
-        gathered_grads: ``{id(p): empty_tensor}`` for owned params,
-            ``None`` for non-owned.
-        recv_counts: Per-source-rank element counts.
-    """
-    # Allocate gathered-grad buffers
-    gathered_grads: dict[int, torch.Tensor | None] = {}
-    for p in params:
-        state = param_to_state[id(p)]
-        if rank == state.worker_rank:
-            # Full-shape buffer; filled later from the receive buffer.
-            gathered_grads[id(p)] = torch.empty(p.shape,
-                                                dtype=COMM_DTYPE,
-                                                device="cuda")
-        else:
-            gathered_grads[id(p)] = None
-    # Build send buffer
-    # One bucket of flattened shards per destination rank.
-    per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
-    send_counts = [0] * num_ranks
-    for p in params:
-        state = param_to_state[id(p)]
-        dst = state.worker_rank
-        assert dst < num_ranks
-        shard_elems = state.rank_numels[rank]
-        g = p.grad
-        g = g.to_local().to(COMM_DTYPE).contiguous()
-        # The local shard must match the precomputed element count.
-        assert g.numel() == shard_elems
-        per_dst[dst].append(g.view(-1))
-        send_counts[dst] += shard_elems
-    assert any(
-        len(v) > 0 for v in
-        per_dst), "At least one destination rank must receive a sharded tensor"
-    # Concatenate bucket by bucket so the layout follows destination rank order.
-    per_dst_flat = [t for dst in per_dst for t in dst]
-    send_buf = torch.cat(per_dst_flat, dim=0)
-    # Build recv buffer
-    # From each source rank we receive its shard of every parameter we own.
-    recv_counts = [0] * num_ranks
-    for src in range(num_ranks):
-        total = 0
-        for p in owned_params:
-            state = param_to_state[id(p)]
-            assert state.worker_rank == rank
-            total += state.rank_numels[src]
-        recv_counts[src] = total
-    recv_buf = torch.empty(sum(recv_counts), dtype=COMM_DTYPE, device="cuda")
-    # Launch async all-to-all
-    logger.debug(f"send_buf size: {send_buf.numel()}, "
-                 f"recv_buf size: {recv_buf.numel()}, "
-                 f"recv_counts: {recv_counts}, "
-                 f"send_counts: {send_counts}, "
-                 f"process_group: {str(process_group)}")
-    work = dist.all_to_all_single(
-        recv_buf,
-        send_buf,
-        output_split_sizes=recv_counts,
-        input_split_sizes=send_counts,
-        group=process_group,
-        async_op=True,
-    )
-    return work, recv_buf, gathered_grads, recv_counts
-def _complete_gather(
-    recv_buf: torch.Tensor,
-    recv_counts: list[int],
-    owned_params: list[DTensor],
-    gathered_grads: dict[int, torch.Tensor | None],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-) -> None:
-    """Reconstruct gathered grads from the recv buffer (in-place)."""
-    off = 0
-    for src in range(len(recv_counts)):
-        if recv_counts[src] == 0:
-            continue
-        block = recv_counts[src]
-        inner_off = 0
-        for p in owned_params:
-            state = param_to_state[id(p)]
-            assert state.worker_rank == rank
-            indices = state.rank_indices[src]
-            shard_view = gathered_grads[id(p)][indices]
-            n = shard_view.numel()
-            assert n > 0
-            sg = recv_buf.narrow(0, off + inner_off, n)
-            sg = sg.reshape(shard_view.shape)
-            gathered_grads[id(p)][indices] = sg
-            inner_off += n
-        assert inner_off == block
-        off += block
-def _compute_ns(
-    owned_params: list[DTensor],
-    gathered_grads: dict[int, torch.Tensor | None],
-    ns_steps: int,
-) -> dict[int, torch.Tensor | None]:
-    """Run Newton-Schulz orthogonalization on owned parameters.
-    Returns:
-        computed_us: ``{id(p): orthogonalized_update}`` for owned params.
-    """
-    computed_us: dict[int, torch.Tensor | None] = {}
-    for p in owned_params:
-        u = _zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
-        gathered_grads[id(p)] = None  # free gathered grad
-        computed_us[id(p)] = u
-    return computed_us
-def _launch_scatter(
-    params: list[DTensor],
-    owned_params: list[DTensor],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-    num_ranks: int,
-    process_group: dist.ProcessGroup,
-    computed_us: dict[int, torch.Tensor | None],
-) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor], list[int]]:
-    """Allocate scatter buffers, build send/recv, and launch async all-to-all.
-    Each owner rank slices the full updates it computed into per-rank shards
-    and sends them out; every rank receives its shard of every parameter.
-    Args:
-        params: All parameters of the chunk.
-        owned_params: The subset of ``params`` whose worker rank is ``rank``.
-        param_to_state: Maps ``id(p)`` to the parameter's ``_muon_state``.
-        rank: This process's rank within ``process_group``.
-        num_ranks: Number of ranks in ``process_group``.
-        process_group: Group on which the all-to-all is issued.
-        computed_us: ``{id(p): update}`` for the owned parameters.
-    Returns:
-        work: Async operation handle.
-        recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
-        scattered_us: ``{id(p): empty_local_tensor}`` for all params.
-        recv_counts: Per-source-rank element counts.
-    """
-    # Allocate scattered-u buffers
-    scattered_us: dict[int, torch.Tensor] = {}
-    for p in params:
-        scattered_us[id(p)] = torch.empty_like(p.to_local(), dtype=COMM_DTYPE)
-    # Build send buffer (from computed_us on owner ranks)
-    per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
-    send_counts = [0] * num_ranks
-    if owned_params:
-        for p in owned_params:
-            state = param_to_state[id(p)]
-            assert computed_us[id(p)] is not None
-            u_full = computed_us[id(p)].to(COMM_DTYPE).contiguous()
-            total_sent = 0
-            for dst_rank in range(num_ranks):
-                # Cut out the part of the full update that dst_rank holds.
-                indices = state.rank_indices[dst_rank]
-                su = u_full[indices].flatten()
-                n = su.numel()
-                assert n > 0
-                per_dst[dst_rank].append(su)
-                send_counts[dst_rank] += n
-                total_sent += n
-            # The per-rank pieces must add up to the whole update.
-            assert total_sent == u_full.numel()
-    lengths = [len(v) for v in per_dst]
-    if all(l > 0 for l in lengths):
-        assert all(
-            l == lengths[0] for l in lengths
-        ), "All destination ranks must have the same number of sharded tensor"
-        per_dst_flat = [t for dst in per_dst for t in dst]
-        send_buf = torch.cat(per_dst_flat, dim=0)
-    else:
-        # Nothing to send (e.g. this rank owns no parameter in the chunk).
-        send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
-    # Build recv buffer
-    # From each source rank we receive our shard of every parameter it owns.
-    recv_counts = [0] * num_ranks
-    for src in range(num_ranks):
-        total = 0
-        for p in params:
-            state = param_to_state[id(p)]
-            if state.worker_rank != src:
-                continue
-            total += state.rank_numels[rank]
-        recv_counts[src] = total
-    recv_total = sum(recv_counts)
-    assert recv_total > 0
-    recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
-    # Launch async all-to-all
-    work = dist.all_to_all_single(
-        recv_buf,
-        send_buf,
-        output_split_sizes=recv_counts,
-        input_split_sizes=send_counts,
-        group=process_group,
-        async_op=True,
-    )
-    return work, recv_buf, scattered_us, recv_counts
-def _complete_scatter(
-    recv_buf: torch.Tensor,
-    recv_counts: list[int],
-    params: list[DTensor],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-    scattered_us: dict[int, torch.Tensor],
-) -> None:
-    """Copy recv buffer into scattered_us (in-place)."""
-    off = 0
-    for src in range(len(recv_counts)):
-        block = recv_counts[src]
-        if block == 0:
-            continue
-        inner_off = 0
-        for p in params:
-            state = param_to_state[id(p)]
-            if state.worker_rank != src:
-                continue
-            n = state.rank_numels[rank]
-            assert n > 0
-            flat_local = recv_buf.narrow(0, off + inner_off,
-                                         n).view_as(p.to_local())
-            scattered_us[id(p)].copy_(flat_local)
-            inner_off += n
-        assert inner_off == block
-        off += block
-def _update_params(
-    params: list[DTensor],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-    scattered_us: dict[int, torch.Tensor],
-    lr: float,
-    weight_decay: float,
-) -> None:
-    """Apply weight decay, Muon update, and optional QK clipping."""
-    for p in params:
-        state = param_to_state[id(p)]
-        u_dtensor = DTensor.from_local(
-            scattered_us[id(p)],
-            placements=p.placements,
-            device_mesh=p.device_mesh,
-        )
-        adjusted_lr = adjust_lr_for_muon(lr, p.shape)
-        update_p(p, u_dtensor, lr, adjusted_lr, weight_decay)
-        # QK clipping – applied directly on the local tensor to
-        # avoid DTensor sharding-propagation issues with _StridedShard.
-        scales_full = compute_scales(
-            p,
-            state.qk_clip_state) if state.qk_clip_state is not None else None
-        if scales_full is not None:
-            ratio = p.shape[0] // scales_full.shape[0]
-            idx0 = state.rank_indices[rank][0]
-            if isinstance(idx0, slice):
-                start = idx0.start or 0
-                idx0 = torch.arange(start,
-                                    idx0.stop,
-                                    device=scales_full.device)
-            row_scales = scales_full[idx0 // ratio]
-            p._local_tensor.mul_(row_scales.view(-1, 1))
-# ======================================================================
-# Main generator – thin orchestrator that wires stages together.
-# ======================================================================
-@torch.no_grad()
-def muon_chunk_pipeline(
-    params: list[DTensor],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-    ns_steps: int,
-    lr: float,
-    weight_decay: float,
-    none_grad: bool,
-) -> Generator[None, None, None]:
-    """Process one chunk of parameters through the full Muon pipeline.
-    Stages: gather -> compute (Newton-Schulz) -> scatter -> update.
-    Each ``yield`` lets :func:`run_pipeline` interleave other chunks so
-    that communication and computation overlap across chunks.  Async
-    communication is launched via ``async_op=True`` and completed after
-    the yield with ``work.wait()``.
-    Overlap happens because :func:`run_pipeline` admits one new chunk
-    per iteration (staggered admission).  While chunk *N* does NS
-    compute on the default CUDA stream, chunk *N+1*'s async all-to-all
-    runs concurrently on the NCCL stream — no separate ``comm_stream``
-    is required.
-    Yields exactly **2** times:
-    1. After launching async all-to-all gather.
-    2. After launching async all-to-all scatter.
-    Args:
-        params: Parameters of this chunk.
-        param_to_state: Maps ``id(p)`` to the parameter's ``_muon_state``.
-        rank: This process's rank within the chunk's process group.
-        ns_steps: Step count forwarded to the Newton-Schulz stage.
-        lr: Learning rate forwarded to the update stage.
-        weight_decay: Weight decay forwarded to the update stage.
-        none_grad: If true, ``p.grad`` is set to ``None`` for every parameter
-            of the chunk once the gather has been launched.
-    """
-    # The process group is taken from the first parameter's state.
-    process_group = param_to_state[id(params[0])].process_group
-    num_ranks = dist.get_world_size(group=process_group)
-    owned_params = [
-        p for p in params if param_to_state[id(p)].worker_rank == rank
-    ]
-    # Stages 1-2: launch async gather.
-    with record_function("muon::launch_gather"):
-        work, recv_buf, gathered_grads, recv_counts = _launch_gather(
-            params, owned_params, param_to_state, rank, num_ranks,
-            process_group)
-        if none_grad:
-            # The gradients were already copied into the send buffer.
-            for p in params:
-                p.grad = None
-    yield  # --- YIELD 1: other chunks can launch their gather ---
-    with record_function("muon::wait_gather"):
-        work.wait()
-        _complete_gather(recv_buf, recv_counts, owned_params, gathered_grads,
-                         param_to_state, rank)
-        del recv_buf
-    # Stage 3: Newton-Schulz orthogonalization.
-    with record_function("muon::newton_schulz"):
-        computed_us = _compute_ns(owned_params, gathered_grads, ns_steps)
-        gathered_grads.clear()
-    # Stages 4-5: launch async scatter.
-    with record_function("muon::launch_scatter"):
-        work, recv_buf, scattered_us, recv_counts = _launch_scatter(
-            params, owned_params, param_to_state, rank, num_ranks,
-            process_group, computed_us)
-        # Drop this dict's references to the full updates.
-        computed_us.clear()
-    yield  # --- YIELD 2: other chunks can launch their scatter ---
-    with record_function("muon::wait_scatter"):
-        work.wait()
-        _complete_scatter(recv_buf, recv_counts, params, param_to_state, rank,
-                          scattered_us)
-        del recv_buf
-    # Stage 6: apply parameter updates.
-    with record_function("muon::update_params"):
-        _update_params(params, param_to_state, rank, scattered_us, lr,
-                       weight_decay)
-        scattered_us.clear()

build/torch210-cxx11-cu128-x86_64-linux/qk_clip.py DELETED Viewed

@@ -1,129 +0,0 @@
-import logging
-import math
-from dataclasses import dataclass
-import torch
-from torch.distributed.tensor import DTensor
-logger = logging.getLogger(__name__)
-def parse_qk_layer(name: str) -> tuple[str | None, int]:
-    """
-    Parse a parameter name to check if it is a query/key projection layer
-    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
-    Returns:
-        (kind, layer_idx) or (None, -1) if not matched.
-    Example:
-        'model.3.attn.wq.weight'      -> ('wq', 3)
-        'model.5.attn.wk.weight'      -> ('wk', 5)
-        'model.2.attn.q_proj.weight'  -> ('q_proj', 2)
-        'model.7.attn.k_proj.weight'  -> ('k_proj', 7)
-        'model.4.attn.v_proj.weight'  -> (None, -1)
-    """
-    parts = name.split('.')
-    if len(parts) < 3:
-        return None, -1
-    kind = parts[-2]
-    layer_idx = -1
-    for part in reversed(parts):
-        if part.isdigit():
-            layer_idx = int(part)
-            break
-    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
-        return kind, layer_idx
-    return None, -1
-@dataclass
-class QKClipInfo:
-    """Per-parameter dynamic info computed from config + runtime logits."""
-    kind: str | None  # 'wq'/'q_proj' or 'wk'/'k_proj' or None
-    indices: list[int]  # which heads to consider for clipping
-    head_dim: int  # from config
-    threshold: float  # from config
-    logit: torch.Tensor | None
-def get_qk_clip_info(clip_config, n, qk_logits):
-    """Extract QK clipping info for a named parameter.
-    Args:
-        clip_config: QK clipping configuration dict (or None).
-        n: Parameter name string.
-        qk_logits: Dict mapping layer indices to logit tensors (or None).
-    Returns:
-        QKClipInfo instance with clipping configuration for this parameter.
-    """
-    if clip_config is None:
-        return None
-    head_dim = clip_config.get('head_dim')
-    threshold = clip_config.get('threshold')
-    kind, layer_idx = parse_qk_layer(n)
-    logit, indices = None, []
-    if qk_logits is not None and kind is not None:
-        logit = qk_logits[layer_idx]
-        indices_key = 'q_indices' if 'q' in kind else 'k_indices'
-        indices = clip_config.get(indices_key, []) or []
-        if isinstance(logit, DTensor):
-            # In TP settings, qk_logits may be DTensor
-            # We convert it to full tensor here for simplicity
-            logit = logit.full_tensor()
-    return QKClipInfo(
-        kind=kind,
-        indices=indices,
-        head_dim=head_dim,
-        threshold=threshold,
-        logit=logit,
-    )
-def compute_scales(p, qk_clip_state):
-    """Compute per-head scaling factors for QK clipping.
-    Returns scales tensor if any head exceeds threshold, else None.
-    """
-    kind = qk_clip_state.kind
-    indices = qk_clip_state.indices
-    head_dim = qk_clip_state.head_dim
-    threshold = qk_clip_state.threshold
-    logit = qk_clip_state.logit
-    H_global = p.shape[0] // head_dim
-    scales_full = torch.ones(H_global, device=p.data.device)
-    scaling = 0
-    for logit_idx, head_idx in enumerate(indices):
-        v_ele = float(logit[logit_idx])
-        if v_ele > threshold:
-            new_scale = math.sqrt(threshold / v_ele)
-            if new_scale < scales_full[head_idx]:
-                scales_full[head_idx] = new_scale
-                logger.info(
-                    f"[{kind}] Head {head_idx} exceeded threshold "
-                    f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
-                )
-                scaling += 1
-    return scales_full if scaling > 0 else None
-def qk_clip(p, scales, head_dim):
-    """Apply per-head scaling to a Q/K projection weight matrix."""
-    if isinstance(p, torch.nn.Parameter):
-        W = p.data.view(-1, head_dim, p.data.shape[1])
-        W.mul_(scales.view(-1, 1, 1))
-    else:
-        W = p.view(-1, head_dim, p.shape[1])
-        W.mul_(scales.view(-1, 1, 1))

build/torch210-cxx11-cu130-x86_64-linux/adamw.py DELETED Viewed

@@ -1,154 +0,0 @@
-from collections import defaultdict
-from typing import cast
-import torch
-from torch.distributed.tensor import DTensor
-def fused_adamw(
-    params: list[torch.Tensor],
-    grads: list[torch.Tensor],
-    exp_avgs: list[torch.Tensor],
-    exp_avg_sqs: list[torch.Tensor],
-    max_exp_avg_sqs: list[torch.Tensor],
-    state_steps: list[torch.Tensor],
-    amsgrad: bool,
-    beta1: float,
-    beta2: float,
-    lr: float | torch.Tensor,
-    weight_decay: float,
-    eps: float,
-    maximize: bool,
-) -> None:
-    if not params:
-        return
-    # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
-    # treating it as a scalar.
-    lr_dict: dict | None = ({
-        lr.device: lr
-    } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else None)
-    grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
-        [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
-         state_steps]  # type: ignore[list-item]
-    )
-    for (device, _), (
-        (
-            device_params_,
-            device_grads_,
-            device_exp_avgs_,
-            device_exp_avg_sqs_,
-            device_max_exp_avg_sqs,
-            device_state_steps_,
-        ),
-            _,
-    ) in grouped_tensors.items():
-        device_params = cast(list[torch.Tensor], device_params_)
-        device_grads = cast(list[torch.Tensor], device_grads_)
-        device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
-        device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
-        device_state_steps = cast(list[torch.Tensor], device_state_steps_)
-        if lr_dict is not None and device not in lr_dict:
-            lr_dict[device] = lr.to(
-                device=device, non_blocking=True)  # type: ignore[union-attr]
-            lr = lr_dict[device]
-        torch._foreach_add_(device_state_steps, 1)
-        func = torch._fused_adamw_
-        func(
-            device_params,
-            device_grads,
-            device_exp_avgs,
-            device_exp_avg_sqs,
-            device_max_exp_avg_sqs,  # type: ignore[arg-type]
-            device_state_steps,
-            amsgrad=amsgrad,
-            lr=lr,  # type: ignore[arg-type]
-            beta1=beta1,
-            beta2=beta2,
-            weight_decay=weight_decay,
-            eps=eps,
-            maximize=maximize,
-        )
-def step_adamw_params(optimizer_state, params, group):
-    """Run fused AdamW on a list of parameters sharing the same placement.
-    Args:
-        optimizer_state: The optimizer's state dict (self.state in Muon).
-        params: List of parameters to update.
-        group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
-    """
-    params_with_grads = []
-    grads = []
-    moment1 = []
-    moment2 = []
-    max_exp_avg_sqs = []
-    state_steps = []
-    lr = group["lr"]
-    beta1, beta2 = group["adamw_betas"]
-    eps = group["adamw_eps"]
-    weight_decay = group["weight_decay"]
-    for p in params:
-        g = p.grad
-        if g is None:
-            continue
-        state = optimizer_state[p]
-        params_with_grads.append(p)
-        grads.append(g)
-        if "step" not in state:
-            state["step"] = (torch.zeros((),
-                                         dtype=torch.float32,
-                                         device=p.device))
-            state["moment1"] = torch.zeros_like(g)
-            state["moment2"] = torch.zeros_like(g)
-        moment1.append(state["moment1"])
-        moment2.append(state["moment2"])
-        if not isinstance(state["step"], torch.Tensor):
-            step_tensor = torch.tensor(state["step"],
-                                       dtype=torch.float32,
-                                       device=p.device)
-        else:
-            step_tensor = state["step"]
-        state_steps.append(step_tensor)
-    fused_adamw(
-        params_with_grads,
-        grads,
-        moment1,
-        moment2,
-        max_exp_avg_sqs,
-        state_steps,
-        amsgrad=False,
-        beta1=beta1,
-        beta2=beta2,
-        lr=lr,
-        weight_decay=weight_decay,
-        eps=eps,
-        maximize=False,
-    )
-def step_adamw(optimizer_state, group):
-    """Dispatch AdamW step, grouping parameters by type and placement.
-    Args:
-        optimizer_state: The optimizer's state dict (self.state in Muon).
-        group: Parameter group dict.
-    """
-    params = group["params"]
-    # group params with its type and placement
-    placement_to_params: dict[tuple, list[torch.Tensor]] = defaultdict(list)
-    for p in params:
-        match p:
-            case DTensor():
-                placement_to_params[tuple([p.placements,
-                                           p.device_mesh])].append(p)
-            case torch.Tensor():
-                placement_to_params[tuple([torch.Tensor, None])].append(p)
-    for group_params in placement_to_params.values():
-        step_adamw_params(optimizer_state, group_params, group)

build/torch210-cxx11-cu130-x86_64-linux/async_utils.py DELETED Viewed

@@ -1,77 +0,0 @@
-import logging
-from typing import Generator
-logger = logging.getLogger(__name__)
-class _Task:
-    """Internal: wraps a generator, advances one yield at a time."""
-    def __init__(self, generator: Generator[None, None, None], index: int):
-        self._generator = generator
-        self._index = index
-        self._steps_completed = 0
-        self.step()  # run to first yield
-    def step(self) -> bool:
-        try:
-            next(self._generator)
-            self._steps_completed += 1
-            logger.debug("pipeline[%d] completed stage %d", self._index,
-                         self._steps_completed)
-            return True
-        except StopIteration:
-            logger.debug("pipeline[%d] finished after %d stages", self._index,
-                         self._steps_completed)
-            return False
-    def close(self):
-        self._generator.close()
-def run_pipeline(
-    pipelines: Generator[Generator[None, None, None], None, None],
-    max_concurrent: int,
-) -> None:
-    """Run generator-based pipelines with bounded concurrency.
-    Each pipeline is a generator that yields at stage boundaries.
-    The runtime interleaves pipelines so communication and computation
-    overlap across chunks.
-    """
-    if max_concurrent <= 0:
-        raise ValueError(f"max_concurrent must be > 0, got {max_concurrent}")
-    have_new = True
-    task_index = 0
-    previous_tasks: list[_Task] = []
-    try:
-        while have_new or previous_tasks:
-            running_tasks: list[_Task] = []
-            # Admit one new pipeline per iteration (staggered admission).
-            # Admitting one at a time ensures that while chunk N does NS
-            # compute on the default stream, chunk N+1's NCCL all-to-all
-            # runs concurrently on the NCCL stream — creating real
-            # communication/computation overlap on the GPU.
-            if have_new and len(previous_tasks) < max_concurrent:
-                try:
-                    gen = next(pipelines)
-                    task = _Task(gen, task_index)
-                    task_index += 1
-                    running_tasks.append(task)
-                except StopIteration:
-                    have_new = False
-            # Advance every previously-yielded task by one step.
-            for task in previous_tasks:
-                if task.step():
-                    running_tasks.append(task)
-            previous_tasks = running_tasks
-    except BaseException:
-        # Clean up all in-flight generators to release GPU resources.
-        for task in previous_tasks:
-            task.close()
-        raise

build/torch210-cxx11-cu130-x86_64-linux/core.py DELETED Viewed

@@ -1,116 +0,0 @@
-import math
-from dataclasses import dataclass
-import torch
-import torch.distributed as dist
-from torch.distributed import ProcessGroup
-from torch.distributed.tensor import DTensor
-@dataclass
-class _muon_state:
-    worker_rank: int
-    process_group: ProcessGroup
-    rank_indices: dict[int, tuple]  # local_rank -> per-dim indices
-    rank_numels: dict[int, int]  # local_rank -> numel
-    name: str
-    qk_clip_state: torch.Tensor | None = None
-def update_g(optimizer_state, p, g, group, momentum):
-    """Apply momentum update to gradient.
-    Args:
-        optimizer_state: The optimizer's state dict (self.state in Muon).
-        p: Parameter tensor.
-        g: Gradient tensor.
-        group: Parameter group dict.
-        momentum: Momentum coefficient.
-    Returns:
-        Momentum-updated gradient tensor.
-    """
-    state = optimizer_state[p]
-    buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
-    torch.add(g, buf, alpha=momentum, out=buf)
-    if group["nesterov"]:
-        g.add_(buf, alpha=momentum)
-        return g
-    return buf
-def update_p(p, u, lr, adjusted_lr, weight_decay):
-    """Apply weight decay and orthogonalized update to parameter.
-    Args:
-        p: Parameter (torch.nn.Parameter or DTensor).
-        u: Orthogonalized update tensor.
-        lr: Base learning rate.
-        adjusted_lr: Size-adjusted learning rate.
-        weight_decay: Weight decay coefficient.
-    """
-    if isinstance(p, torch.nn.Parameter):
-        # apply weight decay
-        p.data.mul_(1 - lr * weight_decay)
-        # apply update
-        p.data.add_(u, alpha=-adjusted_lr)
-    else:
-        p.mul_(1 - lr * weight_decay)
-        p.add_(u, alpha=-adjusted_lr)
-def adjust_lr_for_muon(lr, param_shape):
-    """Scale learning rate based on parameter matrix dimensions.
-    Args:
-        lr: Base learning rate.
-        param_shape: Shape of the parameter tensor.
-    Returns:
-        Adjusted learning rate.
-    """
-    A, B = param_shape[:2]
-    # We adjust the learning rate and weight decay based on the size of the parameter matrix
-    # as described in the paper
-    adjusted_ratio = 0.2 * math.sqrt(max(A, B))
-    adjusted_lr = lr * adjusted_ratio
-    return adjusted_lr
-def default_is_muon(name, x, expert_keys=None):
-    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
-    if any(key in name for key in skip_keys):
-        return False
-    effective_ndim = x.ndim
-    if expert_keys and any(key in name for key in expert_keys):
-        effective_ndim -= 1
-    return effective_ndim >= 2
-def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
-    if is_muon_func is None:
-        is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
-    muon_params, muon_names = [], []
-    non_muon_params = []
-    for n, p in model.named_parameters():
-        if not p.requires_grad:
-            continue
-        if is_muon_func(n, p):
-            muon_params.append(p)
-            muon_names.append(n)
-        else:
-            non_muon_params.append(p)
-    return [
-        {
-            "params": muon_params,
-            "names": muon_names,
-            "use_muon": True,
-        },
-        {
-            "params": non_muon_params,
-            "use_muon": False,
-        },
-    ]

build/torch210-cxx11-cu130-x86_64-linux/distributed/utils.py DELETED Viewed

@@ -1,234 +0,0 @@
-import torch
-import torch.distributed as dist
-from torch.distributed import ProcessGroup
-from torch.distributed.device_mesh import DeviceMesh
-from torch.distributed.tensor import DTensor
-from torch.distributed.tensor.placement_types import (Placement, Shard,
-                                                      _StridedShard)
-def _is_shard(placement: Placement) -> bool:
-    """Check if a placement is a shard type (Shard or _StridedShard).
-    In PyTorch 2.10+, _StridedShard no longer inherits from Shard, so
-    ``placement.is_shard()`` returns False for _StridedShard.  This helper
-    handles both old and new hierarchies.
-    """
-    return isinstance(placement, (Shard, _StridedShard))
-def get_slices_of_dtensor(
-    target: DTensor | torch.Tensor,
-    local_rank: int,
-    shard_mesh: DeviceMesh,
-    shard_placements: tuple[Placement],
-) -> tuple[slice | torch.Tensor, ...]:
-    """
-    Get per-dimension indices for a given rank's shard of the target tensor.
-    Uses ``Shard.local_shard_size_and_offset`` and
-    ``_StridedShard.local_shard_size_and_offset`` for correct handling of
-    both contiguous and strided (non-contiguous) sharding.
-    Args:
-        target (DTensor | torch.Tensor): The target tensor (for its shape).
-        local_rank (int): The local rank within the shard group.
-        shard_mesh (DeviceMesh): The shard mesh (only shard dimensions).
-        shard_placements (tuple[Placement]): The shard placements.
-    Returns:
-        A tuple of indices (one per tensor dim).  Each element is either:
-        - A ``slice`` (for contiguous or unsharded dims)
-        - A 1-D ``torch.LongTensor`` of indices (for strided sharding)
-    """
-    # find the global rank of the local rank in the shard mesh
-    rank = sorted(shard_mesh.mesh.flatten().tolist())[local_rank]
-    rank_coords = (shard_mesh.mesh == rank).nonzero()
-    assert len(rank_coords) == 1
-    rank_coords = tuple(rank_coords[0].tolist())
-    assert len(rank_coords) == len(shard_placements)
-    # Track per-shard-dim indices.
-    # None means "not yet sharded on this dim".
-    dim_indices: dict[int, torch.Tensor] = {}
-    # Caution: Assuming replicate-to-shard of the shard mesh goes with
-    # left-to-right sharding. This is ensured by the sorting logic of
-    # construct_shard_mesh function.
-    for mesh_dim_idx, (rank_coord, placement) in enumerate(
-            zip(rank_coords, shard_placements)):
-        assert _is_shard(placement)
-        num_chunks = shard_mesh.mesh.shape[mesh_dim_idx]
-        shard_dim = placement.dim
-        # Current effective size on this dim (may already be sub-sharded)
-        if shard_dim in dim_indices:
-            curr_size = len(dim_indices[shard_dim])
-        else:
-            curr_size = target.size()[shard_dim]
-        if curr_size % num_chunks != 0:
-            raise NotImplementedError(
-                f"Dimension size {curr_size} is not divisible "
-                f"by number of ranks {num_chunks} for shard "
-                f"placement on dim {shard_dim}. (shape: {target.shape})")
-        # Compute indices for this level of sharding
-        if isinstance(placement, _StridedShard):
-            _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
-                placement,
-                curr_size,
-                num_chunks,
-                rank_coord,
-                return_first_offset=False)
-            new_indices = torch.tensor(offsets, dtype=torch.long)
-        else:
-            shard_size, offset = Shard.local_shard_size_and_offset(
-                curr_size, num_chunks, rank_coord)
-            new_indices = torch.arange(offset,
-                                       offset + shard_size,
-                                       dtype=torch.long)
-        # Compose with previous indices on this dim
-        if shard_dim in dim_indices:
-            dim_indices[shard_dim] = dim_indices[shard_dim][new_indices]
-        else:
-            dim_indices[shard_dim] = new_indices
-    # Build result tuple
-    result: list[slice | torch.Tensor] = []
-    for d in range(len(target.size())):
-        if d not in dim_indices:
-            result.append(slice(None))
-        else:
-            indices = dim_indices[d]
-            # Convert contiguous indices to slice for efficiency
-            if len(indices) > 0:
-                start = indices[0].item()
-                expected = torch.arange(start,
-                                        start + len(indices),
-                                        dtype=torch.long)
-                if torch.equal(indices, expected):
-                    result.append(slice(start, start + len(indices)))
-                else:
-                    result.append(indices)
-            else:
-                result.append(slice(0, 0))
-    return tuple(result)
-_ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
-                                                  ProcessGroup]] = dict()
-def construct_shard_mesh(
-    placements: tuple[Placement],
-    mesh: DeviceMesh,
-) -> tuple[DeviceMesh, ProcessGroup, tuple[Placement, ...]]:
-    """Construct shard sub-mesh and ProcessGroup for all-to-all communication.
-    Given a DTensor's placements and device mesh, extracts the "shard group"
-    — the set of ranks that together hold all shards of the same replica —
-    and creates a ProcessGroup for all-to-all among them.
-    Steps:
-        1. Sort placements: Replicate first, then Shard by (dim, granularity).
-        2. Permute the mesh tensor to match the sorted order.
-        3. Collapse Replicate dims → list of shard sub-meshes (one per replica).
-        4. Create/retrieve a cached ProcessGroup for the current rank's sub-mesh.
-    Example — 8 GPUs, mesh shape (2, 2, 2),
-              placements ``[Shard(0), Replicate, _StridedShard(0)]``::
-        Step 1 — Sort: [Replicate, _StridedShard(0), Shard(0)]
-                 Permutation: [1, 2, 0]
-        Step 2 — Permute mesh dims by [1, 2, 0]:
-                 Original:                Permuted:
-                 [[[0,1],[2,3]],          [[[0,2],[1,3]],
-                  [[4,5],[6,7]]]           [[4,6],[5,7]]]
-        Step 3 — Unbind replicate dim (dim 0), giving 2 shard sub-meshes:
-                 sub-mesh 0 = [[0,2],[1,3]]  (replica group 0)
-                 sub-mesh 1 = [[4,6],[5,7]]  (replica group 1)
-                 shard_placements = (_StridedShard(0), Shard(0))
-        Step 4 — Rank 0 → ProcessGroup([0,1,4,5])
-                 Rank 2 → ProcessGroup([2,3,6,7])
-    Returns:
-        ``(shard_mesh, process_group, shard_placements)``
-    """
-    my_rank = dist.get_rank()
-    assert mesh.mesh.device.type == 'cpu'
-    # -- Fast path: 1D all-shard mesh → reuse existing PG. ----------------
-    # This avoids a non-collective dist.new_group() call, which would
-    # deadlock when only a subset of ranks call this function (e.g. expert
-    # DTensors on a TP submesh where ranks 0-3 and 4-7 call separately).
-    if mesh.ndim == 1 and len(placements) == 1 and _is_shard(placements[0]):
-        key = (*mesh.mesh.shape, *mesh.mesh.flatten().tolist())
-        if key not in _ranks_to_dist_cache:
-            _ranks_to_dist_cache[key] = (mesh, mesh.get_group())
-        return (*_ranks_to_dist_cache[key], tuple(placements))
-    mesh_tensor = mesh.mesh.clone()
-    # -- Step 1: Sort placements (Replicate first, then Shard by dim). ------
-    # _StridedShard comes BEFORE regular Shard on the same dim so that
-    # get_slices_of_dtensor applies the outer sharding first, matching
-    # DTensor's left-to-right (outer-to-inner) composition order.
-    def _sort_key(item):
-        index, placement = item
-        assert not placement.is_partial(), "Partial placement not supported"
-        if placement.is_replicate():
-            return (-1, 0, index)
-        assert _is_shard(placement), f"Unsupported: {type(placement)}"
-        split = (-1 / placement.split_factor if isinstance(
-            placement, _StridedShard) else 0)
-        return (placement.dim, split, index)
-    indexed = sorted(enumerate(placements), key=_sort_key)
-    perm, sorted_placements = zip(*indexed)
-    # -- Step 2: Permute mesh to match sorted placement order. --------------
-    sorted_mesh = mesh_tensor.permute(perm)
-    # -- Step 3: Collapse replicate dims → list of shard sub-meshes. --------
-    # E.g. mesh (2, 3, 4, 4) with [R, R, S(0), S(1)] → 6 sub-meshes of (4, 4)
-    num_rep = sum(1 for p in sorted_placements if p.is_replicate())
-    if num_rep > 0:
-        if num_rep > 1:
-            sorted_mesh = sorted_mesh.flatten(0, num_rep - 1)
-        shard_meshes = list(torch.unbind(sorted_mesh, dim=0))
-    else:
-        shard_meshes = [sorted_mesh]
-    shard_placements = sorted_placements[num_rep:]
-    assert len(shard_placements) == len(set(shard_placements))
-    # -- Step 4: Create/retrieve ProcessGroup for current rank's sub-mesh. --
-    # All ranks must call dist.new_group in the same order, even though each
-    # rank only joins one group.
-    def _cache_key(t: torch.Tensor) -> tuple:
-        return (*t.shape, *t.flatten().tolist())
-    my_key = None
-    for sm in shard_meshes:
-        key = _cache_key(sm)
-        if (my_rank == sm).any().item():
-            assert my_key is None, "Rank appears in multiple shard groups"
-            my_key = key
-        if key not in _ranks_to_dist_cache:
-            pg = dist.new_group(sm.flatten().tolist())
-            _ranks_to_dist_cache[key] = (
-                DeviceMesh(device_type="cuda", mesh=sm),
-                pg,
-            )
-    return (*_ranks_to_dist_cache[my_key], shard_placements)

build/torch210-cxx11-cu130-x86_64-linux/matmul_transpose_triton.py DELETED Viewed

@@ -1,121 +0,0 @@
-# MIT License
-#
-# Copyright (c) 2025 Tianyang Lin
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-import torch
-import triton
-import triton.language as tl
-def get_autotune_config():
-    return [
-        triton.Config(
-            {
-                'BLOCK_SIZE_M': blk_m,
-                'BLOCK_SIZE_K': blk_k,
-                'GROUP_SIZE_M': grp_sz
-            },
-            num_stages=n_stages,
-            num_warps=n_warps) for blk_m in [32, 64, 128]
-        for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
-        for n_warps in [4, 8]
-    ]
-@triton.autotune(
-    configs=get_autotune_config(),
-    key=['M', 'K'],
-)
-@triton.jit
-def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
-               BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-               GROUP_SIZE_M: tl.constexpr):
-    """
-    Core kernel jit function of matmul_transpose that computes y = x @ x.T
-    The code is a simple adaptation from the triton `matmul` tutorial:
-    https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
-    """
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    if pid_m > pid_n:
-        return
-    offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
-    offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    # we use a & b ptrs to denote different rows of x.
-    a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
-    b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        a = tl.load(a_ptrs,
-                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
-                    other=0.0)
-        b = tl.load(b_ptrs,
-                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
-                    other=0.0)
-        accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
-        a_ptrs += BLOCK_SIZE_K * stride_xk
-        b_ptrs += BLOCK_SIZE_K * stride_xk
-    # use dtype.element_ty to accommodate different input datatypes as in cpp templates
-    # https://github.com/triton-lang/triton/issues/2252
-    c = accumulator.to(x.dtype.element_ty)
-    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
-    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
-    tl.store(c_ptrs, c, mask=c_mask)
-    # transpose and copy
-    if pid_m < pid_n:
-        ct_ptrs = y + stride_ym * offs_cn[:,
-                                          None] + stride_yn * offs_cm[None, :]
-        ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
-        tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
-def matmul_transpose_assign(d_in, d_out):
-    assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
-    assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
-    assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
-    assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
-    assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
-    assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
-    assert d_in.size(0) == d_out.size(0) == d_out.size(0), \
-            "First dimension of `d_in` must match first and second dimension of `d_out`"
-    d_in = d_in.contiguous()
-    M, K = d_in.shape
-    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
-        M, META['BLOCK_SIZE_M']), )
-    with torch.cuda.device(d_in.device.index):
-        mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
-                         d_out.stride(0), d_out.stride(1))

build/torch210-cxx11-cu130-x86_64-linux/metadata.json DELETED Viewed

@@ -1,3 +0,0 @@
-{
-  "python-depends": []
-}

build/torch210-cxx11-cu130-x86_64-linux/muon.py DELETED Viewed

@@ -1,594 +0,0 @@
-import logging
-import types
-from collections import defaultdict
-from typing import Any
-import torch
-import torch.distributed as dist
-from torch.distributed.tensor import DTensor, Replicate, Shard
-from torch.profiler import record_function
-from .adamw import step_adamw
-from .async_utils import run_pipeline
-from .core import (_muon_state, adjust_lr_for_muon,
-                   get_default_muon_param_groups, update_g, update_p)
-from .distributed.utils import (_is_shard, construct_shard_mesh,
-                                get_slices_of_dtensor)
-from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
-                            _zeropower_via_newtonschulz5)
-from .pipeline import muon_chunk_pipeline
-from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
-logger = logging.getLogger(__name__)
-def _expand_expert_params(names, params, expert_keys):
-    """Replace every expert-parallel param by one 2D param per local expert.
-    A param is an expert param when any entry of ``expert_keys`` is a
-    substring of its name. Its leading dimension is the expert dimension, so
-    an ``(E, out, in)`` tensor yields ``E`` ``nn.Parameter`` objects that view
-    the original storage (in-place updates therefore reach the original).
-    Other params are passed through untouched, but must have ``ndim <= 2``;
-    otherwise an ``AssertionError`` asks for the key to be added to
-    ``expert_keys``.
-    Every expert param must already carry its grad (e.g. after momentum).
-    For DTensor expert params the split works on the local shard, which
-    absorbs placements sharded on dim 0. Shard placements on other dims
-    (e.g. TP) are kept: each 2D slice is re-wrapped as a DTensor on the
-    matching submesh, with ``Shard(k)`` shifted to ``Shard(k - 1)``.
-    Returns:
-        ``(expanded_names, expanded_params)``; expert slices are named
-        ``"<name>[<i>]"``. The grad of each expanded source param is reset
-        to ``None``.
-    """
-    out_names = []
-    out_params = []
-    for name, param in zip(names, params):
-        matches_expert_key = expert_keys and any(key in name
-                                                 for key in expert_keys)
-        if not matches_expert_key:
-            assert param.data.ndim <= 2, (
-                f"Param {name} has ndim={param.data.ndim} but does not match "
-                f"expert_keys={expert_keys}. If this is an expert param, "
-                f"add its key to expert_keys.")
-            out_names.append(name)
-            out_params.append(param)
-            continue
-        grad = param.grad
-        assert grad is not None, (
-            f"Expert param {name} must have grad set before expansion")
-        tp_mesh = None
-        tp_placements_2d = None
-        if isinstance(param.data, DTensor):
-            local_data = param.to_local()
-            local_grad = grad.to_local() if isinstance(grad, DTensor) else grad
-            # Mesh dims whose placement shards a non-expert tensor dim.
-            tp_mesh_dims = [
-                mesh_dim
-                for mesh_dim, placement in enumerate(param.placements)
-                if _is_shard(placement) and placement.dim != 0
-            ]
-            # Dropping dim 0 shifts every remaining tensor dim down by one.
-            tp_placements_2d = [
-                Shard(param.placements[mesh_dim].dim - 1)
-                for mesh_dim in tp_mesh_dims
-            ]
-            if tp_mesh_dims:
-                tp_dim_names = tuple(
-                    param.device_mesh.mesh_dim_names[mesh_dim]
-                    for mesh_dim in tp_mesh_dims)
-                mesh_key = (tp_dim_names[0]
-                            if len(tp_dim_names) == 1 else tp_dim_names)
-                tp_mesh = param.device_mesh[mesh_key]
-        else:
-            local_data = param.data
-            local_grad = grad
-        for expert_idx in range(local_data.shape[0]):
-            data_slice = local_data[expert_idx]
-            grad_slice = local_grad[expert_idx]
-            if tp_mesh is None:
-                expert_param = torch.nn.Parameter(data_slice,
-                                                  requires_grad=False)
-                expert_param.grad = grad_slice
-            else:
-                # Keep the slice distributed over the TP submesh so the
-                # parallel pipeline performs the TP gather/scatter.
-                wrapped_data = DTensor.from_local(data_slice,
-                                                  device_mesh=tp_mesh,
-                                                  placements=tp_placements_2d)
-                wrapped_grad = DTensor.from_local(grad_slice,
-                                                  device_mesh=tp_mesh,
-                                                  placements=tp_placements_2d)
-                expert_param = torch.nn.Parameter(wrapped_data,
-                                                  requires_grad=False)
-                expert_param.grad = wrapped_grad
-            out_names.append(f"{name}[{expert_idx}]")
-            out_params.append(expert_param)
-        # Drop the reference so the expert grad storage can be freed once the
-        # pipeline is done with the slices.
-        param.grad = None
-    return out_names, out_params
-class Muon(torch.optim.Optimizer):
-    """
-    Muon - MomentUm Orthogonalized by Newton-schulz
-    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
-    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
-    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
-    the advantage that it can be stably run in bfloat16 on the GPU.
-    Some warnings:
-    - We believe this optimizer is unlikely to work well for training with small batch size.
-    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
-    Arguments:
-        params: Parameter groups to optimize. Every group must set the `use_muon` key
-            (see `get_default_muon_param_groups`); groups with `use_muon=False` are optimized by AdamW.
-        lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
-        momentum: The momentum used by the internal SGD. (0.95 is a good default)
-        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
-        ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
-        weight_decay: The weight decay for Muon and AdamW.
-            Parameters that are {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW instead.
-        adamw_lr: The learning rate for the internal AdamW.
-        adamw_betas: The betas for the internal AdamW.
-        adamw_eps: The epsilon for the internal AdamW.
-        none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
-        debug: Whether to print debug information.
-        clip_config : Configuration for QK clipping. Expected keys:
-            - "q_indices" (list[int]): Indices of query heads to consider.
-            - "k_indices" (list[int]): Indices of key heads to consider.
-            - "head_dim" (int): Dimensionality of each attention head.
-            - "threshold" (float): Threshold value; heads whose QK logits exceed
-            this value will be scaled down.
-            Default is:
-                {
-                    "q_indices": [],
-                    "k_indices": [],
-                    "head_dim": 128,
-                    "threshold": 100
-                }
-        warmup_step : How many all2all gather, compute operations are launched in advance
-                      before the corresponding all2all scatter steps begin.
-                      A higher warmup_step increases memory usage but can improve
-                      performance by overlapping communication.
-                      Parallel muon only.
-        chunk_size : Batch size of parameters to process in each
-                     all2all gather/compute/scatter step.
-                     Use shard ranks * DEFAULT_CHUNK_SIZE_RATIO when -1 is specified.
-        use_distributed_muon: Use distributed muon by Liu et al. (2024).
-                              For testing purpose only.
-        small_param_numel_threshold: Threshold for classifying parameters as small and falling back to distributed Muon
-        expert_keys: List of strings to identify expert-parallel parameters.
-                     If any key appears in a parameter's name, its outermost
-                     dimension is treated as the expert dimension and expanded
-                     into per-expert 2D params for Muon.  For example,
-                     ``expert_keys=["experts"]`` matches any param whose name
-                     contains "experts".  3D+ params not matched by any key
-                     will raise an error.
-    """
-    def __init__(self,
-                 params,
-                 lr=1e-3,
-                 momentum=0.95,
-                 nesterov=True,
-                 ns_steps=5,
-                 weight_decay=0.1,
-                 adamw_betas=(0.9, 0.95),
-                 adamw_eps=1e-8,
-                 none_grad=True,
-                 debug=False,
-                 clip_config=None,
-                 warmup_step=5,
-                 chunk_size=-1,
-                 use_distributed_muon=False,
-                 small_param_numel_threshold=65536,
-                 expert_keys=None):
-        """Validate the param groups and record the Muon-specific settings.
-        ``params`` must be an iterable of param-group dicts and every group
-        must set ``use_muon`` explicitly. The remaining arguments either become
-        per-group defaults (``lr``, ``weight_decay``, ``momentum``,
-        ``nesterov``, ``ns_steps``, ``adamw_betas``, ``adamw_eps``,
-        ``none_grad``) or are stored on the optimizer instance.
-        Raises:
-            ValueError: If ``params`` is a generator, or if any group is not a
-                dict or does not set ``use_muon``.
-        """
-        defaults = dict(
-            lr=lr,
-            weight_decay=weight_decay,
-            momentum=momentum,
-            nesterov=nesterov,
-            ns_steps=ns_steps,
-            adamw_betas=adamw_betas,
-            adamw_eps=adamw_eps,
-            none_grad=none_grad,
-            use_muon=True,
-        )
-        error_message = "The key 'use_muon' is not set in parameter group {idx}. Assuming all parameters in the group will use muon optimization, which may lead to unexpected behavior."
-        instruction_code = "\n\n please follow this code snippet \n```optimizer = get_kernel('motif-technologies/optimizer')\n\n\nparams = optimizer.muon.get_default_muon_param_groups(model)\n\noptim = optimizer.Muon(params, ...)```"
-        # A generator (e.g. ``model.parameters()``) cannot carry ``use_muon``.
-        if isinstance(params, types.GeneratorType):
-            raise ValueError(error_message.format(idx=0) + instruction_code)
-        # Materialize once: a one-shot iterator would otherwise be exhausted by
-        # the validation loop and reach ``Optimizer.__init__`` empty.
-        params = list(params)
-        for _idx, param_group in enumerate(params):
-            # Non-dict entries (bare tensors, dict keys, ...) cannot declare
-            # ``use_muon`` either, so report them with the same guidance.
-            if (not isinstance(param_group, dict)
-                    or param_group.get("use_muon", None) is None):
-                raise ValueError(
-                    error_message.format(idx=_idx) + instruction_code)
-        super().__init__(params, defaults)
-        self.debug = debug
-        # Fall back to a QK-clip config with no head indices selected.
-        self.clip_config = clip_config if clip_config is not None else {
-            "q_indices": [],
-            "k_indices": [],
-            "head_dim": 128,
-            "threshold": 100,
-        }
-        self.warmup_step = warmup_step
-        self.chunk_size = chunk_size
-        self.use_distributed_muon = use_distributed_muon
-        self.small_param_numel_threshold = small_param_numel_threshold
-        self.expert_keys = expert_keys
-    def _calc_flops(self, G, steps):
-        """Return the FLOP estimate used to rank params by orthogonalization cost.
-        ``G`` must be 2D. The estimate depends only on the two side lengths,
-        with the shorter side taking the cubic term, and scales linearly with
-        ``steps``.
-        """
-        assert len(G.shape) == 2
-        short, long = sorted(G.shape)
-        per_step = (2 * short**3 + 4 * short**2 * long + 2 * short * long +
-                    3 * short**2)
-        return steps * per_step
-    def get_shard_mesh(self, p):
-        """Build the shard mesh description for the DTensor parameter ``p``.
-        Returns the ``(shard_mesh, shard_pg, shard_placements)`` triple that
-        ``construct_shard_mesh`` derives from ``p.placements`` and
-        ``p.device_mesh``. A non-DTensor ``p`` fails the assertion.
-        """
-        assert isinstance(
-            p, DTensor), "Parallel Muon only supports DTensor parameters."
-        mesh, process_group, placements = construct_shard_mesh(
-            p.placements, p.device_mesh)
-        return mesh, process_group, placements
-    def init_state_and_assign_params(self, names, params, group, qk_logits):
-        """Order params by estimated cost and build their per-param Muon state.
-        Args:
-            names: Parameter names, aligned with ``params``.
-            params: DTensor parameters; all must share one device mesh and one
-                placement tuple (a ``ValueError`` is raised otherwise).
-            group: Param group dict; only ``group["ns_steps"]`` is read here.
-            qk_logits: Forwarded to ``get_qk_clip_info`` for each param.
-        Returns:
-            ``(param_to_state, ordered_params)`` where ``param_to_state`` maps
-            ``id(p)`` to a ``_muon_state`` and ``ordered_params`` is ``params``
-            sorted by decreasing FLOP estimate.
-        """
-        param_to_state = {}
-        param_to_flops = {}
-        total_flops = 0
-        for p in params:
-            g = p.grad
-            if g is None:
-                # NOTE(review): a skipped param has no ``param_to_flops`` entry,
-                # so the sort below would raise KeyError for it. ``_step_muon``
-                # filters grad-less params out before calling ``parallel``.
-                continue
-            assert g.ndim == 2, "Muon only supports 2D parameters."
-            flops = self._calc_flops(g, group["ns_steps"])
-            param_to_flops[id(p)] = flops
-            total_flops += flops
-        if self.debug:
-            logger.debug("Total TFLOPs for Muon: %.2f TFLOPs",
-                         total_flops / 1e12)
-        # Most expensive params first; ``sorted`` is stable, so params with
-        # equal cost keep their incoming order.
-        paired = list(zip(names, params))
-        paired_sorted = sorted(paired,
-                               key=lambda x: param_to_flops[id(x[1])],
-                               reverse=True)
-        names_sorted, params_sorted = zip(*paired_sorted)
-        ordered_names = list(names_sorted)
-        ordered_params = list(params_sorted)
-        round_robin = 0
-        # The first param defines the mesh/placements every other param must match.
-        mesh = ordered_params[0].device_mesh
-        placements = ordered_params[0].placements
-        shard_mesh, shard_pg, shard_placements = self.get_shard_mesh(
-            ordered_params[0])
-        shard_mesh_flattened = shard_mesh.mesh.flatten()
-        num_ranks = dist.get_world_size(group=shard_pg)
-        for n, p in zip(ordered_names, ordered_params):
-            if mesh != p.device_mesh:
-                raise ValueError("All parameters must be on the same mesh.")
-            if placements != p.placements:
-                raise ValueError("All parameters must have same placements.")
-            # Hand params out round-robin over the flattened shard mesh; the
-            # mesh entry is reduced modulo the shard group's size.
-            worker_rank = shard_mesh_flattened[round_robin].item() % num_ranks
-            round_robin = (round_robin + 1) % len(shard_mesh_flattened)
-            qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
-            # Precompute per-rank indices and numels for all-to-all.
-            rank_indices: dict[int, tuple] = {}
-            rank_numels: dict[int, int] = {}
-            for r in range(num_ranks):
-                indices = get_slices_of_dtensor(p, r, shard_mesh,
-                                                shard_placements)
-                rank_indices[r] = indices
-                # Element count of rank r's piece: product of the per-dim extents.
-                numel = 1
-                for idx, dim_size in zip(indices, p.shape):
-                    if isinstance(idx, slice):
-                        # Length of the slice once clamped to ``dim_size``
-                        # (ceil division by the step).
-                        start, stop, step = idx.indices(dim_size)
-                        numel *= max(0, (stop - start + (step - 1)) // step)
-                    else:
-                        # Non-slice entries are sized with ``len``.
-                        numel *= len(idx)
-                rank_numels[r] = numel
-            param_to_state[id(p)] = _muon_state(
-                worker_rank=worker_rank,
-                process_group=shard_pg,
-                rank_indices=rank_indices,
-                rank_numels=rank_numels,
-                name=n,
-                qk_clip_state=qk_clip_state,
-            )
-        return param_to_state, ordered_params
-    def base(self, names, params, group, lr, weight_decay, qk_logits):
-        """Apply the Muon update locally to each param, without communication.
-        Momentum has already been folded into ``p.grad`` by ``_step_muon``.
-        Params without a grad are skipped. For the rest, the grad is cast to
-        ``COMM_DTYPE`` and orthogonalized, the param is updated in place, and
-        QK clipping is applied when ``get_qk_clip_info`` and
-        ``compute_scales`` both return a value.
-        """
-        for name, param in zip(names, params):
-            grad = param.grad
-            if grad is None:
-                continue
-            update = _zeropower_via_newtonschulz5(grad.to(COMM_DTYPE),
-                                                  steps=group["ns_steps"])
-            update_p(param, update, lr, adjust_lr_for_muon(lr, param.shape),
-                     weight_decay)
-            clip_state = get_qk_clip_info(self.clip_config, name, qk_logits)
-            if clip_state is None:
-                continue
-            scales = compute_scales(param, clip_state)
-            if scales is not None:
-                qk_clip(param, scales, clip_state.head_dim)
-    def distributed_muon(
-        self,
-        names: list[str],
-        params: list[torch.nn.Parameter],
-        group: dict[str, Any],
-        lr: float,
-        weight_decay: float,
-        qk_logits: list[torch.Tensor | DTensor] | None,
-    ):
-        """Distributed Muon (Liu et al.): update every param on its full tensor.
-        Momentum has already been folded into ``p.grad`` by ``_step_muon``.
-        For a DTensor param the full grad and full weight are materialized,
-        the update and optional QK clipping run on the full copy, and the
-        result is redistributed to the original placements and copied back.
-        Plain tensors are updated in place directly.
-        """
-        for name, param in zip(names, params):
-            grad = param.grad
-            if grad is None:
-                continue
-            is_sharded = isinstance(param.data, DTensor)
-            if is_sharded:
-                # Gather the grad first, then the weight.
-                full_grad = grad.full_tensor()
-                full_param = param.data.full_tensor()
-            else:
-                full_grad = grad
-                full_param = param
-            full_update = _zeropower_via_newtonschulz5(
-                full_grad.to(COMM_DTYPE), steps=group["ns_steps"])
-            update_p(full_param, full_update, lr,
-                     adjust_lr_for_muon(lr, full_param.shape), weight_decay)
-            clip_state = get_qk_clip_info(self.clip_config, name, qk_logits)
-            if clip_state is not None:
-                scales = compute_scales(full_param, clip_state)
-                if scales is not None:
-                    qk_clip(full_param, scales, clip_state.head_dim)
-            if not is_sharded:
-                continue
-            # Re-wrap the updated full copy as replicated on every mesh dim,
-            # then reshard it to the param's placements and write it back.
-            mesh_ndim = len(param.device_mesh.mesh.shape)
-            replicated = DTensor.from_local(
-                full_param,
-                device_mesh=param.device_mesh,
-                placements=[Replicate() for _ in range(mesh_ndim)],
-            )
-            resharded = replicated.redistribute(
-                device_mesh=param.device_mesh,
-                placements=param.placements,
-            )
-            param.copy_(resharded)
-    def parallel(self, names, params, group, lr, weight_decay, qk_logits):
-        """
-        Run one parallel Muon step over sharded DTensor params.
-        The params are ordered and assigned to worker ranks, cut into chunks
-        of ``chunk_size`` and each chunk is driven by a
-        :func:`muon_chunk_pipeline` generator.  :func:`run_pipeline` keeps up
-        to ``warmup_step + 1`` of those generators in flight so communication
-        and computation of different chunks overlap.
-        ``chunk_size == -1`` selects ``shard ranks * DEFAULT_CHUNK_SIZE_RATIO``;
-        any other non-positive value raises ``ValueError``.
-        """
-        # Momentum is already applied by _step_muon before this method.
-        param_to_state, ordered_params = self.init_state_and_assign_params(
-            names, params, group, qk_logits)
-        # All params of this call share one shard process group.
-        shard_pg = param_to_state[id(ordered_params[0])].process_group
-        rank = dist.get_rank(group=shard_pg)
-        if self.chunk_size == -1:
-            chunk_size = dist.get_world_size(
-                shard_pg) * DEFAULT_CHUNK_SIZE_RATIO
-        elif self.chunk_size > 0:
-            chunk_size = self.chunk_size
-        else:
-            raise ValueError("chunk_size must be -1 or a positive integer.")
-        # Lazily create one pipeline generator per chunk.
-        chunk_pipelines = (muon_chunk_pipeline(
-            params=ordered_params[offset:offset + chunk_size],
-            param_to_state=param_to_state,
-            rank=rank,
-            ns_steps=group["ns_steps"],
-            lr=lr,
-            weight_decay=weight_decay,
-            none_grad=group["none_grad"],
-        ) for offset in range(0, len(ordered_params), chunk_size))
-        with record_function("muon::barrier"):
-            dist.barrier()
-        with record_function("muon::pipeline"):
-            run_pipeline(chunk_pipelines, max_concurrent=self.warmup_step + 1)
-    def _step_muon(self, group, qk_logits=None):
-        """Run one Muon update for a single param group.
-        Momentum is folded into every available grad first, then expert
-        params are expanded into per-expert 2D params. After that each param
-        with a grad is routed as follows:
-        * ``use_distributed_muon`` set: everything goes to ``distributed_muon``.
-        * plain tensors and fully replicated DTensors: ``base``.
-        * other DTensors with at most ``small_param_numel_threshold``
-          elements: ``distributed_muon``.
-        * remaining DTensors: ``parallel``, called once per distinct
-          ``(placements, device_mesh)`` pair.
-        """
-        params = group["params"]
-        lr = group["lr"]
-        weight_decay = group["weight_decay"]
-        momentum = group["momentum"]
-        names = group["names"]
-        # Apply momentum to all params before routing/expansion.
-        with record_function("muon::momentum"):
-            for _, param in zip(names, params):
-                grad = param.grad
-                if grad is not None:
-                    param.grad = update_g(self.state, param, grad, group,
-                                          momentum)
-        # Expand expert params by splitting on dim 0.
-        names, params = _expand_expert_params(names, params, self.expert_keys)
-        if self.use_distributed_muon:
-            self.distributed_muon(names=names,
-                                  params=params,
-                                  group=group,
-                                  lr=lr,
-                                  weight_decay=weight_decay,
-                                  qk_logits=qk_logits)
-            return
-        # Routing buckets, each kept as parallel name/param lists.
-        local_names, local_params = [], []
-        small_names, small_params = [], []
-        sharded_names, sharded_params = [], []
-        for name, param in zip(names, params):
-            if param is None or param.grad is None:
-                continue
-            if not isinstance(param.data, DTensor):
-                if not isinstance(param.data, torch.Tensor):
-                    raise TypeError(
-                        f"Unsupported parameter type: {type(param.data)}")
-                target_names, target_params = local_names, local_params
-            elif all(
-                    isinstance(placement, Replicate)
-                    for placement in param.placements):
-                target_names, target_params = local_names, local_params
-            elif param.data.numel() <= self.small_param_numel_threshold:
-                # For simplicity, small sharded params use distributed Muon.
-                target_names, target_params = small_names, small_params
-            else:
-                target_names, target_params = sharded_names, sharded_params
-            target_params.append(param)
-            target_names.append(name)
-        logger.debug(
-            f"[Muon] {len(sharded_params)} DTensors, {len(local_params)} Tensors, "
-            f"{len(small_params)} Small DTensors")
-        if len(small_params) > 0:
-            if not dist.is_initialized():
-                raise RuntimeError(
-                    "Parallel Muon requires torch.distributed to be initialized."
-                )
-            self.distributed_muon(
-                params=small_params,
-                names=small_names,
-                group=group,
-                lr=lr,
-                weight_decay=weight_decay,
-                qk_logits=qk_logits,
-            )
-        if len(sharded_params) > 0:
-            if not dist.is_initialized():
-                raise RuntimeError(
-                    "Parallel Muon requires torch.distributed to be initialized."
-                )
-            # To support different placements, params are grouped by
-            # (placements, device_mesh) and parallel Muon runs per group.
-            by_layout = defaultdict(lambda: ([], []))
-            for param, name in zip(sharded_params, sharded_names):
-                layout_names, layout_params = by_layout[(param.placements,
-                                                         param.device_mesh)]
-                layout_names.append(name)
-                layout_params.append(param)
-            for layout_names, layout_params in by_layout.values():
-                self.parallel(
-                    layout_names,
-                    layout_params,
-                    group,
-                    lr=lr,
-                    weight_decay=weight_decay,
-                    qk_logits=qk_logits,
-                )
-        if len(local_params) > 0:
-            self.base(
-                local_names,
-                local_params,
-                group,
-                lr=lr,
-                weight_decay=weight_decay,
-                qk_logits=qk_logits,
-            )
-    @torch.no_grad
-    def step(self, closure=None, qk_logits=None):
-        """Run one optimization step over every param group.
-        Groups with a truthy ``use_muon`` are updated by ``_step_muon``; all
-        other groups are handed to ``step_adamw``.
-        Args:
-            closure (Callable, optional): Re-evaluates the model and returns
-                the loss; it is called with gradients enabled.
-            qk_logits (dict[int, Tensor], optional): Maps layer indices to 1D
-                tensors of shape (num_heads,) holding the maximum QK logits
-                across all tokens, computed as
-                (1 / sqrt(head_dim)) * (Q @ K^T). Forwarded to the Muon groups.
-        Returns:
-            The closure's return value, or ``None`` when no closure is given.
-        """
-        if closure is None:
-            loss = None
-        else:
-            with torch.enable_grad():
-                loss = closure()
-        for group in self.param_groups:
-            if not group["use_muon"]:
-                step_adamw(self.state, group)
-                continue
-            self._step_muon(group, qk_logits=qk_logits)
-        return loss

build/torch210-cxx11-cu130-x86_64-linux/newton_schulz.py DELETED Viewed

@@ -1,50 +0,0 @@
-import torch
-from .matmul_transpose_triton import matmul_transpose_assign
-# dtype that gradients are cast to before orthogonalization;
-# ``_zeropower_via_newtonschulz5`` asserts that its input uses it.
-COMM_DTYPE = torch.bfloat16
-# Parallel Muon multiplies the shard-group world size by this ratio to pick
-# its default chunk size (used when ``chunk_size`` is -1).
-DEFAULT_CHUNK_SIZE_RATIO = 4
-# This code snippet is a modified version adapted from the following GitHub repositories:
-# https://github.com/KellerJordan/Muon/blob/master/muon.py
-# Muon's Newton–Schulz iteration causes high variance in singular values
-# Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
-@torch.no_grad()
-# matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
-def _zeropower_via_newtonschulz5(G, steps):
-    """
-    Approximately orthogonalize the 2D matrix ``G`` by a quintic Newton-Schulz iteration.
-    The coefficients are chosen to maximize the slope at zero, even past the
-    point where the iteration still converges to one everywhere on the
-    interval. The result is therefore not UV^T (with USV^T = G the SVD) but
-    something like US'V^T with S_{ii}' ~ Uniform(0.5, 1.5), which turns out
-    not to hurt model performance relative to UV^T.
-    ``G`` must already be in ``COMM_DTYPE``. A tall input is transposed so the
-    iteration works on the wide orientation, and transposed back at the end.
-    NOTE(review): ``steps`` is accepted but never read; the loop always runs
-    the five coefficient triples listed below.
-    """
-    assert len(G.shape) == 2
-    assert G.dtype == COMM_DTYPE
-    is_tall = G.size(0) > G.size(1)
-    X = G.T if is_tall else G  # no manual typecast
-    # Ensure spectral norm is at most 1
-    X = X / (X.norm() + 1e-7)
-    side = X.size(0)
-    # Scratch buffers reused by every iteration.
-    gram = torch.empty(side, side, dtype=X.dtype, device=X.device)
-    gram_sq = torch.empty(side, side, dtype=X.dtype, device=X.device)
-    # One (a, b, c) triple per iteration.
-    coefficients = (
-        (4.0848, -6.8946, 2.9270),
-        (3.9505, -6.3029, 2.6377),
-        (3.7418, -5.5913, 2.3037),
-        (2.8769, -3.1427, 1.2046),
-        (2.8366, -3.0525, 1.2012),
-    )
-    for a, b, c in coefficients:
-        matmul_transpose_assign(X, gram)
-        matmul_transpose_assign(gram, gram_sq)
-        # gram <- b * gram + c * gram_sq
-        gram.mul_(b).add_(gram_sq, alpha=c)
-        # X <- a * X + gram @ X
-        X = torch.addmm(X, gram, X, alpha=1.0, beta=a)
-    return X.T if is_tall else X

build/torch210-cxx11-cu130-x86_64-linux/optimizer/__init__.py DELETED Viewed

@@ -1,26 +0,0 @@
-import ctypes
-import sys
-import importlib
-from pathlib import Path
-from types import ModuleType
-def _import_from_path(file_path: Path) -> ModuleType:
-    """Execute the Python file at ``file_path`` and return it as a module.
-    The module is registered in ``sys.modules`` under a name derived from the
-    hash of the absolute path, so it cannot collide with a regular import of
-    the same module name.
-    Raises:
-        ImportError: If no import spec (or no loader) can be built for
-            ``file_path``.
-        Exception: Whatever the module's own top-level code raises; the
-            partially initialised module is removed from ``sys.modules`` first.
-    """
-    # ``import importlib`` alone does not guarantee that the ``util``
-    # submodule is loaded, so import it explicitly before using it.
-    import importlib.util
-    # We cannot use the module name as-is, after adding it to `sys.modules`,
-    # it would also be used for other imports. So, we make a module name that
-    # depends on the path for it to be unique using the hex-encoded hash of
-    # the path.
-    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
-    module_name = path_hash
-    spec = importlib.util.spec_from_file_location(module_name, file_path)
-    if spec is None or spec.loader is None:
-        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
-    # ``module_from_spec`` never returns None, so no extra check is needed.
-    module = importlib.util.module_from_spec(spec)
-    # Register before executing so the module can be found while it runs.
-    sys.modules[module_name] = module
-    try:
-        spec.loader.exec_module(module)
-    except BaseException:
-        # Do not leave a half-initialised module behind.
-        sys.modules.pop(module_name, None)
-        raise
-    return module
-# Load the ``__init__.py`` one directory above this package and copy all of
-# its module-level names into this module's namespace.
-globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch210-cxx11-cu130-x86_64-linux/pipeline.py DELETED Viewed

@@ -1,390 +0,0 @@
-import logging
-from typing import Generator
-import torch
-import torch.distributed as dist
-from torch.distributed.tensor import DTensor
-from torch.profiler import record_function
-from .core import _muon_state, adjust_lr_for_muon, update_p
-from .newton_schulz import COMM_DTYPE, _zeropower_via_newtonschulz5
-from .qk_clip import compute_scales
-logger = logging.getLogger(__name__)
-# ======================================================================
-# Stage helpers
-# ======================================================================
-def _launch_gather(
-    params: list[DTensor],
-    owned_params: list[DTensor],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-    num_ranks: int,
-    process_group: dist.ProcessGroup,
-) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
-    """Allocate gather buffers, build send/recv, and launch async all-to-all.
-    Returns:
-        work: Async operation handle.
-        recv_buf: Flat receive buffer (needed by ``_complete_gather``).
-        gathered_grads: ``{id(p): empty_tensor}`` for owned params,
-            ``None`` for non-owned.
-        recv_counts: Per-source-rank element counts.
-    """
-    # Allocate gathered-grad buffers
-    gathered_grads: dict[int, torch.Tensor | None] = {}
-    for p in params:
-        state = param_to_state[id(p)]
-        if rank == state.worker_rank:
-            gathered_grads[id(p)] = torch.empty(p.shape,
-                                                dtype=COMM_DTYPE,
-                                                device="cuda")
-        else:
-            gathered_grads[id(p)] = None
-    # Build send buffer
-    per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
-    send_counts = [0] * num_ranks
-    for p in params:
-        state = param_to_state[id(p)]
-        dst = state.worker_rank
-        assert dst < num_ranks
-        shard_elems = state.rank_numels[rank]
-        g = p.grad
-        g = g.to_local().to(COMM_DTYPE).contiguous()
-        assert g.numel() == shard_elems
-        per_dst[dst].append(g.view(-1))
-        send_counts[dst] += shard_elems
-    assert any(
-        len(v) > 0 for v in
-        per_dst), "At least one destination rank must receive a sharded tensor"
-    per_dst_flat = [t for dst in per_dst for t in dst]
-    send_buf = torch.cat(per_dst_flat, dim=0)
-    # Build recv buffer
-    recv_counts = [0] * num_ranks
-    for src in range(num_ranks):
-        total = 0
-        for p in owned_params:
-            state = param_to_state[id(p)]
-            assert state.worker_rank == rank
-            total += state.rank_numels[src]
-        recv_counts[src] = total
-    recv_buf = torch.empty(sum(recv_counts), dtype=COMM_DTYPE, device="cuda")
-    # Launch async all-to-all
-    logger.debug(f"send_buf size: {send_buf.numel()}, "
-                 f"recv_buf size: {recv_buf.numel()}, "
-                 f"recv_counts: {recv_counts}, "
-                 f"send_counts: {send_counts}, "
-                 f"process_group: {str(process_group)}")
-    work = dist.all_to_all_single(
-        recv_buf,
-        send_buf,
-        output_split_sizes=recv_counts,
-        input_split_sizes=send_counts,
-        group=process_group,
-        async_op=True,
-    )
-    return work, recv_buf, gathered_grads, recv_counts
-def _complete_gather(
-    recv_buf: torch.Tensor,
-    recv_counts: list[int],
-    owned_params: list[DTensor],
-    gathered_grads: dict[int, torch.Tensor | None],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-) -> None:
-    """Reconstruct gathered grads from the recv buffer (in-place)."""
-    off = 0
-    for src in range(len(recv_counts)):
-        if recv_counts[src] == 0:
-            continue
-        block = recv_counts[src]
-        inner_off = 0
-        for p in owned_params:
-            state = param_to_state[id(p)]
-            assert state.worker_rank == rank
-            indices = state.rank_indices[src]
-            shard_view = gathered_grads[id(p)][indices]
-            n = shard_view.numel()
-            assert n > 0
-            sg = recv_buf.narrow(0, off + inner_off, n)
-            sg = sg.reshape(shard_view.shape)
-            gathered_grads[id(p)][indices] = sg
-            inner_off += n
-        assert inner_off == block
-        off += block
-def _compute_ns(
-    owned_params: list[DTensor],
-    gathered_grads: dict[int, torch.Tensor | None],
-    ns_steps: int,
-) -> dict[int, torch.Tensor | None]:
-    """Run Newton-Schulz orthogonalization on owned parameters.
-    Returns:
-        computed_us: ``{id(p): orthogonalized_update}`` for owned params.
-    """
-    computed_us: dict[int, torch.Tensor | None] = {}
-    for p in owned_params:
-        u = _zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
-        gathered_grads[id(p)] = None  # free gathered grad
-        computed_us[id(p)] = u
-    return computed_us
-def _launch_scatter(
-    params: list[DTensor],
-    owned_params: list[DTensor],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-    num_ranks: int,
-    process_group: dist.ProcessGroup,
-    computed_us: dict[int, torch.Tensor | None],
-) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor], list[int]]:
-    """Allocate scatter buffers, build send/recv, and launch async all-to-all.
-    Returns:
-        work: Async operation handle.
-        recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
-        scattered_us: ``{id(p): empty_local_tensor}`` for all params.
-        recv_counts: Per-source-rank element counts.
-    """
-    # Allocate scattered-u buffers
-    scattered_us: dict[int, torch.Tensor] = {}
-    for p in params:
-        scattered_us[id(p)] = torch.empty_like(p.to_local(), dtype=COMM_DTYPE)
-    # Build send buffer (from computed_us on owner ranks)
-    per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
-    send_counts = [0] * num_ranks
-    if owned_params:
-        for p in owned_params:
-            state = param_to_state[id(p)]
-            assert computed_us[id(p)] is not None
-            u_full = computed_us[id(p)].to(COMM_DTYPE).contiguous()
-            total_sent = 0
-            for dst_rank in range(num_ranks):
-                indices = state.rank_indices[dst_rank]
-                su = u_full[indices].flatten()
-                n = su.numel()
-                assert n > 0
-                per_dst[dst_rank].append(su)
-                send_counts[dst_rank] += n
-                total_sent += n
-            assert total_sent == u_full.numel()
-    lengths = [len(v) for v in per_dst]
-    if all(l > 0 for l in lengths):
-        assert all(
-            l == lengths[0] for l in lengths
-        ), "All destination ranks must have the same number of sharded tensor"
-        per_dst_flat = [t for dst in per_dst for t in dst]
-        send_buf = torch.cat(per_dst_flat, dim=0)
-    else:
-        send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
-    # Build recv buffer
-    recv_counts = [0] * num_ranks
-    for src in range(num_ranks):
-        total = 0
-        for p in params:
-            state = param_to_state[id(p)]
-            if state.worker_rank != src:
-                continue
-            total += state.rank_numels[rank]
-        recv_counts[src] = total
-    recv_total = sum(recv_counts)
-    assert recv_total > 0
-    recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
-    # Launch async all-to-all
-    work = dist.all_to_all_single(
-        recv_buf,
-        send_buf,
-        output_split_sizes=recv_counts,
-        input_split_sizes=send_counts,
-        group=process_group,
-        async_op=True,
-    )
-    return work, recv_buf, scattered_us, recv_counts
-def _complete_scatter(
-    recv_buf: torch.Tensor,
-    recv_counts: list[int],
-    params: list[DTensor],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-    scattered_us: dict[int, torch.Tensor],
-) -> None:
-    """Copy recv buffer into scattered_us (in-place)."""
-    off = 0
-    for src in range(len(recv_counts)):
-        block = recv_counts[src]
-        if block == 0:
-            continue
-        inner_off = 0
-        for p in params:
-            state = param_to_state[id(p)]
-            if state.worker_rank != src:
-                continue
-            n = state.rank_numels[rank]
-            assert n > 0
-            flat_local = recv_buf.narrow(0, off + inner_off,
-                                         n).view_as(p.to_local())
-            scattered_us[id(p)].copy_(flat_local)
-            inner_off += n
-        assert inner_off == block
-        off += block
-def _update_params(
-    params: list[DTensor],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-    scattered_us: dict[int, torch.Tensor],
-    lr: float,
-    weight_decay: float,
-) -> None:
-    """Apply weight decay, Muon update, and optional QK clipping."""
-    for p in params:
-        state = param_to_state[id(p)]
-        u_dtensor = DTensor.from_local(
-            scattered_us[id(p)],
-            placements=p.placements,
-            device_mesh=p.device_mesh,
-        )
-        adjusted_lr = adjust_lr_for_muon(lr, p.shape)
-        update_p(p, u_dtensor, lr, adjusted_lr, weight_decay)
-        # QK clipping – applied directly on the local tensor to
-        # avoid DTensor sharding-propagation issues with _StridedShard.
-        scales_full = compute_scales(
-            p,
-            state.qk_clip_state) if state.qk_clip_state is not None else None
-        if scales_full is not None:
-            ratio = p.shape[0] // scales_full.shape[0]
-            idx0 = state.rank_indices[rank][0]
-            if isinstance(idx0, slice):
-                start = idx0.start or 0
-                idx0 = torch.arange(start,
-                                    idx0.stop,
-                                    device=scales_full.device)
-            row_scales = scales_full[idx0 // ratio]
-            p._local_tensor.mul_(row_scales.view(-1, 1))
-# ======================================================================
-# Main generator – thin orchestrator that wires stages together.
-# ======================================================================
-@torch.no_grad()
-def muon_chunk_pipeline(
-    params: list[DTensor],
-    param_to_state: dict[int, _muon_state],
-    rank: int,
-    ns_steps: int,
-    lr: float,
-    weight_decay: float,
-    none_grad: bool,
-) -> Generator[None, None, None]:
-    """Process one chunk of parameters through the full Muon pipeline.
-    Stages: gather -> compute (Newton-Schulz) -> scatter -> update.
-    Each ``yield`` lets :func:`run_pipeline` interleave other chunks so
-    that communication and computation overlap across chunks.  Async
-    communication is launched via ``async_op=True`` and completed after
-    the yield with ``work.wait()``.
-    Overlap happens because :func:`run_pipeline` admits one new chunk
-    per iteration (staggered admission).  While chunk *N* does NS
-    compute on the default CUDA stream, chunk *N+1*'s async all-to-all
-    runs concurrently on the NCCL stream — no separate ``comm_stream``
-    is required.
-    Yields exactly **2** times:
-    1. After launching async all-to-all gather.
-    2. After launching async all-to-all scatter.
-    """
-    process_group = param_to_state[id(params[0])].process_group
-    num_ranks = dist.get_world_size(group=process_group)
-    owned_params = [
-        p for p in params if param_to_state[id(p)].worker_rank == rank
-    ]
-    # Stages 1-2: launch async gather.
-    with record_function("muon::launch_gather"):
-        work, recv_buf, gathered_grads, recv_counts = _launch_gather(
-            params, owned_params, param_to_state, rank, num_ranks,
-            process_group)
-        if none_grad:
-            for p in params:
-                p.grad = None
-    yield  # --- YIELD 1: other chunks can launch their gather ---
-    with record_function("muon::wait_gather"):
-        work.wait()
-        _complete_gather(recv_buf, recv_counts, owned_params, gathered_grads,
-                         param_to_state, rank)
-        del recv_buf
-    # Stage 3: Newton-Schulz orthogonalization.
-    with record_function("muon::newton_schulz"):
-        computed_us = _compute_ns(owned_params, gathered_grads, ns_steps)
-        gathered_grads.clear()
-    # Stages 4-5: launch async scatter.
-    with record_function("muon::launch_scatter"):
-        work, recv_buf, scattered_us, recv_counts = _launch_scatter(
-            params, owned_params, param_to_state, rank, num_ranks,
-            process_group, computed_us)
-        computed_us.clear()
-    yield  # --- YIELD 2: other chunks can launch their scatter ---
-    with record_function("muon::wait_scatter"):
-        work.wait()
-        _complete_scatter(recv_buf, recv_counts, params, param_to_state, rank,
-                          scattered_us)
-        del recv_buf
-    # Stage 6: apply parameter updates.
-    with record_function("muon::update_params"):
-        _update_params(params, param_to_state, rank, scattered_us, lr,
-                       weight_decay)
-        scattered_us.clear()

build/torch210-cxx11-cu130-x86_64-linux/qk_clip.py DELETED Viewed

@@ -1,129 +0,0 @@
-import logging
-import math
-from dataclasses import dataclass
-import torch
-from torch.distributed.tensor import DTensor
-logger = logging.getLogger(__name__)
-def parse_qk_layer(name: str) -> tuple[str | None, int]:
-    """
-    Parse a parameter name to check if it is a query/key projection layer
-    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
-    Returns:
-        (kind, layer_idx) or (None, -1) if not matched.
-    Example:
-        'model.3.attn.wq.weight'      -> ('wq', 3)
-        'model.5.attn.wk.weight'      -> ('wk', 5)
-        'model.2.attn.q_proj.weight'  -> ('q_proj', 2)
-        'model.7.attn.k_proj.weight'  -> ('k_proj', 7)
-        'model.4.attn.v_proj.weight'  -> (None, -1)
-    """
-    parts = name.split('.')
-    if len(parts) < 3:
-        return None, -1
-    kind = parts[-2]
-    layer_idx = -1
-    for part in reversed(parts):
-        if part.isdigit():
-            layer_idx = int(part)
-            break
-    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
-        return kind, layer_idx
-    return None, -1
-@dataclass
-class QKClipInfo:
-    """Per-parameter dynamic info computed from config + runtime logits."""
-    kind: str | None  # 'wq'/'q_proj' or 'wk'/'k_proj' or None
-    indices: list[int]  # which heads to consider for clipping
-    head_dim: int  # from config
-    threshold: float  # from config
-    logit: torch.Tensor | None
-def get_qk_clip_info(clip_config, n, qk_logits):
-    """Extract QK clipping info for a named parameter.
-    Args:
-        clip_config: QK clipping configuration dict (or None).
-        n: Parameter name string.
-        qk_logits: Dict mapping layer indices to logit tensors (or None).
-    Returns:
-        QKClipInfo instance with clipping configuration for this parameter.
-    """
-    if clip_config is None:
-        return None
-    head_dim = clip_config.get('head_dim')
-    threshold = clip_config.get('threshold')
-    kind, layer_idx = parse_qk_layer(n)
-    logit, indices = None, []
-    if qk_logits is not None and kind is not None:
-        logit = qk_logits[layer_idx]
-        indices_key = 'q_indices' if 'q' in kind else 'k_indices'
-        indices = clip_config.get(indices_key, []) or []
-        if isinstance(logit, DTensor):
-            # In TP settings, qk_logits may be DTensor
-            # We convert it to full tensor here for simplicity
-            logit = logit.full_tensor()
-    return QKClipInfo(
-        kind=kind,
-        indices=indices,
-        head_dim=head_dim,
-        threshold=threshold,
-        logit=logit,
-    )
-def compute_scales(p, qk_clip_state):
-    """Compute per-head scaling factors for QK clipping.
-    Returns scales tensor if any head exceeds threshold, else None.
-    """
-    kind = qk_clip_state.kind
-    indices = qk_clip_state.indices
-    head_dim = qk_clip_state.head_dim
-    threshold = qk_clip_state.threshold
-    logit = qk_clip_state.logit
-    H_global = p.shape[0] // head_dim
-    scales_full = torch.ones(H_global, device=p.data.device)
-    scaling = 0
-    for logit_idx, head_idx in enumerate(indices):
-        v_ele = float(logit[logit_idx])
-        if v_ele > threshold:
-            new_scale = math.sqrt(threshold / v_ele)
-            if new_scale < scales_full[head_idx]:
-                scales_full[head_idx] = new_scale
-                logger.info(
-                    f"[{kind}] Head {head_idx} exceeded threshold "
-                    f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
-                )
-                scaling += 1
-    return scales_full if scaling > 0 else None
-def qk_clip(p, scales, head_dim):
-    """Apply per-head scaling to a Q/K projection weight matrix."""
-    if isinstance(p, torch.nn.Parameter):
-        W = p.data.view(-1, head_dim, p.data.shape[1])
-        W.mul_(scales.view(-1, 1, 1))
-    else:
-        W = p.view(-1, head_dim, p.shape[1])
-        W.mul_(scales.view(-1, 1, 1))

build/torch210-cxx11-rocm70-x86_64-linux/adamw.py DELETED Viewed

@@ -1,154 +0,0 @@
-from collections import defaultdict
-from typing import cast
-import torch
-from torch.distributed.tensor import DTensor
-def fused_adamw(
-    params: list[torch.Tensor],
-    grads: list[torch.Tensor],
-    exp_avgs: list[torch.Tensor],
-    exp_avg_sqs: list[torch.Tensor],
-    max_exp_avg_sqs: list[torch.Tensor],
-    state_steps: list[torch.Tensor],
-    amsgrad: bool,
-    beta1: float,
-    beta2: float,
-    lr: float | torch.Tensor,
-    weight_decay: float,
-    eps: float,
-    maximize: bool,
-) -> None:
-    if not params:
-        return
-    # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
-    # treating it as a scalar.
-    lr_dict: dict | None = ({
-        lr.device: lr
-    } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else None)
-    grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
-        [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
-         state_steps]  # type: ignore[list-item]
-    )
-    for (device, _), (
-        (
-            device_params_,
-            device_grads_,
-            device_exp_avgs_,
-            device_exp_avg_sqs_,
-            device_max_exp_avg_sqs,
-            device_state_steps_,
-        ),
-            _,
-    ) in grouped_tensors.items():
-        device_params = cast(list[torch.Tensor], device_params_)
-        device_grads = cast(list[torch.Tensor], device_grads_)
-        device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
-        device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
-        device_state_steps = cast(list[torch.Tensor], device_state_steps_)
-        if lr_dict is not None and device not in lr_dict:
-            lr_dict[device] = lr.to(
-                device=device, non_blocking=True)  # type: ignore[union-attr]
-            lr = lr_dict[device]
-        torch._foreach_add_(device_state_steps, 1)
-        func = torch._fused_adamw_
-        func(
-            device_params,
-            device_grads,
-            device_exp_avgs,
-            device_exp_avg_sqs,
-            device_max_exp_avg_sqs,  # type: ignore[arg-type]
-            device_state_steps,
-            amsgrad=amsgrad,
-            lr=lr,  # type: ignore[arg-type]
-            beta1=beta1,
-            beta2=beta2,
-            weight_decay=weight_decay,
-            eps=eps,
-            maximize=maximize,
-        )
-def step_adamw_params(optimizer_state, params, group):
-    """Run fused AdamW on a list of parameters sharing the same placement.
-    Args:
-        optimizer_state: The optimizer's state dict (self.state in Muon).
-        params: List of parameters to update.
-        group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
-    """
-    params_with_grads = []
-    grads = []
-    moment1 = []
-    moment2 = []
-    max_exp_avg_sqs = []
-    state_steps = []
-    lr = group["lr"]
-    beta1, beta2 = group["adamw_betas"]
-    eps = group["adamw_eps"]
-    weight_decay = group["weight_decay"]
-    for p in params:
-        g = p.grad
-        if g is None:
-            continue
-        state = optimizer_state[p]
-        params_with_grads.append(p)
-        grads.append(g)
-        if "step" not in state:
-            state["step"] = (torch.zeros((),
-                                         dtype=torch.float32,
-                                         device=p.device))
-            state["moment1"] = torch.zeros_like(g)
-            state["moment2"] = torch.zeros_like(g)
-        moment1.append(state["moment1"])
-        moment2.append(state["moment2"])
-        if not isinstance(state["step"], torch.Tensor):
-            step_tensor = torch.tensor(state["step"],
-                                       dtype=torch.float32,
-                                       device=p.device)
-        else:
-            step_tensor = state["step"]
-        state_steps.append(step_tensor)
-    fused_adamw(
-        params_with_grads,
-        grads,
-        moment1,
-        moment2,
-        max_exp_avg_sqs,
-        state_steps,
-        amsgrad=False,
-        beta1=beta1,
-        beta2=beta2,
-        lr=lr,
-        weight_decay=weight_decay,
-        eps=eps,
-        maximize=False,
-    )
-def step_adamw(optimizer_state, group):
-    """Dispatch AdamW step, grouping parameters by type and placement.
-    Args:
-        optimizer_state: The optimizer's state dict (self.state in Muon).
-        group: Parameter group dict.
-    """
-    params = group["params"]
-    # group params with its type and placement
-    placement_to_params: dict[tuple, list[torch.Tensor]] = defaultdict(list)
-    for p in params:
-        match p:
-            case DTensor():
-                placement_to_params[tuple([p.placements,
-                                           p.device_mesh])].append(p)
-            case torch.Tensor():
-                placement_to_params[tuple([torch.Tensor, None])].append(p)
-    for group_params in placement_to_params.values():
-        step_adamw_params(optimizer_state, group_params, group)

build/torch210-cxx11-rocm70-x86_64-linux/async_utils.py DELETED Viewed

@@ -1,77 +0,0 @@
-import logging
-from typing import Generator
-logger = logging.getLogger(__name__)
-class _Task:
-    """Internal: wraps a generator, advances one yield at a time."""
-    def __init__(self, generator: Generator[None, None, None], index: int):
-        self._generator = generator
-        self._index = index
-        self._steps_completed = 0
-        self.step()  # run to first yield
-    def step(self) -> bool:
-        try:
-            next(self._generator)
-            self._steps_completed += 1
-            logger.debug("pipeline[%d] completed stage %d", self._index,
-                         self._steps_completed)
-            return True
-        except StopIteration:
-            logger.debug("pipeline[%d] finished after %d stages", self._index,
-                         self._steps_completed)
-            return False
-    def close(self):
-        self._generator.close()
-def run_pipeline(
-    pipelines: Generator[Generator[None, None, None], None, None],
-    max_concurrent: int,
-) -> None:
-    """Run generator-based pipelines with bounded concurrency.
-    Each pipeline is a generator that yields at stage boundaries.
-    The runtime interleaves pipelines so communication and computation
-    overlap across chunks.
-    """
-    if max_concurrent <= 0:
-        raise ValueError(f"max_concurrent must be > 0, got {max_concurrent}")
-    have_new = True
-    task_index = 0
-    previous_tasks: list[_Task] = []
-    try:
-        while have_new or previous_tasks:
-            running_tasks: list[_Task] = []
-            # Admit one new pipeline per iteration (staggered admission).
-            # Admitting one at a time ensures that while chunk N does NS
-            # compute on the default stream, chunk N+1's NCCL all-to-all
-            # runs concurrently on the NCCL stream — creating real
-            # communication/computation overlap on the GPU.
-            if have_new and len(previous_tasks) < max_concurrent:
-                try:
-                    gen = next(pipelines)
-                    task = _Task(gen, task_index)
-                    task_index += 1
-                    running_tasks.append(task)
-                except StopIteration:
-                    have_new = False
-            # Advance every previously-yielded task by one step.
-            for task in previous_tasks:
-                if task.step():
-                    running_tasks.append(task)
-            previous_tasks = running_tasks
-    except BaseException:
-        # Clean up all in-flight generators to release GPU resources.
-        for task in previous_tasks:
-            task.close()
-        raise

build/torch210-cxx11-rocm70-x86_64-linux/core.py DELETED Viewed

@@ -1,116 +0,0 @@
-import math
-from dataclasses import dataclass
-import torch
-import torch.distributed as dist
-from torch.distributed import ProcessGroup
-from torch.distributed.tensor import DTensor
-@dataclass
-class _muon_state:
-    worker_rank: int
-    process_group: ProcessGroup
-    rank_indices: dict[int, tuple]  # local_rank -> per-dim indices
-    rank_numels: dict[int, int]  # local_rank -> numel
-    name: str
-    qk_clip_state: torch.Tensor | None = None
-def update_g(optimizer_state, p, g, group, momentum):
-    """Apply momentum update to gradient.
-    Args:
-        optimizer_state: The optimizer's state dict (self.state in Muon).
-        p: Parameter tensor.
-        g: Gradient tensor.
-        group: Parameter group dict.
-        momentum: Momentum coefficient.
-    Returns:
-        Momentum-updated gradient tensor.
-    """
-    state = optimizer_state[p]
-    buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
-    torch.add(g, buf, alpha=momentum, out=buf)
-    if group["nesterov"]:
-        g.add_(buf, alpha=momentum)
-        return g
-    return buf
-def update_p(p, u, lr, adjusted_lr, weight_decay):
-    """Apply weight decay and orthogonalized update to parameter.
-    Args:
-        p: Parameter (torch.nn.Parameter or DTensor).
-        u: Orthogonalized update tensor.
-        lr: Base learning rate.
-        adjusted_lr: Size-adjusted learning rate.
-        weight_decay: Weight decay coefficient.
-    """
-    if isinstance(p, torch.nn.Parameter):
-        # apply weight decay
-        p.data.mul_(1 - lr * weight_decay)
-        # apply update
-        p.data.add_(u, alpha=-adjusted_lr)
-    else:
-        p.mul_(1 - lr * weight_decay)
-        p.add_(u, alpha=-adjusted_lr)
-def adjust_lr_for_muon(lr, param_shape):
-    """Scale learning rate based on parameter matrix dimensions.
-    Args:
-        lr: Base learning rate.
-        param_shape: Shape of the parameter tensor.
-    Returns:
-        Adjusted learning rate.
-    """
-    A, B = param_shape[:2]
-    # We adjust the learning rate and weight decay based on the size of the parameter matrix
-    # as described in the paper
-    adjusted_ratio = 0.2 * math.sqrt(max(A, B))
-    adjusted_lr = lr * adjusted_ratio
-    return adjusted_lr
-def default_is_muon(name, x, expert_keys=None):
-    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
-    if any(key in name for key in skip_keys):
-        return False
-    effective_ndim = x.ndim
-    if expert_keys and any(key in name for key in expert_keys):
-        effective_ndim -= 1
-    return effective_ndim >= 2
-def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
-    if is_muon_func is None:
-        is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
-    muon_params, muon_names = [], []
-    non_muon_params = []
-    for n, p in model.named_parameters():
-        if not p.requires_grad:
-            continue
-        if is_muon_func(n, p):
-            muon_params.append(p)
-            muon_names.append(n)
-        else:
-            non_muon_params.append(p)
-    return [
-        {
-            "params": muon_params,
-            "names": muon_names,
-            "use_muon": True,
-        },
-        {
-            "params": non_muon_params,
-            "use_muon": False,
-        },
-    ]

build/torch210-cxx11-rocm70-x86_64-linux/distributed/utils.py DELETED Viewed

@@ -1,234 +0,0 @@
-import torch
-import torch.distributed as dist
-from torch.distributed import ProcessGroup
-from torch.distributed.device_mesh import DeviceMesh
-from torch.distributed.tensor import DTensor
-from torch.distributed.tensor.placement_types import (Placement, Shard,
-                                                      _StridedShard)
-def _is_shard(placement: Placement) -> bool:
-    """Check if a placement is a shard type (Shard or _StridedShard).
-    In PyTorch 2.10+, _StridedShard no longer inherits from Shard, so
-    ``placement.is_shard()`` returns False for _StridedShard.  This helper
-    handles both old and new hierarchies.
-    """
-    return isinstance(placement, (Shard, _StridedShard))
-def get_slices_of_dtensor(
-    target: DTensor | torch.Tensor,
-    local_rank: int,
-    shard_mesh: DeviceMesh,
-    shard_placements: tuple[Placement],
-) -> tuple[slice | torch.Tensor, ...]:
-    """
-    Get per-dimension indices for a given rank's shard of the target tensor.
-    Uses ``Shard.local_shard_size_and_offset`` and
-    ``_StridedShard.local_shard_size_and_offset`` for correct handling of
-    both contiguous and strided (non-contiguous) sharding.
-    Args:
-        target (DTensor | torch.Tensor): The target tensor (for its shape).
-        local_rank (int): The local rank within the shard group.
-        shard_mesh (DeviceMesh): The shard mesh (only shard dimensions).
-        shard_placements (tuple[Placement]): The shard placements.
-    Returns:
-        A tuple of indices (one per tensor dim).  Each element is either:
-        - A ``slice`` (for contiguous or unsharded dims)
-        - A 1-D ``torch.LongTensor`` of indices (for strided sharding)
-    Raises:
-        NotImplementedError: If the current size of a sharded tensor dim is
-            not divisible by the size of the mesh dim that shards it.
-    """
-    # find the global rank of the local rank in the shard mesh
-    # (local ranks index the mesh's global ranks in ascending order)
-    rank = sorted(shard_mesh.mesh.flatten().tolist())[local_rank]
-    # Coordinates of that global rank inside the mesh tensor: one per mesh dim.
-    rank_coords = (shard_mesh.mesh == rank).nonzero()
-    assert len(rank_coords) == 1
-    rank_coords = tuple(rank_coords[0].tolist())
-    assert len(rank_coords) == len(shard_placements)
-    # Track per-shard-dim indices, keyed by tensor dim.
-    # A tensor dim missing from this dict has not been sharded yet.
-    dim_indices: dict[int, torch.Tensor] = {}
-    # Caution: Assuming replicate-to-shard of the shard mesh goes with
-    # left-to-right sharding. This is ensured by the sorting logic of
-    # construct_shard_mesh function.
-    for mesh_dim_idx, (rank_coord, placement) in enumerate(
-            zip(rank_coords, shard_placements)):
-        assert _is_shard(placement)
-        num_chunks = shard_mesh.mesh.shape[mesh_dim_idx]
-        shard_dim = placement.dim
-        # Current effective size on this dim (may already be sub-sharded)
-        if shard_dim in dim_indices:
-            curr_size = len(dim_indices[shard_dim])
-        else:
-            curr_size = target.size()[shard_dim]
-        # Uneven sharding is not supported.
-        if curr_size % num_chunks != 0:
-            raise NotImplementedError(
-                f"Dimension size {curr_size} is not divisible "
-                f"by number of ranks {num_chunks} for shard "
-                f"placement on dim {shard_dim}. (shape: {target.shape})")
-        # Compute indices for this level of sharding
-        if isinstance(placement, _StridedShard):
-            # Strided shard: request the full offset list, not only the first.
-            _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
-                placement,
-                curr_size,
-                num_chunks,
-                rank_coord,
-                return_first_offset=False)
-            new_indices = torch.tensor(offsets, dtype=torch.long)
-        else:
-            # Plain shard: one contiguous range [offset, offset + shard_size).
-            shard_size, offset = Shard.local_shard_size_and_offset(
-                curr_size, num_chunks, rank_coord)
-            new_indices = torch.arange(offset,
-                                       offset + shard_size,
-                                       dtype=torch.long)
-        # Compose with previous indices on this dim
-        # (new_indices are positions within the previously selected indices)
-        if shard_dim in dim_indices:
-            dim_indices[shard_dim] = dim_indices[shard_dim][new_indices]
-        else:
-            dim_indices[shard_dim] = new_indices
-    # Build result tuple
-    result: list[slice | torch.Tensor] = []
-    for d in range(len(target.size())):
-        if d not in dim_indices:
-            # Unsharded dim: take everything.
-            result.append(slice(None))
-        else:
-            indices = dim_indices[d]
-            # Convert contiguous indices to slice for efficiency
-            if len(indices) > 0:
-                start = indices[0].item()
-                expected = torch.arange(start,
-                                        start + len(indices),
-                                        dtype=torch.long)
-                if torch.equal(indices, expected):
-                    result.append(slice(start, start + len(indices)))
-                else:
-                    result.append(indices)
-            else:
-                # Empty shard: an empty slice.
-                result.append(slice(0, 0))
-    return tuple(result)
-# Module-level cache used by construct_shard_mesh. Keys are the sub-mesh shape
-# followed by its flattened ranks; values are the (DeviceMesh, ProcessGroup)
-# pair for that sub-mesh, so a group is created at most once per key.
-_ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
-                                                  ProcessGroup]] = dict()
-def construct_shard_mesh(
-    placements: tuple[Placement],
-    mesh: DeviceMesh,
-) -> tuple[DeviceMesh, ProcessGroup, tuple[Placement, ...]]:
-    """Construct shard sub-mesh and ProcessGroup for all-to-all communication.
-    Given a DTensor's placements and device mesh, extracts the "shard group"
-    — the set of ranks that together hold all shards of the same replica —
-    and creates a ProcessGroup for all-to-all among them.
-    Steps:
-        1. Sort placements: Replicate first, then Shard by (dim, granularity).
-        2. Permute the mesh tensor to match the sorted order.
-        3. Collapse Replicate dims → list of shard sub-meshes (one per replica).
-        4. Create/retrieve a cached ProcessGroup for the current rank's sub-mesh.
-    Example — 8 GPUs, mesh shape (2, 2, 2),
-              placements ``[Shard(0), Replicate, _StridedShard(0)]``::
-        Step 1 — Sort: [Replicate, _StridedShard(0), Shard(0)]
-                 Permutation: [1, 2, 0]
-        Step 2 — Permute mesh dims by [1, 2, 0]:
-                 Original:                Permuted:
-                 [[[0,1],[2,3]],          [[[0,4],[1,5]],
-                  [[4,5],[6,7]]]           [[2,6],[3,7]]]
-        Step 3 — Unbind replicate dim (dim 0), giving 2 shard sub-meshes:
-                 sub-mesh 0 = [[0,4],[1,5]]  (replica group 0)
-                 sub-mesh 1 = [[2,6],[3,7]]  (replica group 1)
-                 shard_placements = (_StridedShard(0), Shard(0))
-        Step 4 — Rank 0 → ProcessGroup over ranks {0,1,4,5}
-                 Rank 2 → ProcessGroup over ranks {2,3,6,7}
-    Args:
-        placements: Placements of the DTensor, one per mesh dim. Partial
-            placements are rejected by an assertion.
-        mesh: Device mesh of the DTensor; its mesh tensor must live on CPU.
-    Returns:
-        ``(shard_mesh, process_group, shard_placements)``
-    """
-    my_rank = dist.get_rank()
-    assert mesh.mesh.device.type == 'cpu'
-    # -- Fast path: 1D all-shard mesh → reuse existing PG. ----------------
-    # This avoids calling dist.new_group() from only a subset of ranks, which
-    # would deadlock (e.g. expert DTensors on a TP submesh where ranks 0-3
-    # and 4-7 call this function separately).
-    if mesh.ndim == 1 and len(placements) == 1 and _is_shard(placements[0]):
-        key = (*mesh.mesh.shape, *mesh.mesh.flatten().tolist())
-        if key not in _ranks_to_dist_cache:
-            _ranks_to_dist_cache[key] = (mesh, mesh.get_group())
-        return (*_ranks_to_dist_cache[key], tuple(placements))
-    mesh_tensor = mesh.mesh.clone()
-    # -- Step 1: Sort placements (Replicate first, then Shard by dim). ------
-    # _StridedShard comes BEFORE regular Shard on the same dim so that
-    # get_slices_of_dtensor applies the outer sharding first, matching
-    # DTensor's left-to-right (outer-to-inner) composition order.
-    def _sort_key(item):
-        # Sort key is (tensor dim, strided-ness, original mesh-dim index);
-        # Replicate uses dim -1 so it always sorts first.
-        index, placement = item
-        assert not placement.is_partial(), "Partial placement not supported"
-        if placement.is_replicate():
-            return (-1, 0, index)
-        assert _is_shard(placement), f"Unsupported: {type(placement)}"
-        # Negative for _StridedShard, 0 for plain Shard.
-        split = (-1 / placement.split_factor if isinstance(
-            placement, _StridedShard) else 0)
-        return (placement.dim, split, index)
-    indexed = sorted(enumerate(placements), key=_sort_key)
-    perm, sorted_placements = zip(*indexed)
-    # -- Step 2: Permute mesh to match sorted placement order. --------------
-    sorted_mesh = mesh_tensor.permute(perm)
-    # -- Step 3: Collapse replicate dims → list of shard sub-meshes. --------
-    # E.g. mesh (2, 3, 4, 4) with [R, R, S(0), S(1)] → 6 sub-meshes of (4, 4)
-    num_rep = sum(1 for p in sorted_placements if p.is_replicate())
-    if num_rep > 0:
-        if num_rep > 1:
-            sorted_mesh = sorted_mesh.flatten(0, num_rep - 1)
-        shard_meshes = list(torch.unbind(sorted_mesh, dim=0))
-    else:
-        shard_meshes = [sorted_mesh]
-    shard_placements = sorted_placements[num_rep:]
-    # Two identical shard placements are not supported.
-    assert len(shard_placements) == len(set(shard_placements))
-    # -- Step 4: Create/retrieve ProcessGroup for current rank's sub-mesh. --
-    # All ranks must call dist.new_group in the same order, even though each
-    # rank only joins one group.
-    def _cache_key(t: torch.Tensor) -> tuple:
-        # Same key layout as the fast path: shape followed by flattened ranks.
-        return (*t.shape, *t.flatten().tolist())
-    my_key = None
-    for sm in shard_meshes:
-        key = _cache_key(sm)
-        if (my_rank == sm).any().item():
-            assert my_key is None, "Rank appears in multiple shard groups"
-            my_key = key
-        if key not in _ranks_to_dist_cache:
-            pg = dist.new_group(sm.flatten().tolist())
-            _ranks_to_dist_cache[key] = (
-                DeviceMesh(device_type="cuda", mesh=sm),
-                pg,
-            )
-    return (*_ranks_to_dist_cache[my_key], shard_placements)

build/torch210-cxx11-rocm70-x86_64-linux/matmul_transpose_triton.py DELETED Viewed

@@ -1,121 +0,0 @@
-# MIT License
-#
-# Copyright (c) 2025 Tianyang Lin
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-import torch
-import triton
-import triton.language as tl
-def get_autotune_config():
-    return [
-        triton.Config(
-            {
-                'BLOCK_SIZE_M': blk_m,
-                'BLOCK_SIZE_K': blk_k,
-                'GROUP_SIZE_M': grp_sz
-            },
-            num_stages=n_stages,
-            num_warps=n_warps) for blk_m in [32, 64, 128]
-        for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
-        for n_warps in [4, 8]
-    ]
-@triton.autotune(
-    configs=get_autotune_config(),
-    key=['M', 'K'],
-)
-@triton.jit
-def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
-               BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-               GROUP_SIZE_M: tl.constexpr):
-    """
-    Core kernel jit function of matmul_transpose that computes y = x @ x.T
-    The code is a simple adaptation from the triton `matmul` tutorial:
-    https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
-    Each program handles one BLOCK_SIZE_M x BLOCK_SIZE_M tile of y. Only tiles
-    on or above the block diagonal are computed; each tile strictly above the
-    diagonal is also stored transposed into its mirror position.
-    Args:
-        x: Pointer to the input, addressed as (M, K) via stride_xm/stride_xk.
-        y: Pointer to the output, addressed as (M, M) via stride_ym/stride_yn.
-        M, K: Row and column counts of x.
-        BLOCK_SIZE_M, BLOCK_SIZE_K, GROUP_SIZE_M: Autotuned tile parameters.
-    """
-    # Map the 1D program id to a (pid_m, pid_n) tile, GROUP_SIZE_M tile rows
-    # at a time. Both tile axes cover M because y is square.
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # Tiles below the diagonal are filled by the mirrored store further down.
-    if pid_m > pid_n:
-        return
-    # Row offsets wrap modulo M so loads stay in range; rows beyond M are
-    # dropped later by the store masks.
-    offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
-    offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    # we use a & b ptrs to denote different rows of x.
-    a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
-    b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
-    # Accumulate in float32 regardless of the input dtype.
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        # Mask the K tail of the last block; masked elements load as 0.0.
-        a = tl.load(a_ptrs,
-                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
-                    other=0.0)
-        b = tl.load(b_ptrs,
-                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
-                    other=0.0)
-        accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
-        a_ptrs += BLOCK_SIZE_K * stride_xk
-        b_ptrs += BLOCK_SIZE_K * stride_xk
-    # use dtype.element_ty to accommodate different input datatypes as in cpp templates
-    # https://github.com/triton-lang/triton/issues/2252
-    c = accumulator.to(x.dtype.element_ty)
-    # Unwrapped offsets for the store, masked against M.
-    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
-    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
-    tl.store(c_ptrs, c, mask=c_mask)
-    # transpose and copy
-    # (diagonal tiles are skipped here: they are their own mirror)
-    if pid_m < pid_n:
-        ct_ptrs = y + stride_ym * offs_cn[:,
-                                          None] + stride_yn * offs_cm[None, :]
-        ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
-        tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
-def matmul_transpose_assign(d_in, d_out):
-    assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
-    assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
-    assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
-    assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
-    assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
-    assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
-    assert d_in.size(0) == d_out.size(0) == d_out.size(0), \
-            "First dimension of `d_in` must match first and second dimension of `d_out`"
-    d_in = d_in.contiguous()
-    M, K = d_in.shape
-    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
-        M, META['BLOCK_SIZE_M']), )
-    with torch.cuda.device(d_in.device.index):
-        mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
-                         d_out.stride(0), d_out.stride(1))

build/torch210-cxx11-rocm70-x86_64-linux/metadata.json DELETED Viewed

@@ -1,3 +0,0 @@
-{
-  "python-depends": []
-}

build/torch210-cxx11-rocm70-x86_64-linux/muon.py DELETED Viewed

@@ -1,594 +0,0 @@
-import logging
-import types
-from collections import defaultdict
-from typing import Any
-import torch
-import torch.distributed as dist
-from torch.distributed.tensor import DTensor, Replicate, Shard
-from torch.profiler import record_function
-from .adamw import step_adamw
-from .async_utils import run_pipeline
-from .core import (_muon_state, adjust_lr_for_muon,
-                   get_default_muon_param_groups, update_g, update_p)
-from .distributed.utils import (_is_shard, construct_shard_mesh,
-                                get_slices_of_dtensor)
-from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
-                            _zeropower_via_newtonschulz5)
-from .pipeline import muon_chunk_pipeline
-from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
-logger = logging.getLogger(__name__)
-def _expand_expert_params(names, params, expert_keys):
-    """Expand expert params by splitting on dim 0 (expert dimension).
-    Params whose name matches any key in ``expert_keys`` are treated as
-    expert-parallel tensors.  Their outermost dimension is the expert
-    dimension: an ``(E, out, in)`` tensor becomes ``E`` separate 2D
-    ``nn.Parameter`` views so that in-place updates propagate back to
-    the original storage.
-    Non-expert params with ``ndim > 2`` trigger an ``AssertionError`` —
-    if they are expert params, their key must be added to ``expert_keys``.
-    The grad must already be set on each expert param (e.g. after momentum).
-    For DTensor expert params, placements that shard on dim 0 (expert dim)
-    are consumed by the split.  Non-dim-0 shard placements (e.g. TP) are
-    preserved: each 2D slice is wrapped as a DTensor on the corresponding
-    submesh so the parallel pipeline handles the TP communication.
-    """
-    expanded_names = []
-    expanded_params = []
-    for n, p in zip(names, params):
-        is_expert = expert_keys and any(key in n for key in expert_keys)
-        is_dtensor = isinstance(p.data, DTensor)
-        if not is_expert:
-            assert p.data.ndim <= 2, (
-                f"Param {n} has ndim={p.data.ndim} but does not match "
-                f"expert_keys={expert_keys}. If this is an expert param, "
-                f"add its key to expert_keys.")
-            expanded_names.append(n)
-            expanded_params.append(p)
-            continue
-        g = p.grad
-        assert g is not None, (
-            f"Expert param {n} must have grad set before expansion")
-        tp_mesh = None
-        tp_placements_2d = None
-        if is_dtensor:
-            local_data = p.to_local()
-            local_grad = g.to_local() if isinstance(g, DTensor) else g
-            # Find non-dim-0 shard placements (e.g. TP sharding).
-            # After splitting on dim 0, Shard(k) becomes Shard(k-1).
-            tp_dim_indices = []
-            tp_placements_2d = []
-            for i, pl in enumerate(p.placements):
-                if _is_shard(pl) and pl.dim != 0:
-                    tp_dim_indices.append(i)
-                    tp_placements_2d.append(Shard(pl.dim - 1))
-            if tp_dim_indices:
-                tp_dim_names = tuple(p.device_mesh.mesh_dim_names[i]
-                                     for i in tp_dim_indices)
-                if len(tp_dim_names) == 1:
-                    tp_mesh = p.device_mesh[tp_dim_names[0]]
-                else:
-                    tp_mesh = p.device_mesh[tp_dim_names]
-        else:
-            local_data = p.data
-            local_grad = g
-        # Expand: split dim 0, reshape each slice to 2D.
-        num_local_experts = local_data.shape[0]
-        for i in range(num_local_experts):
-            slice_data = local_data[i]
-            slice_grad = local_grad[i]
-            if tp_mesh is not None:
-                # Wrap as DTensor on TP submesh so the pipeline handles
-                # TP communication (gather/scatter across TP ranks).
-                dt_data = DTensor.from_local(slice_data,
-                                             device_mesh=tp_mesh,
-                                             placements=tp_placements_2d)
-                dt_grad = DTensor.from_local(slice_grad,
-                                             device_mesh=tp_mesh,
-                                             placements=tp_placements_2d)
-                expert_param = torch.nn.Parameter(dt_data, requires_grad=False)
-                expert_param.grad = dt_grad
-            else:
-                expert_param = torch.nn.Parameter(slice_data,
-                                                  requires_grad=False)
-                expert_param.grad = slice_grad
-            expanded_names.append(f"{n}[{i}]")
-            expanded_params.append(expert_param)
-        p.grad = None  # allow expert grad storage to be freed after pipeline
-    return expanded_names, expanded_params
-class Muon(torch.optim.Optimizer):
-    """
-    Muon - MomentUm Orthogonalized by Newton-schulz
-    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
-    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
-    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
-    the advantage that it can be stably run in bfloat16 on the GPU.
-    Some warnings:
-    - We believe this optimizer is unlikely to work well for training with small batch size.
-    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
-    Arguments:
-        model: The model to be optimized by Muon.
-        is_muon_func: A function that takes a parameter and its name, and returns whether the parameter should be optimized by Muon.
-        lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
-        momentum: The momentum used by the internal SGD. (0.95 is a good default)
-        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
-        ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
-        weight_decay: The weight decay for Muon and AdamW.
-            Parameters that are {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW instead.
-        adamw_lr: The learning rate for the internal AdamW.
-        adamw_betas: The betas for the internal AdamW.
-        adamw_eps: The epsilon for the internal AdamW.
-        none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
-        debug: Whether to print debug information.
-        clip_config: Configuration for QK clipping. Expected keys:
-            - "q_indices" (list[int]): Indices of query heads to consider.
-            - "k_indices" (list[int]): Indices of key heads to consider.
-            - "head_dim" (int): Dimensionality of each attention head.
-            - "threshold" (float): Threshold value; heads whose QK logits exceed
-            this value will be scaled down.
-            Default is:
-                {
-                    "q_indices": [],
-                    "k_indices": [],
-                    "head_dim": 128,
-                    "threshold": 100
-                }
-        warmup_step : How many all2all gather, compute operations are launched in advance
-                      before the corresponding all2all scatter steps begin.
-                      A higher warmup_step increases memory usage but can improve
-                      performance by overlapping communication.
-                      Parallel muon only.
-        chunk_size : Batch size of parameters to process in each
-                     all2all gather/compute/scatter step.
-                     Use shard ranks * DEFAULT_CHUNK_SIZE_RATIO when -1 is specified.
-        use_distributed_muon: Use distributed muon by Liu et al. (2024).
-                              For testing purpose only.
-        small_param_numel_threshold: Threshold for classifying parameters as small and falling back to distributed Muon
-        expert_keys: List of strings to identify expert-parallel parameters.
-                     If any key appears in a parameter's name, its outermost
-                     dimension is treated as the expert dimension and expanded
-                     into per-expert 2D params for Muon.  For example,
-                     ``expert_keys=["experts"]`` matches any param whose name
-                     contains "experts".  3D+ params not matched by any key
-                     will raise an error.
-    """
-    def __init__(self,
-                 params,
-                 lr=1e-3,
-                 momentum=0.95,
-                 nesterov=True,
-                 ns_steps=5,
-                 weight_decay=0.1,
-                 adamw_betas=(0.9, 0.95),
-                 adamw_eps=1e-8,
-                 none_grad=True,
-                 debug=False,
-                 clip_config=None,
-                 warmup_step=5,
-                 chunk_size=-1,
-                 use_distributed_muon=False,
-                 small_param_numel_threshold=65536,
-                 expert_keys=None):
-        defaults = dict(
-            lr=lr,
-            weight_decay=weight_decay,
-            momentum=momentum,
-            nesterov=nesterov,
-            ns_steps=ns_steps,
-            adamw_betas=adamw_betas,
-            adamw_eps=adamw_eps,
-            none_grad=none_grad,
-            use_muon=True,
-        )
-        error_message = "The key 'use_muon' is not set in parameter group {idx}. Assuming all parameters in the group will use muon optimization, which may lead to unexpected behavior."
-        instruction_code = "\n\n please follow this code snippet \n```optimizer = get_kernel('motif-technologies/optimizer')\n\n\nparams = optimizer.muon.get_default_muon_param_groups(model)\n\noptim = optimizer.Muon(params, ...)```"
-        if isinstance(params, types.GeneratorType):
-            raise ValueError(error_message.format(idx=0) + instruction_code)
-        for _idx, param_group in enumerate(params):
-            if param_group.get("use_muon", None) is None:
-                raise ValueError(
-                    error_message.format(idx=_idx) + instruction_code)
-        super().__init__(params, defaults)
-        self.debug = debug
-        self.clip_config = clip_config if clip_config is not None else {
-            "q_indices": [],
-            "k_indices": [],
-            "head_dim": 128,
-            "threshold": 100,
-        }
-        self.warmup_step = warmup_step
-        self.chunk_size = chunk_size
-        self.use_distributed_muon = use_distributed_muon
-        self.small_param_numel_threshold = small_param_numel_threshold
-        self.expert_keys = expert_keys
-    def _calc_flops(self, G, steps):
-        assert len(G.shape) == 2
-        M, N = G.shape
-        if M > N:
-            M, N = N, M
-        return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
-    def get_shard_mesh(self, p):
-        """
-        Get the shard mesh for a parameter p on the given rank.
-        """
-        assert isinstance(
-            p, DTensor), "Parallel Muon only supports DTensor parameters."
-        shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
-            p.placements, p.device_mesh)
-        return shard_mesh, shard_pg, shard_placements
-    def init_state_and_assign_params(self, names, params, group, qk_logits):
-        param_to_state = {}
-        param_to_flops = {}
-        total_flops = 0
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            assert g.ndim == 2, "Muon only supports 2D parameters."
-            flops = self._calc_flops(g, group["ns_steps"])
-            param_to_flops[id(p)] = flops
-            total_flops += flops
-        if self.debug:
-            logger.debug("Total TFLOPs for Muon: %.2f TFLOPs",
-                         total_flops / 1e12)
-        paired = list(zip(names, params))
-        paired_sorted = sorted(paired,
-                               key=lambda x: param_to_flops[id(x[1])],
-                               reverse=True)
-        names_sorted, params_sorted = zip(*paired_sorted)
-        ordered_names = list(names_sorted)
-        ordered_params = list(params_sorted)
-        round_robin = 0
-        mesh = ordered_params[0].device_mesh
-        placements = ordered_params[0].placements
-        shard_mesh, shard_pg, shard_placements = self.get_shard_mesh(
-            ordered_params[0])
-        shard_mesh_flattened = shard_mesh.mesh.flatten()
-        num_ranks = dist.get_world_size(group=shard_pg)
-        for n, p in zip(ordered_names, ordered_params):
-            if mesh != p.device_mesh:
-                raise ValueError("All parameters must be on the same mesh.")
-            if placements != p.placements:
-                raise ValueError("All parameters must have same placements.")
-            worker_rank = shard_mesh_flattened[round_robin].item() % num_ranks
-            round_robin = (round_robin + 1) % len(shard_mesh_flattened)
-            qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
-            # Precompute per-rank indices and numels for all-to-all.
-            rank_indices: dict[int, tuple] = {}
-            rank_numels: dict[int, int] = {}
-            for r in range(num_ranks):
-                indices = get_slices_of_dtensor(p, r, shard_mesh,
-                                                shard_placements)
-                rank_indices[r] = indices
-                numel = 1
-                for idx, dim_size in zip(indices, p.shape):
-                    if isinstance(idx, slice):
-                        start, stop, step = idx.indices(dim_size)
-                        numel *= max(0, (stop - start + (step - 1)) // step)
-                    else:
-                        numel *= len(idx)
-                rank_numels[r] = numel
-            param_to_state[id(p)] = _muon_state(
-                worker_rank=worker_rank,
-                process_group=shard_pg,
-                rank_indices=rank_indices,
-                rank_numels=rank_numels,
-                name=n,
-                qk_clip_state=qk_clip_state,
-            )
-        return param_to_state, ordered_params
-    def base(self, names, params, group, lr, weight_decay, qk_logits):
-        # Momentum is already applied by _step_muon before this method.
-        for n, p in zip(names, params):
-            g = p.grad
-            if g is None:
-                continue
-            u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
-                                             steps=group["ns_steps"])
-            adjusted_lr = adjust_lr_for_muon(lr, p.shape)
-            update_p(p, u, lr, adjusted_lr, weight_decay)
-            qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
-            scales_full = compute_scales(
-                p, qk_clip_state) if qk_clip_state is not None else None
-            if scales_full is not None:
-                qk_clip(p, scales_full, qk_clip_state.head_dim)
-    def distributed_muon(
-        self,
-        names: list[str],
-        params: list[torch.nn.Parameter],
-        group: dict[str, Any],
-        lr: float,
-        weight_decay: float,
-        qk_logits: list[torch.Tensor | DTensor] | None,
-    ):
-        """ Implementation of Distributed Muon by Liu et al. """
-        # Momentum is already applied by _step_muon before this method.
-        for n, p in zip(names, params):
-            g = p.grad
-            if g is None:
-                continue
-            # Gather G
-            if isinstance(p.data, DTensor):
-                g_full = g.full_tensor()
-                p_full = p.data.full_tensor()
-            else:
-                g_full = g
-                p_full = p
-            u_full = _zeropower_via_newtonschulz5(g_full.to(COMM_DTYPE),
-                                                  steps=group["ns_steps"])
-            adjusted_lr = adjust_lr_for_muon(lr, p_full.shape)
-            update_p(p_full, u_full, lr, adjusted_lr, weight_decay)
-            qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
-            scales_full = compute_scales(
-                p_full, qk_clip_state) if qk_clip_state is not None else None
-            if scales_full is not None:
-                qk_clip(p_full, scales_full, qk_clip_state.head_dim)
-            if isinstance(p.data, DTensor):
-                ndims = len(p.device_mesh.mesh.shape)
-                p_replicate = DTensor.from_local(
-                    p_full,
-                    device_mesh=p.device_mesh,
-                    placements=[Replicate() for _ in range(ndims)],
-                )
-                p_sharded = p_replicate.redistribute(
-                    device_mesh=p.device_mesh,
-                    placements=p.placements,
-                )
-                p.copy_(p_sharded)
-    def parallel(self, names, params, group, lr, weight_decay, qk_logits):
-        """
-        Perform a parallel optimization step using Muon.
-        Parameters are chunked and each chunk is processed by a
-        :func:`muon_chunk_pipeline` generator.  :func:`run_pipeline`
-        interleaves multiple chunks so that communication and computation
-        overlap across chunks (the same overlap previously achieved by the
-        warmup + main-loop index scheduling).
-        """
-        # Momentum is already applied by _step_muon before this method.
-        param_to_state, ordered_params = self.init_state_and_assign_params(
-            names, params, group, qk_logits)
-        # Compute local rank for this group's shard process group.
-        shard_pg = param_to_state[id(ordered_params[0])].process_group
-        rank = dist.get_rank(group=shard_pg)
-        if self.chunk_size == -1:
-            shard_ranks = dist.get_world_size(param_to_state[id(
-                ordered_params[0])].process_group)
-            chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
-        elif self.chunk_size > 0:
-            chunk_size = self.chunk_size
-        else:
-            raise ValueError("chunk_size must be -1 or a positive integer.")
-        def pipelines():
-            for start in range(0, len(ordered_params), chunk_size):
-                chunk = ordered_params[start:start + chunk_size]
-                if chunk:
-                    yield muon_chunk_pipeline(
-                        params=chunk,
-                        param_to_state=param_to_state,
-                        rank=rank,
-                        ns_steps=group["ns_steps"],
-                        lr=lr,
-                        weight_decay=weight_decay,
-                        none_grad=group["none_grad"],
-                    )
-        with record_function("muon::barrier"):
-            dist.barrier()
-        with record_function("muon::pipeline"):
-            run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
-    def _step_muon(self, group, qk_logits=None):
-        params = group["params"]
-        lr = group["lr"]
-        weight_decay = group["weight_decay"]
-        momentum = group["momentum"]
-        names = group["names"]
-        # Apply momentum to all params before routing/expansion.
-        with record_function("muon::momentum"):
-            for n, p in zip(names, params):
-                g = p.grad
-                if g is None:
-                    continue
-                g = update_g(self.state, p, g, group, momentum)
-                p.grad = g
-        # Expand expert params by splitting on dim 0.
-        names, params = _expand_expert_params(names, params, self.expert_keys)
-        param_dtensors = []
-        name_dtensors = []
-        param_tensors = []
-        name_tensors = []
-        param_dtensors_small = []
-        name_dtensors_small = []
-        if self.use_distributed_muon:
-            self.distributed_muon(names=names,
-                                  params=params,
-                                  group=group,
-                                  lr=lr,
-                                  weight_decay=weight_decay,
-                                  qk_logits=qk_logits)
-            return
-        # For simplicity, we use distributed Muon for small parameters
-        # whose number of elements is below a threshold.
-        for n, p in zip(names, params):
-            if p is None or p.grad is None:
-                continue
-            if isinstance(p.data, DTensor):
-                if all(
-                        isinstance(placement, Replicate)
-                        for placement in p.placements):
-                    param_tensors.append(p)
-                    name_tensors.append(n)
-                elif p.data.numel() <= self.small_param_numel_threshold:
-                    param_dtensors_small.append(p)
-                    name_dtensors_small.append(n)
-                else:
-                    param_dtensors.append(p)
-                    name_dtensors.append(n)
-            elif isinstance(p.data, torch.Tensor):
-                param_tensors.append(p)
-                name_tensors.append(n)
-            else:
-                raise TypeError(f"Unsupported parameter type: {type(p.data)}")
-        logger.debug(
-            f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors, "
-            f"{len(param_dtensors_small)} Small DTensors")
-        def group_dtensors(dtensors, names):
-            # To support different placements, we group parameters by placements
-            # and run parallel Muon on each group.
-            placement_to_params = defaultdict(lambda: ([], []))
-            assert len(dtensors) == len(names)
-            for p, n in zip(dtensors, names):
-                placement_to_params[tuple([p.placements,
-                                           p.device_mesh])][0].append(n)
-                placement_to_params[tuple([p.placements,
-                                           p.device_mesh])][1].append(p)
-            return placement_to_params
-        if len(param_dtensors_small) > 0:
-            if not dist.is_initialized():
-                raise RuntimeError(
-                    "Parallel Muon requires torch.distributed to be initialized."
-                )
-            self.distributed_muon(
-                params=param_dtensors_small,
-                names=name_dtensors_small,
-                group=group,
-                lr=lr,
-                weight_decay=weight_decay,
-                qk_logits=qk_logits,
-            )
-        if len(param_dtensors) > 0:
-            if not dist.is_initialized():
-                raise RuntimeError(
-                    "Parallel Muon requires torch.distributed to be initialized."
-                )
-            dtensor_group = group_dtensors(param_dtensors, name_dtensors)
-            for _, (names, params) in dtensor_group.items():
-                self.parallel(
-                    names,
-                    params,
-                    group,
-                    lr=lr,
-                    weight_decay=weight_decay,
-                    qk_logits=qk_logits,
-                )
-        if len(param_tensors) > 0:
-            self.base(
-                name_tensors,
-                param_tensors,
-                group,
-                lr=lr,
-                weight_decay=weight_decay,
-                qk_logits=qk_logits,
-            )
-    @torch.no_grad
-    def step(self, closure=None, qk_logits=None):
-        """Perform a single optimization step.
-        Args:
-            closure (Callable, optional): A closure that reevaluates the model
-                and returns the loss.
-            qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
-                to 1D tensors of shape (num_heads,), representing the maximum
-                QK logits across all tokens, computed as
-                (1 / sqrt(head_dim)) * (Q @ K^T).
-        """
-        loss = None
-        if closure is not None:
-            with torch.enable_grad():
-                loss = closure()
-        for group in self.param_groups:
-            if group["use_muon"]:
-                self._step_muon(group, qk_logits=qk_logits)
-            else:
-                step_adamw(self.state, group)
-        return loss

build/torch210-cxx11-rocm70-x86_64-linux/newton_schulz.py DELETED Viewed

@@ -1,50 +0,0 @@
-import torch
-from .matmul_transpose_triton import matmul_transpose_assign
-# Dtype the Newton-Schulz routine requires for its input; callers cast
-# gradients with ``g.to(COMM_DTYPE)`` before invoking it.
-COMM_DTYPE = torch.bfloat16
-# Multiplier applied to the shard group's world size to derive the default
-# chunk length when the optimizer's chunk_size is -1.
-DEFAULT_CHUNK_SIZE_RATIO = 4
-# This code is a modified version adapted from the following GitHub repository:
-# https://github.com/KellerJordan/Muon/blob/master/muon.py
-# Muon's Newton–Schulz iteration causes high variance in singular values.
-# Idea: give each iteration its own three coefficients and optimize them via gradient descent.
-@torch.no_grad()
-# matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
-def _zeropower_via_newtonschulz5(G, steps):
-    """
-    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
-    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
-    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
-    zero even beyond the point where the iteration no longer converges all the way to one everywhere
-    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
-    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
-    performance at all relative to UV^T, where USV^T = G is the SVD.
-    """
-    assert len(G.shape) == 2
-    assert G.dtype == COMM_DTYPE
-    X = G  # no manual typecast
-    if G.size(0) > G.size(1):
-        X = X.T
-    # Ensure spectral norm is at most 1
-    X = X / (X.norm() + 1e-7)
-    buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
-    buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
-    # Perform the NS iterations
-    for a, b, c in [
-        (4.0848, -6.8946, 2.9270),
-        (3.9505, -6.3029, 2.6377),
-        (3.7418, -5.5913, 2.3037),
-        (2.8769, -3.1427, 1.2046),
-        (2.8366, -3.0525, 1.2012),
-    ]:
-        matmul_transpose_assign(X, buf1)
-        matmul_transpose_assign(buf1, buf2)
-        buf1.mul_(b).add_(buf2, alpha=c)
-        X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
-    if G.size(0) > G.size(1):
-        X = X.T
-    return X