{"id":24367,"date":"2025-11-19T19:44:46","date_gmt":"2025-11-19T19:44:46","guid":{"rendered":"https:\/\/pokecon.jp\/job\/?p=24367"},"modified":"2025-11-19T19:44:46","modified_gmt":"2025-11-19T19:44:46","slug":"nvidia-nemo%e3%82%92%e5%88%a9%e7%94%a8%e3%81%97%e3%81%9fgpt-oss%e3%81%ae%e5%ad%a6%e7%bf%92","status":"publish","type":"post","link":"https:\/\/pokecon.jp\/job\/24367\/","title":{"rendered":"NVIDIA NeMo\u3092\u5229\u7528\u3057\u305fGPT-OSS\u306e\u5b66\u7fd2"},"content":{"rendered":"\n<\/p>\n<div>\n<h2 id=\"%E3%81%AF%E3%81%98%E3%82%81%E3%81%AB\" data-line=\"0\" class=\"code-line\">\n \u306f\u3058\u3081\u306b<\/h2>\n<p data-line=\"2\" class=\"code-line\"><strong><a target=\"_blank\" href=\"https:\/\/tur.ing\/\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">Turing<\/a> CTO\u5ba4<\/strong>\u306b\u6240\u5c5e\u3057\u3066\u3044\u308b\u6771\u4eac\u79d1\u5b66\u5927\u5b66(Institute of Science Tokyo)\u306e<a target=\"_blank\" href=\"https:\/\/x.com\/okoge_kaz\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">\u85e4\u4e95<\/a>\u3067\u3059\u3002<br \/>\u672c\u8a18\u4e8b\u3067\u306f\u3001OpenAI\u304b\u30892025\u5e748\u6708\u306b\u30ea\u30ea\u30fc\u30b9\u3055\u308c\u305f<strong>gpt-oss<\/strong>\u3092<strong>NVIDIA<\/strong> <a target=\"_blank\" href=\"https:\/\/github.com\/NVIDIA-NeMo\/NeMo\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">NeMo<\/a>\u30d5\u30ec\u30fc\u30e0\u30ef\u30fc\u30af\u306b\u3066\u5b66\u7fd2\u3059\u308b\u305f\u3081\u306e\u65b9\u6cd5\u306b\u3064\u3044\u3066\u89e3\u8aac\u3057\u307e\u3059\u3002<\/p>\n<p data-line=\"5\" class=\"code-line\">2025\u5e7411\u67084\u65e5\u6642\u70b9\u3067\u306f\u3001<a target=\"_blank\" href=\"https:\/\/docs.nvidia.com\/nemo-framework\/user-guide\/latest\/llms\/gpt_oss.html\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">NVIDIA\u516c\u5f0f<\/a>\u304b\u3089\u306f\u3001LoRA 
finetuning\u3092\u884c\u3046\u65b9\u6cd5\u306b\u3064\u3044\u3066\u306e\u307f\u89e3\u8aac\u3055\u308c\u3066\u304a\u308a\u3001Long Context\u7d99\u7d9a\u4e8b\u524d\u5b66\u7fd2(Continual Pre-Training)\u306a\u3069\u672c\u683c\u7684\u306a\u5b66\u7fd2\u3092\u884c\u3046\u306b\u306f<strong>\u30cf\u30fc\u30c9\u30eb<\/strong>\u304c\u591a\u6570\u3042\u308a\u307e\u3059\u3002<br \/>\u672c\u8a18\u4e8b\u3067\u306f\u3001\u5b66\u7fd2\u3092\u884c\u3046\u305f\u3081\u306b\u89e3\u6c7a\u3059\u308b\u5fc5\u8981\u304c\u3042\u308b\u3059\u3079\u3066\u306e\u554f\u984c\u306b\u95a2\u3057\u3066\u3001<strong>\u8a73\u7d30\u306a\u89e3\u6c7a\u65b9\u6cd5<\/strong>\u3092\u8a18\u3057\u307e\u3057\u305f\u3002gpt-oss\u3092\u5229\u7528\u3057\u305f\u30e2\u30c7\u30eb\u5b66\u7fd2\u306b\u304a\u5f79\u7acb\u3066\u304f\u3060\u3055\u3044\u3002<\/p>\n<h2 id=\"gpt-oss\" data-line=\"8\" class=\"code-line\">\n gpt-oss<\/h2>\n<h3 id=\"about\" data-line=\"10\" class=\"code-line\">\n About<\/h3>\n<p data-line=\"12\" class=\"code-line\">gpt-oss\u3068\u306f\u3001OpenAI\u3088\u308a\u30ea\u30ea\u30fc\u30b9\u3055\u308c\u305fLLM\u3067\u3042\u308a\u3001<a target=\"_blank\" href=\"https:\/\/huggingface.co\/openai\/gpt-oss-20b\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">gpt-oss-20b<\/a>\u3068<a target=\"_blank\" href=\"https:\/\/huggingface.co\/openai\/gpt-oss-120b\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">gpt-oss-120b<\/a>\u306e2\u3064\u306e\u30e2\u30c7\u30eb\u30b5\u30a4\u30ba\u304c\u3042\u308a\u307e\u3059\u3002\u3044\u305a\u308c\u306e\u30e2\u30c7\u30eb\u3082\u4ee5\u4e0b\u306e\u3088\u3046\u306b\u9ad8\u3044\u8a00\u8a9e\u51e6\u7406\u80fd\u529b\u3092\u82f1\u8a9e\u3067\u306f\u793a\u3057\u3066\u3044\u307e\u3059\u3002<\/p>\n<p data-line=\"14\" class=\"code-line\"><img decoding=\"async\" src=\"https:\/\/storage.googleapis.com\/zenn-user-upload\/e929fc4f1b11-20251104.png\" alt=\"\" class=\"md-img\" loading=\"lazy\"\/><br \/><a target=\"_blank\" href=\"https:\/\/artificialanalysis.ai\/models\/open-source\" 
target=\"_blank\" rel=\"nofollow noopener noreferrer\">Artificial Analysis<\/a>\u3088\u308a<\/p>\n<p data-line=\"17\" class=\"code-line\">\u3057\u304b\u3057\u306a\u304c\u3089\u3001\u65e5\u672c\u306b\u95a2\u3059\u308b\u77e5\u8b58\u3084\u65e5\u672c\u8a9e\u80fd\u529b\u306b\u3064\u3044\u3066\u306f\u9650\u5b9a\u3055\u308c\u3066\u304a\u308a\u3001\u6539\u5584\u306e\u4f59\u5730\u304c\u5b58\u5728\u3057\u307e\u3059\u3002<\/p>\n<aside class=\"msg message\"><span class=\"msg-symbol\">!<\/span><\/p>\n<div class=\"msg-content\">\n<p data-line=\"21\" class=\"code-line\">\u65e5\u672c\u8a9e\u80fd\u529b\u3001\u65e5\u672c\u8a9e\u77e5\u8b58\u306b\u6539\u5584\u4f59\u5730\u304c\u3042\u308b\u3068\u3044\u3046\u3053\u3068\u306f\u3001\u7d99\u7d9a\u4e8b\u524d\u5b66\u7fd2(continual pre-training)\u306a\u3069\u306e\u8ffd\u52a0\u5b66\u7fd2\u304c\u6709\u52b9\u3068\u3044\u3046\u3053\u3068\u3067\u3057\u3087\u3046\u304b\uff1f<\/p>\n<p data-line=\"23\" class=\"code-line\">\u306f\u3044\u3001\u305d\u306e\u3068\u304a\u308a\u3067\u3059\u3002\u3057\u304b\u3057\u306a\u304c\u3089\u3001gpt-oss\u306e\u82f1\u8a9e\u80fd\u529b\u3001\u6570\u5b66\u80fd\u529b\u3001\u30b3\u30fc\u30c9\u80fd\u529b\u3001\u6df1\u3044\u63a8\u8ad6\u3092\u4f34\u3046Reasoning(\u63a8\u8ad6)\u80fd\u529b\u306a\u3069\u3092\u640d\u306a\u308f\u305a\u306b\u65e5\u672c\u8a9e\u80fd\u529b\u3001\u65e5\u672c\u8a9e\u77e5\u8b58\u3092\u5f37\u5316\u3059\u308b\u306e\u306f<strong>\u5bb9\u6613\u3067\u306f\u3042\u308a\u307e\u305b\u3093(=\u56f0\u96e3\u3067\u3059)<\/strong>\u3002<br \/>\u65e5\u672c\u8a9e\u30c7\u30fc\u30bf\u3067\u5358\u306b\u7d99\u7d9a\u4e8b\u524d\u5b66\u7fd2\u3057\u305f\u308a\u3001SFT\u3092\u884c\u3046\u3060\u3051\u3067\u306f\u3001gpt-oss\u304c\u6709\u3057\u3066\u3044\u308b\u9ad8\u3044\u8af8\u80fd\u529b\u3092\u5927\u304d\u304f\u640d\u306a\u3046\u53ef\u80fd\u6027\u304c\u9ad8\u3044\u305f\u3081\u3001\u5de5\u592b\u304c\u5fc5\u8981\u3067\u3059\u3002<\/p>\n<\/div>\n<\/aside>\n<h3 
id=\"%E3%83%A2%E3%83%87%E3%83%AB%E3%82%A2%E3%83%BC%E3%82%AD%E3%83%86%E3%82%AF%E3%83%81%E3%83%A3\" data-line=\"28\" class=\"code-line\">\n \u30e2\u30c7\u30eb\u30a2\u30fc\u30ad\u30c6\u30af\u30c1\u30e3<\/h3>\n<p data-line=\"30\" class=\"code-line\">gpt-oss\u306e\u30e2\u30c7\u30eb\u30a2\u30fc\u30ad\u30c6\u30af\u30c1\u30e3\u306b\u306f\u7279\u7b46\u3059\u308b\u3079\u304d\u70b9\u304c\u3044\u304f\u3064\u304b\u3042\u308a\u307e\u3059\u3002<br \/>\u6628\u4eca\u306e\u30aa\u30fc\u30d7\u30f3LLM\u3067\u63a1\u7528\u3055\u308c\u3066\u3044\u308b\u30a2\u30fc\u30ad\u30c6\u30af\u30c1\u30e3\u3068\u306f\u7570\u306a\u308b\u70b9\u304c\u591a\u304f\u3001\u305d\u308c\u306b\u3088\u308a\u4ee5\u4e0b\u3067\u8ff0\u3079\u308b\u3088\u3046\u306b\u5b66\u7fd2\u3092\u884c\u3046\u4e0a\u3067\u306e\u30cf\u30fc\u30c9\u30eb\u304c\u4e0a\u304c\u3063\u3066\u3044\u307e\u3059\u3002<\/p>\n<ol data-line=\"33\" class=\"code-line\">\n<li data-line=\"33\" class=\"code-line\">\n<strong>bias\u9805\u306e\u5b58\u5728<\/strong>: Llama-2\u4ee5\u964d\u3001\u591a\u304f\u306eOpenLLM\u3067\u306fMLP, Attention\u3068\u3082\u306bbias\u9805\u304c\u306a\u3044\u306e\u304c\u4e00\u822c\u7684\u3067\u3057\u305f\u3002\u3057\u304b\u3057\u3001gpt-oss\u3067\u306f\u3001GPT-2\u306e\u6642\u4ee3\u3068\u540c\u69d8\u306bbias\u9805\u304c\u5b58\u5728\u3057\u3066\u3044\u307e\u3059\u3002<\/li>\n<li data-line=\"34\" class=\"code-line\">\n<strong>QK Norm\u306e\u6b20\u5982<\/strong>: Qwen3\u306b\u3082\u5c0e\u5165\u3055\u308c\u3066\u3044\u308b\u3088\u3046\u306b\u6628\u4eca\u306eLLM\u3067\u306f\u5b66\u7fd2\u5b89\u5b9a\u5316\u306e\u305f\u3081\u306bQK Norm\u3092\u5165\u308c\u308b\u3053\u3068\u304c\u5897\u3048\u3066\u3044\u307e\u3059\u304c\u3001gpt-oss\u3067\u306f\u5c0e\u5165\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002<\/li>\n<li data-line=\"35\" class=\"code-line\">\n<strong>self-attention sink(learnable softmax)\u306e\u5b58\u5728<\/strong>: 
\u5c0e\u5165\u80cc\u666f\u306a\u3069\u306b\u3064\u3044\u3066\u306e\u89e3\u8aac\u306f\u63a7\u3048\u307e\u3059\u304c\u3001softmax\u306e\u5206\u6bcd\u306b\u5b66\u7fd2\u53ef\u80fd\u306a\u30d0\u30a4\u30a2\u30b9\u9805\u304c\u5c0e\u5165\u3055\u308c\u3066\u3044\u307e\u3059\u3002<\/li>\n<\/ol>\n<p data-line=\"37\" class=\"code-line\">\u4e0a\u8a18\u306e\u3088\u3046\u306a\u30a2\u30fc\u30ad\u30c6\u30af\u30c1\u30e3\u306e\u5909\u66f4\u304c\u30e2\u30c7\u30eb\u6027\u80fd\u306b\u53ca\u307c\u3057\u3066\u3044\u308b\u5f71\u97ff\u306f\u5927\u304d\u304f\u306a\u3044\u3068\u63a8\u6e2c\u3055\u308c\u307e\u3059\u304c\u3001\u5b66\u7fd2\u3092\u884c\u3046\u4e0a\u3067\u306f\u3001\u3068\u304f\u306b3\u756a\u76ee\u306e\u70b9\u304c\u5f0a\u5bb3\u3068\u306a\u308a\u307e\u3059\u3002<\/p>\n<aside class=\"msg message\"><span class=\"msg-symbol\">!<\/span><\/p>\n<div class=\"msg-content\">\n<p data-line=\"41\" class=\"code-line\">attention sink \u306b\u3064\u3044\u3066\u306f\u3069\u3053\u3067\u78ba\u8a8d\u3067\u304d\u307e\u3059\u304b\uff1f<\/p>\n<blockquote data-line=\"43\" class=\"code-line\">\n<p data-line=\"43\" class=\"code-line\">Each attention head has a learned bias in the denominator of the softmax, similar to off-by-one attention and attention sinks<\/p>\n<\/blockquote>\n<p data-line=\"45\" class=\"code-line\">\u306e\u3088\u3046\u306a\u8a18\u8ff0\u304c<a target=\"_blank\" href=\"https:\/\/arxiv.org\/pdf\/2508.10925\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">gpt-oss-120b &amp; gpt-oss-20b Model Card<\/a>\u3067\u78ba\u8a8d\u3067\u304d\u307e\u3059\u3002<\/p>\n<p data-line=\"47\" class=\"code-line\">\u307e\u305f\u3001huggingface\u306e<a target=\"_blank\" href=\"https:\/\/huggingface.co\/openai\/gpt-oss-20b\/tree\/main?show_file_info=model.safetensors.index.json\" target=\"_blank\" rel=\"nofollow noopener 
noreferrer\">\u30e2\u30c7\u30eb\u69cb\u9020<\/a>\u3067\u78ba\u8a8d\u3059\u308b\u3068\u4ee5\u4e0b\u306e\u3088\u3046\u306b<code>self_attn.sinks<\/code>\u304c\u78ba\u8a8d\u3067\u304d\u307e\u3059\u3002<br \/><img decoding=\"async\" src=\"https:\/\/storage.googleapis.com\/zenn-user-upload\/43431baacaf4-20251104.png\" alt=\"\" class=\"md-img\" loading=\"lazy\"\/><\/p>\n<p data-line=\"50\" class=\"code-line\">\u307e\u305f\u3001Goro Kobayashi\u3055\u3093\u306e<a target=\"_blank\" href=\"https:\/\/x.com\/goro_koba\/status\/1954480023890780587?s=20\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">Tweet<\/a>\u3082\u53c2\u8003\u306b\u306a\u308b\u3068\u601d\u3044\u307e\u3059\u3002<br \/><img decoding=\"async\" src=\"https:\/\/storage.googleapis.com\/zenn-user-upload\/2f6d51dcc228-20251112.png\" alt=\"\" class=\"md-img\" loading=\"lazy\"\/><br \/>(Tweet\u306b\u6dfb\u4ed8\u3055\u308c\u3066\u3044\u308b\u753b\u50cf\u3088\u308a)<br \/><a target=\"_blank\" href=\"https:\/\/x.com\/goro_koba\/status\/1954480023890780587?s=20\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">https:\/\/x.com\/goro_koba\/status\/1954480023890780587?s=20<\/a><\/p>\n<\/div>\n<\/aside>\n<aside class=\"msg message\"><span class=\"msg-symbol\">!<\/span><\/p>\n<div class=\"msg-content\">\n<p data-line=\"58\" class=\"code-line\">\u306a\u305c\u3001\u5f0a\u5bb3\u306b\u306a\u308b\u306e\u3067\u3057\u3087\u3046\u304b\uff1f<\/p>\n<p data-line=\"60\" class=\"code-line\">\u73fe\u4ee3\u306eLLM\u5b66\u7fd2\u306f\u3001PyTorch\u3067\u7c21\u5358\u306b\u8a18\u8ff0\u3067\u304d\u308b\u5f62\u5f0f\u306e\u5b9f\u88c5\u3060\u3051\u3067\u52d5\u3044\u3066\u3044\u308b\u8a33\u3067\u306f\u7121\u3044\u305f\u3081\u3067\u3059\u3002<br \/>Attention\u306e\u52b9\u7387\u7684\u306a\u8a08\u7b97\u306e\u305f\u3081\u306b<strong>FlashAttention<\/strong>\u3084<a target=\"_blank\" href=\"https:\/\/github.com\/NVIDIA\/TransformerEngine\" target=\"_blank\" rel=\"nofollow noopener 
noreferrer\">TransformerEngine<\/a>\u306e<strong>FusedAttention<\/strong>\u3092\u5229\u7528\u3057\u305f\u308a\u3001Long Context\u5b66\u7fd2\u3092\u52b9\u7387\u7684\u306b\u884c\u3046\u305f\u3081\u306bContext Parallelism support\u3092\u60f3\u5b9a\u3057\u305fTransformerEngine\u306ecustom GEMM\u5b9f\u88c5\u3092\u5229\u7528\u3057\u305f\u308a\u3068\u3001\u591a\u304f\u306e\u30e9\u30a4\u30d6\u30e9\u30ea\u306b\u4f9d\u5b58\u3057\u3066\u3044\u307e\u3059\u3002<br \/>\u307e\u305f\u3001\u305d\u308c\u3089\u306e\u30e9\u30a4\u30d6\u30e9\u30ea\u306fC++, CUDA C++\u3067\u8a18\u8ff0\u3055\u308c\u3066\u304a\u308a\u3001backend\u5b9f\u88c5\u306e\u5927\u90e8\u5206\u306f\u975ePython\u3067\u69cb\u6210\u3055\u308c\u3066\u3044\u307e\u3059\u3002<\/p>\n<p data-line=\"64\" class=\"code-line\">\u305d\u306e\u305f\u3081\u3001\u5c0f\u898f\u6a21\u306a\u5b9f\u9a13\u3067\u306fPyTorch\u3067\u6570\u884c\u66f8\u304f\u3060\u3051\u306e\u5909\u66f4\u304c\u3001\u5927\u898f\u6a21\u5b66\u7fd2\u306b\u304a\u3044\u3066\u5b66\u7fd2\u901f\u5ea6\u3092\u78ba\u4fdd\u3059\u308b\u305f\u3081\u306b\u306f\u3001\u8907\u6570\u306e\u4f9d\u5b58\u30e9\u30a4\u30d6\u30e9\u30ea\u306b\u307e\u305f\u304c\u308b\u4fee\u6b63\u306e\u9023\u9396\u3092\u5f15\u304d\u8d77\u3053\u3057\u307e\u3059\u3002(\u4fee\u6b63\u5f8c\u3082\u30d1\u30d5\u30a9\u30fc\u30de\u30f3\u30b9\u30c1\u30e5\u30fc\u30cb\u30f3\u30b0\u306a\u3069\u306b\u6642\u9593\u3092\u8981\u3059\u308b\u5834\u5408\u304c\u591a\u304f\u3001PyTorch\u30ec\u30d9\u30eb\u3067\u5b9f\u88c5\u3092\u5909\u66f4\u3059\u308b\u306e\u3068\u306f\u8a33\u304c\u9055\u3046\u30b3\u30b9\u30c8\u304c\u304b\u304b\u308a\u307e\u3059\u3002)<\/p>\n<\/div>\n<\/aside>\n<aside class=\"msg message\"><span class=\"msg-symbol\">!<\/span><\/p>\n<div class=\"msg-content\">\n<p data-line=\"70\" class=\"code-line\">bias\u306b\u3064\u3044\u3066\u7c21\u5358\u306b\u78ba\u8a8d\u3059\u308b\u306b\u306f\u3069\u3046\u3059\u308c\u3070\u3088\u3044\u3067\u3057\u3087\u3046\u304b\uff1f<\/p>\n<p data-line=\"72\" 
class=\"code-line\">\u4ee5\u4e0b\u306b\u3001Llama-3.1-8B\u3068gpt-oss-20b\u306e<code>model.safetensors.index.json<\/code>\u3092\u793a\u3057\u307e\u3059\u3002<br \/><code>model.layers.0.mlp<\/code>\u3068<code>model.layers.0.mlp.experts<\/code>\u3092\u6bd4\u8f03\u3059\u308b\u3068MLP\u5c64\u306b\u304a\u3044\u3066\u3001<code>model.layers.0.mlp.experts.down_proj_bias<\/code>, <code>model.layers.0.mlp.experts.gate_up_proj_bias<\/code>\u304cgpt-oss\u5074\u306b\u5b58\u5728\u3059\u308b\u3053\u3068\u3092\u78ba\u8a8d\u3067\u304d\u308b\u304b\u3068\u601d\u3044\u307e\u3059\u3002<\/p>\n<p data-line=\"75\" class=\"code-line\"><img decoding=\"async\" src=\"https:\/\/storage.googleapis.com\/zenn-user-upload\/fb7c6d4354cf-20251119.png\" alt=\"\" class=\"md-img\" loading=\"lazy\"\/><br \/>(<a target=\"_blank\" href=\"https:\/\/huggingface.co\/meta-llama\/Llama-3.1-8B\/tree\/main?show_file_info=model.safetensors.index.json\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">Llama-3.1-8B<\/a>\u306emodel.safetensors.index.json)<\/p>\n<p data-line=\"78\" class=\"code-line\"><img decoding=\"async\" src=\"https:\/\/storage.googleapis.com\/zenn-user-upload\/04938c495895-20251119.png\" alt=\"\" class=\"md-img\" loading=\"lazy\"\/><br \/>(<a target=\"_blank\" href=\"https:\/\/huggingface.co\/openai\/gpt-oss-20b\/tree\/main?show_file_info=model.safetensors.index.json\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">gpt-oss-20b<\/a>\u306emodel.safetensors.index.json)<\/p>\n<\/div>\n<\/aside>\n<h2 id=\"ngc\" data-line=\"83\" class=\"code-line\">\n NGC<\/h2>\n<p data-line=\"85\" class=\"code-line\">gpt-oss\u3092\u5b66\u7fd2\u3059\u308b\u305f\u3081\u306e\u65b9\u6cd5\u3092\u8abf\u3079\u308b\u3068<a target=\"_blank\" href=\"https:\/\/docs.nvidia.com\/nemo-framework\/user-guide\/latest\/llms\/gpt_oss.html\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">NVIDIA NeMo Framework User Guide<\/a>\u304c\u76ee\u306b\u4ed8\u304f\u3067\u3057\u3087\u3046\u3002<br 
\/>\u305d\u3053\u3067\u306f\u3001NVIDIA\u306e NeMo Framework\u7528\u306e\u30b3\u30f3\u30c6\u30ca\u304c\u7d39\u4ecb\u3055\u308c\u3066\u304a\u308a\u3001<a target=\"_blank\" href=\"https:\/\/catalog.ngc.nvidia.com\/orgs\/nvidia\/containers\/nemo?version=25.07.gpt_oss\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">25.07.gpt_oss<\/a>\u3068\u3044\u3046\u30b3\u30f3\u30c6\u30ca\u3092\u5229\u7528\u3059\u308c\u3070\u975e\u5e38\u306b\u7c21\u5358\u306b\u5b66\u7fd2\u53ef\u80fd\u3067\u3042\u308b\u304b\u306e\u3088\u3046\u306b\u66f8\u304b\u308c\u3066\u3044\u307e\u3059\u3002(\u5c0f\u898f\u6a21\u306afinetuning\u3067\u3042\u308c\u3070\u305d\u306e\u901a\u308a\u3067\u3059)<\/p>\n<p data-line=\"88\" class=\"code-line\">\u3057\u304b\u3057\u3001Long Context\u5b66\u7fd2\u3084\u3001Continual Pre-Training(\u7d99\u7d9a\u4e8b\u524d\u5b66\u7fd2)\u3092\u884c\u3046\u3068\u306a\u308b\u3068\u305d\u3046\u3082\u3044\u304d\u307e\u305b\u3093\u3002\u672c\u7bc0\u3067\u306f\u3001NGC\u3092\u5229\u7528\u3057\u3066\u5b66\u7fd2\u74b0\u5883\u3092\u6574\u3048\u308b\u69d8\u5b50\u306b\u3064\u3044\u3066\u89e3\u8aac\u3092\u884c\u3044\u307e\u3059\u3002<\/p>\n<h3 id=\"%E5%AE%9F%E8%A3%85%E3%81%AE%E6%91%98%E5%87%BA\" data-line=\"90\" class=\"code-line\">\n \u5b9f\u88c5\u306e\u6458\u51fa<\/h3>\n<p data-line=\"92\" class=\"code-line\">\u4ee5\u4e0b\u3067\u306f\u30b9\u30d1\u30b3\u30f3(\u30b9\u30fc\u30d1\u30fc\u30b3\u30f3\u30d4\u30e5\u30fc\u30bf\u30fc)\u3067\u306e\u4f5c\u696d\u3092\u60f3\u5b9a\u3057\u3066\u3001singularity\u3092\u5229\u7528\u3057\u3066\u4f5c\u696d\u3092\u884c\u3044\u307e\u3059\u3002\u9069\u5b9c\u3001\u30b3\u30de\u30f3\u30c9\u3092\u304a\u4f7f\u3044\u306e\u74b0\u5883\u306b\u5408\u308f\u305b\u3066\u8aad\u307f\u66ff\u3048\u3066\u304f\u3060\u3055\u3044\u3002<\/p>\n<p data-line=\"94\" class=\"code-line\">\u307e\u305a\u3001\u4ee5\u4e0b\u306e\u3088\u3046\u306b<code>25.07.gpt_oss.def<\/code>\u3092\u4f5c\u6210\u3057\u3001singularity build\u3092\u884c\u3044\u307e\u3059\u3002<\/p>\n<div 
class=\"code-block-container\">\n<p><span class=\"code-block-filename\">25.07.gpt_oss.def<\/span><\/p>\n<pre class=\"language-bash\"><code class=\"language-bash code-line\" data-line=\"96\">Bootstrap: <span class=\"token function\">docker<\/span>\nFrom: nvcr.io\/nvidia\/nemo:25.07.gpt_oss\n\n%post\n  pip <span class=\"token function\">install<\/span> --no-cache-dir wandb transformers datasets jsonlines tqdm\n<\/code><\/pre>\n<\/div>\n<p data-line=\"104\" class=\"code-line\">\u306a\u304a\u3001build\u3092\u884c\u3046\u969b\u306f\u3001Lustre, NFS\u4e0a\u3067\u306f\u306a\u304f\u3067\u304d\u308b\u3060\u3051<code>\/scratch<\/code>\u306a\u3069\u306eLocal Storage\u3067\u884c\u3046\u3053\u3068\u3067\u51e6\u7406\u6642\u9593\u3092\u77ed\u7e2e\u3059\u308b\u3053\u3068\u3092\u30aa\u30b9\u30b9\u30e1\u3057\u307e\u3059\u3002<\/p>\n<div class=\"code-block-container\">\n<pre class=\"language-bash\"><code class=\"language-bash code-line\" data-line=\"106\"><span class=\"token builtin class-name\">cd<\/span> \/scratch\n<span class=\"token builtin class-name\">export<\/span> <span class=\"token assign-left variable\">SINGULARITY_TMPDIR<\/span><span class=\"token operator\">=<\/span>\/scratch\/tmp\n\nsingularity build <span class=\"token parameter variable\">--sandbox<\/span> <span class=\"token number\">25.07<\/span>.gpt_oss <span class=\"token number\">25.07<\/span>.gpt_oss.def\n<\/code><\/pre>\n<\/div>\n<p data-line=\"113\" class=\"code-line\"><code>.sif<\/code>\u3092\u4f5c\u6210\u3059\u308b\u3068Read only\u306b\u306a\u3063\u3066\u3057\u307e\u3046\u306e\u3067\u3001\u4ee5\u4e0b\u3067\u4f5c\u696d\u3092\u884c\u3046\u3053\u3068\u3092\u60f3\u5b9a\u3057\u3066sandbox\u3092\u4f5c\u6210\u3057\u307e\u3059\u3002<\/p>\n<p data-line=\"116\" 
class=\"code-line\">\u3053\u306e\u30b3\u30f3\u30c6\u30ca\u306e\u4e2d\u3067\u5229\u7528\u3055\u308c\u3066\u3044\u308bNeMo\u3084Megatron-LM\u306fGitHub\u306b\u3066tag\u6253\u3061\u3055\u308c\u3066\u3044\u308b\u3082\u306e\u3068\u7570\u306a\u308b\u5b9f\u88c5\u306e\u305f\u3081\u3001git\u7ba1\u7406\u4e0b\u306b\u7f6e\u304f\u305f\u3081\u306b\u30b3\u30f3\u30c6\u30ca\u304b\u3089\u5b9f\u88c5\u3092\u6458\u51fa\u3057\u307e\u3059\u3002<\/p>\n<div class=\"code-block-container\">\n<pre class=\"language-bash\"><code class=\"language-bash code-line\" data-line=\"118\">singularity shell <span class=\"token parameter variable\">--bind<\/span> \/path\/to\/your:\/path\/to\/your <span class=\"token number\">25.07<\/span>.gpt_oss\nSingularity<span class=\"token operator\">&gt;<\/span>\n<\/code><\/pre>\n<\/div>\n<p data-line=\"123\" class=\"code-line\">\u30b3\u30f3\u30c6\u30ca\u5185\u306b\u5165\u308a\u6b21\u7b2c\u3001<code>\/opt\/NeMo\/<\/code>, <code>\/opt\/megatron-lm<\/code>\u306b\u3042\u308b\u5b9f\u88c5\u3092\u30b3\u30f3\u30c6\u30ca\u5916\u306e\u30d1\u30b9\u306bcopy\u3057\u3066\u3001\u30b3\u30f3\u30c6\u30ca\u304b\u3089\u629c\u3051\u3066\u3082\u30a2\u30af\u30bb\u30b9\u3067\u304d\u308b\u3088\u3046\u306b\u3057\u307e\u3059\u3002<\/p>\n<div class=\"code-block-container\">\n<pre class=\"language-bash\"><code class=\"language-bash code-line\" data-line=\"125\"><span class=\"token function\">cp<\/span> <span class=\"token parameter variable\">-R<\/span> \/opt\/NeMo \/path\/to\/your\n<span class=\"token function\">cp<\/span> <span class=\"token parameter variable\">-R<\/span> \/opt\/megatron-lm \/path\/to\/your\n<\/code><\/pre>\n<\/div>\n<p data-line=\"130\" class=\"code-line\">\u306a\u304a\u3001\u4e0b\u8a18\u3067\u8ff0\u3079\u308b\u4fee\u6b63\u3092\u884c\u3063\u305fMegatron-LM\u3092GitHub\u4e0a\u3067\u516c\u958b\u3057\u3066\u3044\u307e\u3059\u306e\u3067\u3001\u3054\u81ea\u7531\u306b\u3054\u5229\u7528\u304f\u3060\u3055\u3044\u3002<\/p>\n<p data-line=\"132\" class=\"code-line\"><span 
class=\"embed-block zenn-embedded zenn-embedded-card\"><iframe id=\"zenn-embedded__c03d5bab24e7b\" src=\"https:\/\/embed.zenn.studio\/card#zenn-embedded__c03d5bab24e7b\" data-content=\"https%3A%2F%2Fgithub.com%2Fokoge-kaz%2Fgpt-oss-megatron-lm\" frameborder=\"0\" scrolling=\"no\" loading=\"lazy\"><\/iframe><\/span><a target=\"_blank\" href=\"https:\/\/github.com\/okoge-kaz\/gpt-oss-megatron-lm\" style=\"display:none\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">https:\/\/github.com\/okoge-kaz\/gpt-oss-megatron-lm<\/a><\/p>\n<aside class=\"msg message\"><span class=\"msg-symbol\">!<\/span><\/p>\n<div class=\"msg-content\">\n<p data-line=\"136\" class=\"code-line\">singularity\u3092\u5229\u7528\u3057\u305fbuild\u5468\u308a\u306b\u3064\u3044\u3066\u3082\u3046\u5c11\u3057\u8a73\u3057\u304f\u6559\u3048\u3066\u304f\u308c\u307e\u305b\u3093\u304b\uff1f<\/p>\n<p data-line=\"138\" class=\"code-line\">\u4ee5\u4e0b\u306e\u8a18\u4e8b\u306b\u3066\u3001singularity\u306e<code>.def<\/code>,<code>.sif<\/code>, sandbox\u306e\u5f79\u5272\u3068<code>\/scratch<\/code>\u9818\u57df\u3067build\u3059\u308b\u7406\u7531\u306a\u3069\u3092\u7c21\u5358\u306b\u8a18\u8f09\u3057\u3066\u3044\u307e\u3059\u3002\u3054\u53c2\u7167\u304f\u3060\u3055\u3044\u3002<br \/><a target=\"_blank\" href=\"https:\/\/zenn.dev\/turing_motors\/articles\/04eed10b0aafe9#singularity\" target=\"_blank\">https:\/\/zenn.dev\/turing_motors\/articles\/04eed10b0aafe9#singularity<\/a><\/p>\n<\/div>\n<\/aside>\n<h3 id=\"%E5%A4%89%E6%9B%B4%E3%81%AEapply\" data-line=\"143\" class=\"code-line\">\n \u5909\u66f4\u306eapply<\/h3>\n<p data-line=\"145\" class=\"code-line\"><a target=\"_blank\" href=\"https:\/\/github.com\/NVIDIA\/Megatron-LM\/pull\/2038\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">\u3053\u3061\u3089\u306ePull 
Request<\/a>\u306b\u304b\u304b\u308c\u3066\u3044\u308b\u3088\u3046\u306b\u3001\u3053\u306e\u30b3\u30f3\u30c6\u30ca\u5185\u306e\u5b9f\u88c5\u3092\u5229\u7528\u3059\u308b\u3068Yarn\u306e\u5b9f\u88c5\u304cHuggingFace\u5b9f\u88c5\u3068\u4e56\u96e2\u3057\u3066\u3057\u307e\u3063\u3066\u3044\u308b\u3088\u3046\u3067\u3059\u3002\u305d\u3053\u3067\u3001\u4ee5\u4e0b\u306e\u3088\u3046\u306b\u30b3\u30f3\u30c6\u30ca\u5185\u306emegatron-lm\u306e\u5b9f\u88c5\u3092\u4fee\u6b63\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002<\/p>\n<div class=\"code-block-container\">\n<pre class=\"language-bash\"><code class=\"language-bash code-line\" data-line=\"147\">Singularity<span class=\"token operator\">&gt;<\/span> <span class=\"token function\">vim<\/span> \/opt\/megatron-lm\/megatron\/core\/models\/common\/embeddings\/rope_utils.py\nSingularity<span class=\"token operator\">&gt;<\/span> <span class=\"token function\">rm<\/span> \/opt\/megatron-lm\/megatron\/core\/models\/common\/embeddings\/rope_utils.py\nSingularity<span class=\"token operator\">&gt;<\/span> <span class=\"token function\">vim<\/span> \/opt\/megatron-lm\/megatron\/core\/models\/common\/embeddings\/rope_utils.py\nSingularity<span class=\"token operator\">&gt;<\/span> <span class=\"token function\">rm<\/span> \/opt\/megatron-lm\/megatron\/core\/models\/common\/embeddings\/yarn_rotary_pos_embedding.py\nSingularity<span class=\"token operator\">&gt;<\/span> <span class=\"token function\">vim<\/span> \/opt\/megatron-lm\/megatron\/core\/models\/common\/embeddings\/yarn_rotary_pos_embedding.py\nSingularity<span class=\"token operator\">&gt;<\/span> <span class=\"token function\">rm<\/span> \/opt\/megatron-lm\/megatron\/core\/transformer\/dot_product_attention.py\nSingularity<span class=\"token operator\">&gt;<\/span> <span class=\"token function\">vim<\/span> \/opt\/megatron-lm\/megatron\/core\/transformer\/dot_product_attention.py\nSingularity<span class=\"token operator\">&gt;<\/span> <span class=\"token 
function\">rm<\/span> \/opt\/megatron-lm\/megatron\/core\/transformer\/utils.py\nSingularity<span class=\"token operator\">&gt;<\/span> <span class=\"token function\">vim<\/span> \/opt\/megatron-lm\/megatron\/core\/transformer\/utils.py\n<\/code><\/pre>\n<\/div>\n<p data-line=\"159\" class=\"code-line\">\u306a\u304a\u3001\u4fee\u6b63\u304c\u5fc5\u8981\u306a\u5dee\u5206\u306f\u4ee5\u4e0b\u306e\u3068\u304a\u308a\u3067\u3059\u3002<br style=\"display:none\"\/><br \/>\n<span class=\"embed-block zenn-embedded zenn-embedded-card\"><iframe id=\"zenn-embedded__c66412e83107b\" src=\"https:\/\/embed.zenn.studio\/card#zenn-embedded__c66412e83107b\" data-content=\"https%3A%2F%2Fgithub.com%2Fokoge-kaz%2Fgpt-oss-megatron-lm%2Fcommit%2F01b3824fe9d81b211b8aee6bfb35bd92169f8eb9\" frameborder=\"0\" scrolling=\"no\" loading=\"lazy\"><\/iframe><\/span><a target=\"_blank\" href=\"https:\/\/github.com\/okoge-kaz\/gpt-oss-megatron-lm\/commit\/01b3824fe9d81b211b8aee6bfb35bd92169f8eb9\" style=\"display:none\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">https:\/\/github.com\/okoge-kaz\/gpt-oss-megatron-lm\/commit\/01b3824fe9d81b211b8aee6bfb35bd92169f8eb9<\/a><\/p>\n<aside class=\"msg message\"><span class=\"msg-symbol\">!<\/span><\/p>\n<div class=\"msg-content\">\n<p data-line=\"164\" class=\"code-line\">Pull Request\u304c\u51fa\u3066\u3044\u308b\u5143\u306e\u5b9f\u88c5\u3068\u30b3\u30f3\u30c6\u30ca\u5185\u306e\u5b9f\u88c5\u3068\u306e\u4e56\u96e2\u304c\u3042\u308b\u306e\u3067\u3001\u3053\u306ePull Request\u3068\u306f\u95a2\u4fc2\u304c\u306a\u3044API\u306e\u5909\u66f4\u306a\u3069\u3082\u8003\u616e\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002\u4e0a\u8a18\u306e\u5dee\u5206\u306b\u305f\u3069\u308a\u7740\u304f\u307e\u3067\u306b\u305d\u308c\u306a\u308a\u306b\u82e6\u52b4\u3057\u307e\u3057\u305f\u3002<\/p>\n<\/div>\n<\/aside>\n<h2 id=\"nemo\" data-line=\"168\" class=\"code-line\">\n NeMo<\/h2>\n<p data-line=\"170\" 
class=\"code-line\">\u30b3\u30f3\u30c6\u30ca\u5185\u304b\u3089\u6458\u51fa\u3057\u3066\u304d\u305fNeMo\u3092Git\u7ba1\u7406\u4e0b\u306b\u304a\u304d\u3001\u5b9f\u88c5\u3092\u884c\u3063\u3066\u3044\u304d\u307e\u3059\u3002<\/p>\n<h3 id=\"%E7%8F%BE%E7%8A%B6\" data-line=\"172\" class=\"code-line\">\n \u73fe\u72b6<\/h3>\n<p data-line=\"174\" class=\"code-line\">\u307e\u305a\u3001\u73fe\u72b6\u3092\u78ba\u8a8d\u3057\u307e\u3057\u3087\u3046\u3002<\/p>\n<p data-line=\"176\" class=\"code-line\">gpt-oss\u306e\u5b66\u7fd2\u3092NeMo\u3067\u884c\u3046\u306b\u306f\u3001HuggingFace\u5f62\u5f0f\u3067\u516c\u958b\u3055\u308c\u3066\u3044\u308bcheckpoint\u3092NeMo\u3067\u8aad\u307f\u8fbc\u3081\u308b\u3088\u3046\u306bNeMo\u5f62\u5f0f\u306echeckpoint\u306b\u5909\u63db\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002<br \/>\u307e\u305f\u3001\u30b3\u30f3\u30c6\u30ca\u304b\u3089\u6458\u51fa\u3057\u305fNeMo\u5185\u306e<code>tutorials\/llm\/gpt-oss\/ticket-routing-lora\/gpt-oss-lora.ipynb<\/code>\u306b\u3042\u308btutorial\u306fLoRA SFT\u3057\u304b\u89e3\u8aac\u3057\u3066\u3044\u306a\u3044\u3070\u304b\u308a\u304b\u3001<code>nemo\/collections\/llm\/recipes\/gpt_oss_20b.py<\/code>\u306e\u5b9f\u88c5\u3082pretrain\u7528\u306e\u5b9f\u88c5\u3092\u30b5\u30dd\u30fc\u30c8\u3057\u3066\u3044\u307e\u305b\u3093\u3002<\/p>\n<p data-line=\"179\" class=\"code-line\">\u305d\u306e\u305f\u3081\u3001\u307e\u3060\u307e\u3060\u9053\u306e\u308a\u306f\u9060\u305d\u3046\u3067\u3059&#8230;\u3002<br \/>\uff11\u3064\uff11\u3064\u7247\u4ed8\u3051\u3066\u3044\u304d\u307e\u3057\u3087\u3046\u3002<\/p>\n<aside class=\"msg message\"><span class=\"msg-symbol\">!<\/span><\/p>\n<div class=\"msg-content\">\n<p data-line=\"184\" class=\"code-line\">\u306a\u305c\u3001checkpoint convert(\u30c1\u30a7\u30c3\u30af\u30dd\u30a4\u30f3\u30c8\u5909\u63db)\u304c\u5fc5\u8981\u306a\u306e\u3067\u3057\u3087\u3046\u304b\uff1f<\/p>\n<p data-line=\"186\" class=\"code-line\">\u307e\u305acheckpoint 
convert\u306f\u4ee5\u4e0b\u306e\u3088\u3046\u306a\u624b\u9806\u306b\u306a\u3063\u3066\u3044\u307e\u3059\u3002<br \/>\u591a\u304f\u306e\u30e2\u30c7\u30eb\u306fHuggingFace(HF)\u306b\u3066\u3001huggingface\u306e\u5b9a\u3081\u308b\u30e2\u30c7\u30eb\u5b9f\u88c5\u306e\u5f62\u5f0f(HF format)\u3067\u914d\u5e03\u3055\u308c\u3066\u3044\u307e\u3059\u3002\u3053\u306e\u5f62\u5f0f\u306f\u3001\u7c21\u5358\u306b\u5229\u7528\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u307e\u3059\u304c\u3001NVIDIA NeMo\u306a\u3069\u306e\u9ad8\u901f\u306a\u5b66\u7fd2\u3092\u884c\u3046\u305f\u3081\u306e\u30e9\u30a4\u30d6\u30e9\u30ea\u306e\u5f62\u5f0f\u3068\u306f\u5b9f\u88c5\u304c\u7570\u306a\u308b\u305f\u3081\u3001\u305d\u306e\u307e\u307e\u3067\u306f\u5229\u7528\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u307e\u305b\u3093\u3002\u305d\u3053\u3067\u3001checkpoint convert\u304c\u5fc5\u8981\u306b\u306a\u308b\u306e\u3067\u3059\u3002<\/p>\n<p data-line=\"189\" class=\"code-line\">\u4ee5\u4e0b\u306e\u56f3\u306f\u3001gpt-oss-20b\u3092\u7528\u3044\u3066\u3001\u65e5\u672c\u8a9e\u3084\u30c9\u30e1\u30a4\u30f3\u77e5\u8b58\u3092\u7372\u5f97\u3055\u305b\u308b\u305f\u3081\u306e\u30c7\u30fc\u30bf\u3067\u5b66\u7fd2(Training)\u3092\u884c\u3044\u3001\u6700\u7d42\u7684\u306b\u914d\u5e03\u3059\u308b\u5f62\u5f0f\u3067\u3042\u308bHF format\u306b\u623b\u3059\u307e\u3067\u306e\u69d8\u5b50\u3092\u56f3\u793a\u3057\u3066\u3044\u307e\u3059\u3002<\/p>\n<p data-line=\"191\" class=\"code-line\"><img decoding=\"async\" src=\"https:\/\/storage.googleapis.com\/zenn-user-upload\/9d0ee43007a8-20251112.png\" alt=\"\" class=\"md-img\" loading=\"lazy\"\/><\/p>\n<\/div>\n<\/aside>\n<h3 id=\"hf--%3E-nemo\" data-line=\"196\" class=\"code-line\">\n hf -&gt; nemo<\/h3>\n<p data-line=\"198\" 
class=\"code-line\">\u516c\u5f0f\u306e\u30c9\u30ad\u30e5\u30e1\u30f3\u30c8\u306bconvert\u30b9\u30af\u30ea\u30d7\u30c8\u306e\u4f7f\u3044\u65b9\u304c\u66f8\u3044\u3066\u3042\u308b\u306e\u3067\u3059\u304c\u3001\u6b63\u76f4\u5206\u304b\u308a\u3065\u3089\u3044\u3067\u3059\u3002<br \/>\u4ee5\u4e0b\u306e\u3088\u3046\u306bconvert script\u3092\u5b9f\u88c5\u3057\u3001\u5229\u7528\u3059\u308b\u3068\u7c21\u5358\u306b\u4f7f\u7528\u3067\u304d\u307e\u3059\u3002<\/p>\n<div class=\"code-block-container\">\n<p><span class=\"code-block-filename\">experiments\/ckpt-convert\/hf-to-nemo\/gpt-oss.py<\/span><\/p>\n<pre class=\"language-python\"><code class=\"language-python code-line\" data-line=\"201\"><span class=\"token keyword\">import<\/span> argparse\n<span class=\"token keyword\">from<\/span> nemo<span class=\"token punctuation\">.<\/span>collections <span class=\"token keyword\">import<\/span> llm\n\n<span class=\"token keyword\">if<\/span> __name__ <span class=\"token operator\">==<\/span> <span class=\"token string\">\"__main__\"<\/span><span class=\"token punctuation\">:<\/span>\n    parser <span class=\"token operator\">=<\/span> argparse<span class=\"token punctuation\">.<\/span>ArgumentParser<span class=\"token punctuation\">(<\/span>description<span class=\"token operator\">=<\/span><span class=\"token string\">\"Convert Hugging Face GPT-OSS checkpoints to NeMo format.\"<\/span><span class=\"token punctuation\">)<\/span>\n    parser<span class=\"token punctuation\">.<\/span>add_argument<span class=\"token punctuation\">(<\/span>\n        <span class=\"token string\">\"--model-size\"<\/span><span class=\"token punctuation\">,<\/span>\n        <span class=\"token builtin\">type<\/span><span class=\"token operator\">=<\/span><span class=\"token builtin\">str<\/span><span class=\"token punctuation\">,<\/span>\n        choices<span class=\"token operator\">=<\/span><span class=\"token punctuation\">[<\/span><span class=\"token string\">\"20B\"<\/span><span class=\"token 
punctuation\">,<\/span> <span class=\"token string\">\"120B\"<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span>\n        required<span class=\"token operator\">=<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span>\n        <span class=\"token builtin\">help<\/span><span class=\"token operator\">=<\/span><span class=\"token string\">\"Size of the GPT-OSS model to convert (20B or 120B).\"<\/span><span class=\"token punctuation\">,<\/span>\n    <span class=\"token punctuation\">)<\/span>\n    parser<span class=\"token punctuation\">.<\/span>add_argument<span class=\"token punctuation\">(<\/span>\n        <span class=\"token string\">\"--hf-checkpoint-path\"<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token builtin\">type<\/span><span class=\"token operator\">=<\/span><span class=\"token builtin\">str<\/span><span class=\"token punctuation\">,<\/span> required<span class=\"token operator\">=<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token builtin\">help<\/span><span class=\"token operator\">=<\/span><span class=\"token string\">\"Path to the Hugging Face GPT-OSS checkpoint.\"<\/span>\n    <span class=\"token punctuation\">)<\/span>\n    parser<span class=\"token punctuation\">.<\/span>add_argument<span class=\"token punctuation\">(<\/span>\n        <span class=\"token string\">\"--nemo-output-path\"<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token builtin\">type<\/span><span class=\"token operator\">=<\/span><span class=\"token builtin\">str<\/span><span class=\"token punctuation\">,<\/span> required<span class=\"token operator\">=<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token builtin\">help<\/span><span class=\"token operator\">=<\/span><span class=\"token string\">\"Path to save the converted NeMo 
checkpoint.\"<\/span>\n    <span class=\"token punctuation\">)<\/span>\n    args <span class=\"token operator\">=<\/span> parser<span class=\"token punctuation\">.<\/span>parse_args<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span>\n    \n    <span class=\"token keyword\">if<\/span> args<span class=\"token punctuation\">.<\/span>model_size <span class=\"token operator\">==<\/span> <span class=\"token string\">\"20B\"<\/span><span class=\"token punctuation\">:<\/span>\n        llm<span class=\"token punctuation\">.<\/span>import_ckpt<span class=\"token punctuation\">(<\/span>\n            model<span class=\"token operator\">=<\/span>llm<span class=\"token punctuation\">.<\/span>GPTOSSModel<span class=\"token punctuation\">(<\/span>llm<span class=\"token punctuation\">.<\/span>GPTOSSConfig20B<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span>\n            source<span class=\"token operator\">=<\/span><span class=\"token string\">\"hf:\/\/\"<\/span> <span class=\"token operator\">+<\/span> args<span class=\"token punctuation\">.<\/span>hf_checkpoint_path<span class=\"token punctuation\">,<\/span>\n            output_path<span class=\"token operator\">=<\/span>args<span class=\"token punctuation\">.<\/span>nemo_output_path<span class=\"token punctuation\">,<\/span>\n        <span class=\"token punctuation\">)<\/span>\n    \n    <span class=\"token keyword\">elif<\/span> args<span class=\"token punctuation\">.<\/span>model_size <span class=\"token operator\">==<\/span> <span class=\"token string\">\"120B\"<\/span><span class=\"token punctuation\">:<\/span>\n        llm<span class=\"token punctuation\">.<\/span>import_ckpt<span class=\"token punctuation\">(<\/span>\n            model<span class=\"token operator\">=<\/span>llm<span class=\"token punctuation\">.<\/span>GPTOSSModel<span class=\"token 
punctuation\">(<\/span>llm<span class=\"token punctuation\">.<\/span>GPTOSSConfig120B<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span>\n            source<span class=\"token operator\">=<\/span><span class=\"token string\">\"hf:\/\/\"<\/span> <span class=\"token operator\">+<\/span> args<span class=\"token punctuation\">.<\/span>hf_checkpoint_path<span class=\"token punctuation\">,<\/span>\n            output_path<span class=\"token operator\">=<\/span>args<span class=\"token punctuation\">.<\/span>nemo_output_path<span class=\"token punctuation\">,<\/span>\n        <span class=\"token punctuation\">)<\/span>\n    <span class=\"token keyword\">else<\/span><span class=\"token punctuation\">:<\/span>\n        <span class=\"token keyword\">raise<\/span> ValueError<span class=\"token punctuation\">(<\/span><span class=\"token string\">\"Unsupported model size. Choose either '20B' or '120B'.\"<\/span><span class=\"token punctuation\">)<\/span>\n\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f\"Conversion complete! 
NeMo checkpoint saved at <\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>args<span class=\"token punctuation\">.<\/span>nemo_output_path<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">\"<\/span><\/span><span class=\"token punctuation\">)<\/span>\n\n<\/code><\/pre>\n<\/div>\n<p data-line=\"242\" class=\"code-line\">\u4e0a\u8a18\u306e\u3088\u3046\u306b\u5b9f\u88c5\u3057\u305f\u30b9\u30af\u30ea\u30d7\u30c8\u3092\u5229\u7528\u3057\u3066\u4ee5\u4e0b\u306e\u3088\u3046\u306b\u3059\u308b\u3053\u3068\u3067\u3001HuggingFace format\u306echeckpoint\u3092NeMo\u5f62\u5f0f\u306echeckpoint\u306bconvert\u3059\u308b\u3053\u3068\u304c\u53ef\u80fd\u3067\u3059\u3002<\/p>\n<div class=\"code-block-container\">\n<pre class=\"language-bash\"><code class=\"language-bash code-line\" data-line=\"244\"><span class=\"token assign-left variable\">HF_CHECKPOINT_PATH<\/span><span class=\"token operator\">=<\/span><span class=\"token string\">\"\/path\/to\/gpt-oss-20b\"<\/span>\n<span class=\"token assign-left variable\">NEMO_OUTPUT_PATH<\/span><span class=\"token operator\">=<\/span><span class=\"token string\">\"\/path\/to\/checkpoints\/hf-to-nemo\/gpt-oss-20B.nemo\"<\/span>\n<span class=\"token function\">mkdir<\/span> <span class=\"token parameter variable\">-p<\/span> <span class=\"token variable\"><span class=\"token variable\">$(<\/span><span class=\"token function\">dirname<\/span> $<span class=\"token punctuation\">{<\/span>NEMO_OUTPUT_PATH<span class=\"token punctuation\">}<\/span><span class=\"token variable\">)<\/span><\/span>\n\n<span class=\"token builtin class-name\">export<\/span> <span class=\"token assign-left variable\">NUMEXPR_MAX_THREADS<\/span><span class=\"token operator\">=<\/span><span class=\"token number\">192<\/span>\n\nsingularity <span class=\"token builtin class-name\">exec<\/span> <span class=\"token punctuation\">\\<\/span>\n  <span class=\"token parameter variable\">--nv<\/span> <span 
class=\"token punctuation\">\\<\/span>\n  <span class=\"token parameter variable\">--bind<\/span> \/path\/to:\/path\/to <span class=\"token punctuation\">\\<\/span>\n  <span class=\"token parameter variable\">--bind<\/span> \/tmp:\/tmp <span class=\"token punctuation\">\\<\/span>\n  \/path\/to\/25.07.gpt_oss.sif <span class=\"token punctuation\">\\<\/span>\n  python experiments\/ckpt-convert\/hf-to-nemo\/gpt-oss.py <span class=\"token punctuation\">\\<\/span>\n    --model-size 20B <span class=\"token punctuation\">\\<\/span>\n    --hf-checkpoint-path <span class=\"token variable\">${HF_CHECKPOINT_PATH}<\/span> <span class=\"token punctuation\">\\<\/span>\n    --nemo-output-path <span class=\"token variable\">${NEMO_OUTPUT_PATH}<\/span>\n<\/code><\/pre>\n<\/div>\n<p data-line=\"262\" class=\"code-line\">\u3053\u308c\u3067\u3001NeMo\u5f62\u5f0f\u306echeckpoint\u3092\u5f97\u308b\u3053\u3068\u304c\u51fa\u6765\u307e\u3057\u305f\u3002<\/p>\n<aside class=\"msg message\"><span class=\"msg-symbol\">!<\/span>\n<div class=\"msg-content\">\n<p data-line=\"266\" class=\"code-line\">\u516c\u5f0f\u306e\u624b\u9806\u304c\u5206\u304b\u308a\u306b\u304f\u3044\u3068\u304d\u306b\u3069\u306e\u3088\u3046\u306b\u4e0a\u8a18\u306e\u3088\u3046\u306a\u5b9f\u88c5\u3092\u5b9f\u73fe\u3057\u3066\u3044\u308b\u306e\u3067\u3059\u304b\uff1f<\/p>\n<p data-line=\"268\" class=\"code-line\">\u30c9\u30ad\u30e5\u30e1\u30f3\u30c8\u304c\u5f53\u3066\u306b\u306a\u3089\u306a\u3044\u306e\u306f\u65e5\u5e38\u306a\u306e\u3067\u3001\u516c\u5f0f\u304c\u63d0\u4f9b\u3057\u3066\u3044\u308b\u5b9f\u88c5\u306e\u3046\u3061\u3001checkpoint convert\u306b\u95a2\u4fc2\u3057\u305d\u3046\u306a\u3082\u306e\u3092\u624b\u5f53\u305f\u308a\u6b21\u7b2c\u306b\u78ba\u8a8d\u3057\u3066\u529b\u6280\u3067\u5b9f\u73fe\u3057\u3066\u3044\u307e\u3059\u3002<\/p>\n<\/div>\n<\/aside>\n<h3 id=\"pretrain_recipe\" data-line=\"272\" class=\"code-line\">\n pretrain_recipe<\/h3>\n<p data-line=\"274\" 
class=\"code-line\"><code>nemo\/collections\/llm\/recipes\/gpt_oss_20b.py<\/code>, <code>nemo\/collections\/llm\/recipes\/gpt_oss_120b.py<\/code>\u3092\u898b\u308b\u3068pretrain recipe\u304c\u306a\u3044\u3053\u3068\u306b\u6c17\u3065\u304d\u307e\u3059\u3002(\u4ee5\u4e0b\u306e\u3088\u3046\u306afinetune recipe\u3057\u304b\u3042\u308a\u307e\u305b\u3093)<\/p>\n<div class=\"code-block-container\">\n<pre class=\"language-python\"><code class=\"language-python code-line\" data-line=\"276\"><span class=\"token decorator annotation punctuation\">@run<span class=\"token punctuation\">.<\/span>cli<span class=\"token punctuation\">.<\/span>factory<\/span><span class=\"token punctuation\">(<\/span>target<span class=\"token operator\">=<\/span>finetune<span class=\"token punctuation\">,<\/span> name<span class=\"token operator\">=<\/span>NAME<span class=\"token punctuation\">)<\/span>\n<span class=\"token keyword\">def<\/span> <span class=\"token function\">finetune_recipe<\/span><span class=\"token punctuation\">(<\/span>\n    <span class=\"token builtin\">dir<\/span><span class=\"token punctuation\">:<\/span> Optional<span class=\"token punctuation\">[<\/span><span class=\"token builtin\">str<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token operator\">=<\/span> <span class=\"token boolean\">None<\/span><span class=\"token punctuation\">,<\/span>\n    resume_path<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">str<\/span> <span class=\"token operator\">=<\/span> <span class=\"token string\">\"openai\/gpt-oss-20b\"<\/span><span class=\"token punctuation\">,<\/span>\n    name<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">str<\/span> <span class=\"token operator\">=<\/span> <span class=\"token string\">\"default\"<\/span><span class=\"token punctuation\">,<\/span>\n    num_nodes<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span 
class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span>\n    num_gpus_per_node<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">8<\/span><span class=\"token punctuation\">,<\/span>\n    peft_scheme<span class=\"token punctuation\">:<\/span> Optional<span class=\"token punctuation\">[<\/span><span class=\"token builtin\">str<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token operator\">=<\/span> <span class=\"token string\">\"lora\"<\/span><span class=\"token punctuation\">,<\/span>\n    packed_sequence<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">bool<\/span> <span class=\"token operator\">=<\/span> <span class=\"token boolean\">False<\/span><span class=\"token punctuation\">,<\/span>\n<span class=\"token punctuation\">)<\/span> <span class=\"token operator\">-<\/span><span class=\"token operator\">&gt;<\/span> run<span class=\"token punctuation\">.<\/span>Partial<span class=\"token punctuation\">:<\/span>\n<\/code><\/pre>\n<\/div>\n<p data-line=\"289\" class=\"code-line\">\u305d\u3053\u3067\u3001\u4ee5\u4e0b\u306e\u3088\u3046\u306bpretrain recipe\u3092\u5b9f\u88c5\u3057\u3066\u3044\u304d\u307e\u3059\u3002<br \/>\u3042\u304f\u307e\u3067\u4ee5\u4e0b\u306f\u5b9f\u88c5\u306e\u4e00\u4f8b\u3067\u3042\u308a\u3001\u30aa\u30d7\u30b7\u30e7\u30f3\u7b49\u3092\u30a4\u30b8\u3089\u306a\u3044\u5834\u5408\u306f\u3082\u3063\u3068\u7c21\u7d20\u5316\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u308b\u3068\u601d\u3044\u307e\u3059\u3002<\/p>\n<div class=\"code-block-container\">\n<pre class=\"language-python\"><code class=\"language-python code-line\" data-line=\"292\"><span class=\"token decorator annotation punctuation\">@run<span class=\"token punctuation\">.<\/span>cli<span class=\"token punctuation\">.<\/span>factory<\/span><span class=\"token punctuation\">(<\/span>target<span class=\"token 
operator\">=<\/span>pretrain<span class=\"token punctuation\">,<\/span> name<span class=\"token operator\">=<\/span>NAME<span class=\"token punctuation\">)<\/span>\n<span class=\"token keyword\">def<\/span> <span class=\"token function\">pretrain_recipe<\/span><span class=\"token punctuation\">(<\/span>\n    <span class=\"token builtin\">dir<\/span><span class=\"token punctuation\">:<\/span> Optional<span class=\"token punctuation\">[<\/span><span class=\"token builtin\">str<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token operator\">=<\/span> <span class=\"token boolean\">None<\/span><span class=\"token punctuation\">,<\/span>\n    name<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">str<\/span> <span class=\"token operator\">=<\/span> <span class=\"token string\">\"default\"<\/span><span class=\"token punctuation\">,<\/span>\n    num_nodes<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span>\n    num_gpus_per_node<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">8<\/span><span class=\"token punctuation\">,<\/span>\n    performance_mode<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">bool<\/span> <span class=\"token operator\">=<\/span> <span class=\"token boolean\">False<\/span><span class=\"token punctuation\">,<\/span>\n    tensor_parallel_size<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span>\n    context_parallel_size<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token 
number\">1<\/span><span class=\"token punctuation\">,<\/span>\n    expert_parallel_size<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span>\n    pipeline_parallel_size<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span>\n    sequence_parallelism<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">bool<\/span> <span class=\"token operator\">=<\/span> <span class=\"token boolean\">False<\/span><span class=\"token punctuation\">,<\/span>\n    seq_length<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">32768<\/span><span class=\"token punctuation\">,<\/span>\n    global_batch_size<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">256<\/span><span class=\"token punctuation\">,<\/span>\n    micro_batch_size<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span>\n    lr<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">float<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">3e-4<\/span><span class=\"token punctuation\">,<\/span>\n    min_lr<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">float<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">3e-5<\/span><span class=\"token punctuation\">,<\/span>\n    train_steps<span class=\"token punctuation\">:<\/span> <span class=\"token 
builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">25000<\/span><span class=\"token punctuation\">,<\/span>\n    warmup_steps<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">1000<\/span><span class=\"token punctuation\">,<\/span>\n    adam_beta1<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">float<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">0.9<\/span><span class=\"token punctuation\">,<\/span>\n    adam_beta2<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">float<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">0.95<\/span><span class=\"token punctuation\">,<\/span>\n    adam_eps<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">float<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">1e-8<\/span><span class=\"token punctuation\">,<\/span>\n    weight_decay<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">float<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">0.1<\/span><span class=\"token punctuation\">,<\/span>\n    clip_grad<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">float<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">1.0<\/span><span class=\"token punctuation\">,<\/span>\n    constant_step<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">0<\/span><span class=\"token punctuation\">,<\/span>\n    fp8<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">str<\/span> <span class=\"token operator\">=<\/span> <span class=\"token string\">\"\"<\/span><span class=\"token punctuation\">,<\/span>\n    fn<span class=\"token 
punctuation\">:<\/span> Callable <span class=\"token operator\">=<\/span> pretrain<span class=\"token punctuation\">,<\/span>\n<span class=\"token punctuation\">)<\/span> <span class=\"token operator\">-<\/span><span class=\"token operator\">&gt;<\/span> run<span class=\"token punctuation\">.<\/span>Partial<span class=\"token punctuation\">:<\/span>\n    recipe <span class=\"token operator\">=<\/span> run<span class=\"token punctuation\">.<\/span>Partial<span class=\"token punctuation\">(<\/span>\n        fn<span class=\"token punctuation\">,<\/span>\n        model<span class=\"token operator\">=<\/span>model<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span>\n        trainer<span class=\"token operator\">=<\/span>trainer<span class=\"token punctuation\">(<\/span>\n            num_nodes<span class=\"token operator\">=<\/span>num_nodes<span class=\"token punctuation\">,<\/span>\n            num_gpus_per_node<span class=\"token operator\">=<\/span>num_gpus_per_node<span class=\"token punctuation\">,<\/span>\n            tensor_parallelism<span class=\"token operator\">=<\/span>tensor_parallel_size<span class=\"token punctuation\">,<\/span>\n            context_parallelism<span class=\"token operator\">=<\/span>context_parallel_size<span class=\"token punctuation\">,<\/span>\n            pipeline_parallelism<span class=\"token operator\">=<\/span>pipeline_parallel_size<span class=\"token punctuation\">,<\/span>\n            sequence_parallelism<span class=\"token operator\">=<\/span>sequence_parallelism<span class=\"token punctuation\">,<\/span>\n            expert_parallel_size<span class=\"token operator\">=<\/span>expert_parallel_size<span class=\"token punctuation\">,<\/span>\n            fp8<span class=\"token operator\">=<\/span>fp8<span class=\"token punctuation\">,<\/span>\n            callbacks<span class=\"token operator\">=<\/span><span class=\"token punctuation\">[<\/span>\n      
          run<span class=\"token punctuation\">.<\/span>Config<span class=\"token punctuation\">(<\/span>\n                    TimingCallback<span class=\"token punctuation\">,<\/span>\n                    log_tokens_per_sec<span class=\"token operator\">=<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span>\n                <span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span>\n            <span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span>\n        <span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span>\n        data<span class=\"token operator\">=<\/span>run<span class=\"token punctuation\">.<\/span>Config<span class=\"token punctuation\">(<\/span>\n            MockDataModule<span class=\"token punctuation\">,<\/span>\n            seq_length<span class=\"token operator\">=<\/span>seq_length<span class=\"token punctuation\">,<\/span>\n            global_batch_size<span class=\"token operator\">=<\/span>global_batch_size<span class=\"token punctuation\">,<\/span>\n            micro_batch_size<span class=\"token operator\">=<\/span>micro_batch_size<span class=\"token punctuation\">,<\/span>\n        <span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span>\n        log<span class=\"token operator\">=<\/span>default_log<span class=\"token punctuation\">(<\/span><span class=\"token builtin\">dir<\/span><span class=\"token operator\">=<\/span><span class=\"token builtin\">dir<\/span><span class=\"token punctuation\">,<\/span> name<span class=\"token operator\">=<\/span>name<span class=\"token punctuation\">,<\/span> tensorboard_logger<span class=\"token operator\">=<\/span>tensorboard_logger<span class=\"token punctuation\">(<\/span>name<span class=\"token operator\">=<\/span>name<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span><span class=\"token 
punctuation\">,<\/span>\n        optim<span class=\"token operator\">=<\/span>distributed_fused_adam_with_cosine_annealing<span class=\"token punctuation\">(<\/span>\n            train_steps<span class=\"token operator\">=<\/span>train_steps<span class=\"token punctuation\">,<\/span>\n            warmup_steps<span class=\"token operator\">=<\/span>warmup_steps<span class=\"token punctuation\">,<\/span>\n            constant_steps<span class=\"token operator\">=<\/span>constant_step<span class=\"token punctuation\">,<\/span>\n            adam_beta1<span class=\"token operator\">=<\/span>adam_beta1<span class=\"token punctuation\">,<\/span>\n            adam_beta2<span class=\"token operator\">=<\/span>adam_beta2<span class=\"token punctuation\">,<\/span>\n            adam_eps<span class=\"token operator\">=<\/span>adam_eps<span class=\"token punctuation\">,<\/span>\n            max_lr<span class=\"token operator\">=<\/span>lr<span class=\"token punctuation\">,<\/span>\n            min_lr<span class=\"token operator\">=<\/span>min_lr<span class=\"token punctuation\">,<\/span>\n            weight_decay<span class=\"token operator\">=<\/span>weight_decay<span class=\"token punctuation\">,<\/span>\n            clip_grad<span class=\"token operator\">=<\/span>clip_grad<span class=\"token punctuation\">,<\/span>\n        <span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span>\n        resume<span class=\"token operator\">=<\/span>default_resume<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span>\n    <span class=\"token punctuation\">)<\/span>\n    <span class=\"token keyword\">return<\/span> recipe\n<\/code><\/pre>\n<\/div>\n<p data-line=\"363\" class=\"code-line\">\u6b21\u306b\u3001<code>trainer()<\/code>\u3082\u5b9f\u88c5\u3057\u3066\u3057\u307e\u3044\u307e\u3059\u3002<br 
\/>(\u3053\u3061\u3089\u3082\u4ee5\u4e0b\u306f\u4e00\u4f8b\u3067\u3059\u306e\u3067\u3001\u7528\u9014\u306b\u5408\u308f\u305b\u3066\u5b9f\u88c5\u7c92\u5ea6\u306f\u5909\u66f4\u3057\u3066\u304f\u3060\u3055\u3044)<\/p>\n<div class=\"code-block-container\">\n<pre class=\"language-python\"><code class=\"language-python code-line\" data-line=\"366\"><span class=\"token keyword\">def<\/span> <span class=\"token function\">trainer<\/span><span class=\"token punctuation\">(<\/span>\n    tensor_parallelism<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span>\n    pipeline_parallelism<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span>\n    pipeline_parallelism_type<span class=\"token punctuation\">:<\/span> Optional<span class=\"token punctuation\">[<\/span>torch<span class=\"token punctuation\">.<\/span>dtype<span class=\"token punctuation\">]<\/span> <span class=\"token operator\">=<\/span> <span class=\"token boolean\">None<\/span><span class=\"token punctuation\">,<\/span>\n    virtual_pipeline_parallelism<span class=\"token punctuation\">:<\/span> Optional<span class=\"token punctuation\">[<\/span><span class=\"token builtin\">int<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token operator\">=<\/span> <span class=\"token boolean\">None<\/span><span class=\"token punctuation\">,<\/span>\n    context_parallelism<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">2<\/span><span class=\"token punctuation\">,<\/span>\n    expert_parallel_size<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token 
operator\">=<\/span> <span class=\"token number\">4<\/span><span class=\"token punctuation\">,<\/span>\n    sequence_parallelism<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">bool<\/span> <span class=\"token operator\">=<\/span> <span class=\"token boolean\">False<\/span><span class=\"token punctuation\">,<\/span>\n    num_nodes<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span>\n    num_gpus_per_node<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">8<\/span><span class=\"token punctuation\">,<\/span>\n    max_steps<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">int<\/span> <span class=\"token operator\">=<\/span> <span class=\"token number\">1168251<\/span><span class=\"token punctuation\">,<\/span>\n    fp8<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">str<\/span> <span class=\"token operator\">=<\/span> <span class=\"token string\">\"\"<\/span><span class=\"token punctuation\">,<\/span>\n    callbacks<span class=\"token punctuation\">:<\/span> Optional<span class=\"token punctuation\">[<\/span><span class=\"token builtin\">list<\/span><span class=\"token punctuation\">[<\/span>run<span class=\"token punctuation\">.<\/span>Config<span class=\"token punctuation\">[<\/span>Callback<span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token operator\">=<\/span> <span class=\"token boolean\">None<\/span><span class=\"token punctuation\">,<\/span>\n<span class=\"token punctuation\">)<\/span> <span class=\"token operator\">-<\/span><span class=\"token operator\">&gt;<\/span> run<span class=\"token punctuation\">.<\/span>Config<span 
class=\"token punctuation\">[<\/span>nl<span class=\"token punctuation\">.<\/span>Trainer<span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">:<\/span>\n    strategy <span class=\"token operator\">=<\/span> run<span class=\"token punctuation\">.<\/span>Config<span class=\"token punctuation\">(<\/span>\n        nl<span class=\"token punctuation\">.<\/span>MegatronStrategy<span class=\"token punctuation\">,<\/span>\n        tensor_model_parallel_size<span class=\"token operator\">=<\/span>tensor_parallelism<span class=\"token punctuation\">,<\/span>\n        pipeline_model_parallel_size<span class=\"token operator\">=<\/span>pipeline_parallelism<span class=\"token punctuation\">,<\/span>\n        pipeline_dtype<span class=\"token operator\">=<\/span>pipeline_parallelism_type<span class=\"token punctuation\">,<\/span>\n        virtual_pipeline_model_parallel_size<span class=\"token operator\">=<\/span>virtual_pipeline_parallelism<span class=\"token punctuation\">,<\/span>\n        context_parallel_size<span class=\"token operator\">=<\/span>context_parallelism<span class=\"token punctuation\">,<\/span>\n        expert_model_parallel_size<span class=\"token operator\">=<\/span>expert_parallel_size<span class=\"token punctuation\">,<\/span>\n        sequence_parallel<span class=\"token operator\">=<\/span>sequence_parallelism<span class=\"token punctuation\">,<\/span>\n        gradient_as_bucket_view<span class=\"token operator\">=<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span>\n        ckpt_async_save<span class=\"token operator\">=<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span>\n        ckpt_parallel_load<span class=\"token operator\">=<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span>\n        ddp<span class=\"token operator\">=<\/span>run<span class=\"token punctuation\">.<\/span>Config<span class=\"token 
punctuation\">(<\/span>\n            DistributedDataParallelConfig<span class=\"token punctuation\">,<\/span>\n            check_for_nan_in_grad<span class=\"token operator\">=<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span>\n            grad_reduce_in_fp32<span class=\"token operator\">=<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span>\n            overlap_grad_reduce<span class=\"token operator\">=<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span>\n            overlap_param_gather<span class=\"token operator\">=<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span>\n            average_in_collective<span class=\"token operator\">=<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span>\n            data_parallel_sharding_strategy<span class=\"token operator\">=<\/span><span class=\"token string\">\"optim_grads_params\"<\/span><span class=\"token punctuation\">,<\/span>  \n        <span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span>\n        fsdp<span class=\"token operator\">=<\/span><span class=\"token boolean\">None<\/span><span class=\"token punctuation\">,<\/span>  \n    <span class=\"token punctuation\">)<\/span>\n\n    precision <span class=\"token operator\">=<\/span> <span class=\"token boolean\">None<\/span>\n    <span class=\"token keyword\">if<\/span> fp8 <span class=\"token operator\">==<\/span> <span class=\"token string\">\"current\"<\/span><span class=\"token punctuation\">:<\/span>\n        precision <span class=\"token operator\">=<\/span> nemotron_h_bf16_with_fp8_current_scaling_mixed<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span>\n    <span class=\"token keyword\">elif<\/span> fp8 <span class=\"token operator\">==<\/span> <span class=\"token 
string\">\"blockwise\"<\/span><span class=\"token punctuation\">:<\/span>\n        precision <span class=\"token operator\">=<\/span> bf16_with_fp8_subchannel_scaling_mixed<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span>\n    <span class=\"token keyword\">else<\/span><span class=\"token punctuation\">:<\/span>\n        precision <span class=\"token operator\">=<\/span> bf16_mixed<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span>\n\n    trainer <span class=\"token operator\">=<\/span> run<span class=\"token punctuation\">.<\/span>Config<span class=\"token punctuation\">(<\/span>\n        nl<span class=\"token punctuation\">.<\/span>Trainer<span class=\"token punctuation\">,<\/span>\n        accelerator<span class=\"token operator\">=<\/span><span class=\"token string\">\"gpu\"<\/span><span class=\"token punctuation\">,<\/span>\n        accumulate_grad_batches<span class=\"token operator\">=<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span>\n        callbacks<span class=\"token operator\">=<\/span>callbacks<span class=\"token punctuation\">,<\/span>\n        devices<span class=\"token operator\">=<\/span>num_gpus_per_node<span class=\"token punctuation\">,<\/span>\n        limit_test_batches<span class=\"token operator\">=<\/span><span class=\"token number\">50<\/span><span class=\"token punctuation\">,<\/span>\n        limit_val_batches<span class=\"token operator\">=<\/span><span class=\"token number\">32<\/span><span class=\"token punctuation\">,<\/span>\n        log_every_n_steps<span class=\"token operator\">=<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span>\n        max_steps<span class=\"token operator\">=<\/span>max_steps<span class=\"token punctuation\">,<\/span>\n        num_nodes<span class=\"token operator\">=<\/span>num_nodes<span class=\"token punctuation\">,<\/span>\n        plugins<span class=\"token 
operator\">=<\/span>precision<span class=\"token punctuation\">,<\/span>\n        strategy<span class=\"token operator\">=<\/span>strategy<span class=\"token punctuation\">,<\/span>\n        use_distributed_sampler<span class=\"token operator\">=<\/span><span class=\"token boolean\">False<\/span><span class=\"token punctuation\">,<\/span>\n        val_check_interval<span class=\"token operator\">=<\/span><span class=\"token number\">2000<\/span><span class=\"token punctuation\">,<\/span>\n        enable_progress_bar<span class=\"token operator\">=<\/span><span class=\"token boolean\">False<\/span><span class=\"token punctuation\">,<\/span>\n    <span class=\"token punctuation\">)<\/span>\n\n    <span class=\"token keyword\">return<\/span> trainer\n<\/code><\/pre>\n<\/div>\n<p data-line=\"434\" class=\"code-line\">Wandb Logger\u306b\u6e21\u3059CallBack\u306e\u5b9f\u88c5\u3084\u3001checkpoint save\u30d1\u30b9\u3092Megatron-LM\u4e92\u63db\u306b\u3059\u308b\u305f\u3081\u306e\u5b9f\u88c5\u3001\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\u95a2\u4fc2\u306a\u3069\u8a73\u7d30\u306f\u307e\u3060\u3042\u308b\u306e\u3067\u3059\u304c\u3001\u3053\u3053\u3067\u306f\u5272\u611b\u3057\u307e\u3059\u3002<\/p>\n<h3 id=\"%E7%8F%BE%E7%8A%B6-1\" data-line=\"436\" class=\"code-line\">\n \u73fe\u72b6<\/h3>\n<p data-line=\"438\" class=\"code-line\">\u3053\u3053\u307e\u3067\u6765\u308c\u3070\u5b66\u7fd2\u3067\u304d\u308b\u3088\u3046\u306b\u306a\u3063\u305f\u3068\u601d\u3044\u305f\u3044\u306e\u3067\u3059\u304c\u3001\u305d\u3046\u3082\u3044\u304d\u307e\u305b\u3093\u3002<br \/>\u73fe\u72b6\u3067\u306f\u3001<strong>learnable softmax<\/strong>(gpt-oss\u72ec\u81ea\u306e\u6a5f\u69cb)\u306b\u5bfe\u5fdc\u3057\u305f<strong>DotProductAttention<\/strong>\u304c<strong>FlashAttention<\/strong>\u3001install\u6e08\u307f\u306e<strong>TransformerEngine<\/strong>\u306b<strong>\u5b58\u5728\u3057\u306a\u3044<\/strong>\u306e\u3067\u3001Context 
Parallelism\u3092\u5229\u7528\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u305a\u3001\u5b66\u7fd2\u53ef\u80fd\u306acontext size\u304c8,192\u3042\u305f\u308a\u306b\u5236\u9650\u3055\u308c\u3066\u3057\u307e\u3044\u307e\u3059\u3002<\/p>\n<p data-line=\"441\" class=\"code-line\">\u5b9f\u969b\u3001\u7121\u7406\u3084\u308a\u5b66\u7fd2\u3057\u3088\u3046\u3068context parallel size &gt; 1\u3068\u3057\u3066\u307f\u308b\u3068\u4ee5\u4e0b\u306e\u3088\u3046\u306a\u30a8\u30e9\u30fc\u304c\u51fa\u307e\u3059\u3002<\/p>\n<div class=\"code-block-container\">\n<pre><code class=\"code-line\" data-line=\"442\">[rank62]:   File \"\/usr\/local\/lib\/python3.12\/dist-packages\/transformer_engine\/pytorch\/attention\/dot_product_attention\/dot_product_attention.py\", line 1370, in forward\n[rank62]:     raise ValueError(\n[rank62]: ValueError: No dot product attention backend is available for the provided inputs. Please run with NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=2 to find out the reasons for disabling all backends.\n<\/code><\/pre>\n<\/div>\n<p data-line=\"448\" class=\"code-line\">\u30a8\u30e9\u30fc\u6587\u306b\u3042\u308b\u3088\u3046\u306b\u3001DEBUG\u30d5\u30e9\u30b0\u3092\u4ed8\u3051\u3066\u5b9f\u884c\u3059\u308b\u3068\u4ee5\u4e0b\u306e\u3088\u3046\u306b\u306a\u308a\u307e\u3059\u3002<\/p>\n<div class=\"code-block-container\">\n<pre><code class=\"code-line\" data-line=\"449\">export NVTE_DEBUG=1\nexport NVTE_DEBUG_LEVEL=2\n<\/code><\/pre>\n<\/div>\n<p data-line=\"454\" class=\"code-line\">\u4ee5\u4e0b\u306e\u30ed\u30b0\u3067\u306f\u3001<a target=\"_blank\" href=\"https:\/\/github.com\/NVIDIA\/TransformerEngine\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">TransformerEngine<\/a>\u306b\u304a\u3044\u3066\u3069\u306e\u3088\u3046\u306a\u8a2d\u5b9a\u304c\u6e21\u3055\u308c\u3066\u3001\u305d\u306e\u7d50\u679c <strong>Attention 
Backend<\/strong>\u3068\u3057\u3066\u4f55\u304c\u9078\u629e\u3055\u308c\u305f\u304b\u306e\u30ed\u30b0\u304c\u51fa\u3066\u3044\u307e\u3059\u3002<\/p>\n<div class=\"code-block-container\">\n<pre><code class=\"code-line\" data-line=\"455\">DEBUG:DotProductAttention:Running with config={'transformer_engine_version': '2.xxx.xxx', 'compute_capability': 'sm90', 'flash_attn_version': '2.7.3', 'flash_attn_3_version': 'not installed', 'cudnn_version': '9.13.0', 'qkv_type': &lt;class 'torch.Tensor'&gt;, 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'sbhd_sbhd_sbhd', 'batch_size': 1, 'num_heads': 64, 'num_gqa_groups': 8, 'max_seqlen_q': 32768, 'max_seqlen_kv': 32768, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'causal', 'window_size': (128, 0), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'no_bias', 'core_attention_bias_shape': None, 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': True, 'cp_comm_type': 'a2a', 'deterministic': False, 'is_training': False, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None}, 'inference_params': None, 'softmax_type': 'learnable', 'return_max_logit': False}\n[DEBUG    | DotProductAttention]: Disabling FusedAttention as no backend supports the provided input\nDEBUG:DotProductAttention:Disabling FusedAttention as no backend supports the provided input\n[DEBUG    | DotProductAttention]: Available backends = {FlashAttention=False, FusedAttention=False, UnfusedDotProductAttention=False}\nDEBUG:DotProductAttention:Available backends = {FlashAttention=False, FusedAttention=False, UnfusedDotProductAttention=False}\n[DEBUG    | DotProductAttention]: Disabling FusedAttention as no backend supports the provided input\nDEBUG:DotProductAttention:Disabling FusedAttention as no backend supports the provided input\n[DEBUG    | DotProductAttention]: Disabling FlashAttention 2 due to NVTE_FLASH_ATTN=0\nDEBUG:DotProductAttention:Disabling FlashAttention 2 due to 
NVTE_FLASH_ATTN=0\n[DEBUG    | DotProductAttention]: Disabling UnfusedDotProductAttention due to NVTE_UNFUSED_ATTN=0\nDEBUG:DotProductAttention:Disabling UnfusedDotProductAttention due to NVTE_UNFUSED_ATTN=0\n[DEBUG    | DotProductAttention]: Disabling FlashAttention for softmax_type = learnable\n[DEBUG    | DotProductAttention]: Available backends = {FlashAttention=False, FusedAttention=False, UnfusedDotProductAttention=False}\nDEBUG:DotProductAttention:Available backends = {FlashAttention=False, FusedAttention=False, UnfusedDotProductAttention=False}\n[DEBUG    | DotProductAttention]: Disabling FlashAttention 2 due to NVTE_FLASH_ATTN=0\nDEBUG:DotProductAttention:Disabling FlashAttention 2 due to NVTE_FLASH_ATTN=0\n[DEBUG    | DotProductAttention]: Selected backend = NoBackend\nDEBUG:DotProductAttention:Selected backend = NoBackend\n[DEBUG    | DotProductAttention]: Disabling UnfusedDotProductAttention due to NVTE_UNFUSED_ATTN=0\nDEBUG:DotProductAttention:Disabling UnfusedDotProductAttention due to NVTE_UNFUSED_ATTN=0\n<\/code><\/pre>\n<\/div>\n<p data-line=\"478\" class=\"code-line\">\u30ed\u30b0\u306b\u8a18\u8f09\u3055\u308c\u3066\u3044\u308b\u3088\u3046\u306b\u3001FlashAttention, FusedAttention, UnFusedAttention\u306e\u3069\u308c\u3082\u5229\u7528\u3059\u308b\u3053\u3068\u304c\u51fa\u6765\u306a\u304b\u3063\u305f\u306e\u3067\u3001\u30a8\u30e9\u30fc\u304c\u767a\u751f\u3057\u3066\u3044\u307e\u3059\u3002<\/p>\n<h3 id=\"transformerengine%E3%81%AEupdate\" data-line=\"480\" class=\"code-line\">\n TransformerEngine\u306eupdate<\/h3>\n<p data-line=\"482\" class=\"code-line\">GPT-OSS\u306e<strong>learnable softmax<\/strong>\u306b\u5bfe\u5fdc\u3059\u308b\u5b9f\u88c5\u3092\u81ea\u524d\u3067\u5b9f\u88c5\u3057\u3088\u3046\u3068\u3057\u307e\u3057\u305f\u304c\u3001\u3053\u306e\u7a0b\u5ea6\u306e\u5b9f\u88c5\u3067\u3042\u308c\u3070NVIDIA\u306eTransformerEngine 
team\u304c\u5b9f\u88c5\u3057\u3066\u3044\u306a\u3044\u306f\u305a\u306f\u306a\u3044\u3068\u601d\u3044\u76f4\u3057\u3001\u8abf\u67fb\u3092\u59cb\u3081\u307e\u3057\u305f\u3002\u3059\u308b\u3068\u4ee5\u4e0b\u306ePull Request\u3092\u767a\u898b\u3057\u307e\u3057\u305f\u3002<\/p>\n<p data-line=\"484\" class=\"code-line\"><span class=\"embed-block zenn-embedded zenn-embedded-card\"><iframe id=\"zenn-embedded__c1186f8e824ba\" src=\"https:\/\/embed.zenn.studio\/card#zenn-embedded__c1186f8e824ba\" data-content=\"https%3A%2F%2Fgithub.com%2FNVIDIA%2FTransformerEngine%2Fpull%2F2148\" frameborder=\"0\" scrolling=\"no\" loading=\"lazy\"><\/iframe><\/span><a target=\"_blank\" href=\"https:\/\/github.com\/NVIDIA\/TransformerEngine\/pull\/2148\" style=\"display:none\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">https:\/\/github.com\/NVIDIA\/TransformerEngine\/pull\/2148<\/a><\/p>\n<p data-line=\"486\" class=\"code-line\">\u5b9f\u88c5\u3092\u898b\u308b\u3068communication type\u304c<code>p2p<\/code>\u3067\u306f\u306a\u304f<code>a2a<\/code>(=all to all)\u306eContext Parallel\u5bfe\u5fdc\u306elearnable softmax\u5411\u3051\u306eFusedAttention\u304c\u3042\u308b\u3053\u3068\u304c\u5224\u660e\u3057\u307e\u3057\u305f\u3002<br \/>\u305d\u3053\u3067\u3001sandbox\u306b\u5165\u308a\u3001TransformerEngine\u306eversion\u3092update\u3059\u308b\u3053\u3068\u3067\u5bfe\u5fdc\u3057\u307e\u3057\u305f\u3002<\/p>\n<aside class=\"msg message\"><span class=\"msg-symbol\">!<\/span><\/p>\n<div class=\"msg-content\">\n<p data-line=\"491\" class=\"code-line\">TransformerEngine\u306eversion\u3092update\u3059\u308b\u3060\u3051\u3067\u5341\u5206\u306a\u306e\u3067\u3057\u3087\u3046\u304b\uff1f<\/p>\n<p data-line=\"493\" class=\"code-line\">\u3044\u3044\u3048\u3001<code>nemo\/collections\/llm\/gpt\/model\/gpt_oss.py<\/code>\u306eGPTOSSConfig\u306b\u4ee5\u4e0b\u3092\u8ffd\u52a0\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002<\/p>\n<div class=\"code-block-container\">\n<pre 
class=\"language-python\"><code class=\"language-python code-line\" data-line=\"495\">    attention_backend<span class=\"token punctuation\">:<\/span> AttnBackend <span class=\"token operator\">=<\/span> AttnBackend<span class=\"token punctuation\">.<\/span>fused\n    cp_comm_type<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">str<\/span> <span class=\"token operator\">=<\/span> <span class=\"token string\">\"a2a\"<\/span>\n<\/code><\/pre>\n<\/div>\n<p data-line=\"500\" class=\"code-line\">\u3053\u308c\u306f\u3001Attention Backend\u304c\u81ea\u52d5\u3067\u9078\u3070\u308c\u308b\u306e\u3067\u306f\u306a\u304f\u3001learnable softmax\u306b\u5bfe\u5fdc\u3057\u3066\u3044\u308bFusedAttention\u3092\u5229\u7528\u3059\u308b\u3088\u3046\u306b\u5f37\u5236\u3059\u308b\u305f\u3081\u306e\u63aa\u7f6e\u3068\u3001context parallel\u306ecommunication type\u306fdefault\u3067\u306fp2p\u306a\u306e\u3067\u3001\u3053\u308c\u3092a2a\u306b\u3059\u308b\u305f\u3081\u306e\u63aa\u7f6e\u3067\u3059\u3002<\/p>\n<p data-line=\"502\" class=\"code-line\">\u4e0a\u8a18\u3092\u5fd8\u308c\u308b\u3068\u4ee5\u4e0b\u306e\u3088\u3046\u306a\u30e1\u30c3\u30bb\u30fc\u30b8\u304c\u51fa\u307e\u3059\u3002<\/p>\n<div class=\"code-block-container\">\n<pre><code class=\"code-line\" data-line=\"504\">[DEBUG    | DotProductAttention]: Disabling FlashAttention for softmax_type = learnable\n[DEBUG    | DotProductAttention]: Disabling FusedAttention for context parallelism with softmax_type = learnable and cp_comm_type = p2p\nDEBUG:DotProductAttention:Disabling FusedAttention for context parallelism with softmax_type = learnable and cp_comm_type = p2p\n<\/code><\/pre>\n<\/div>\n<p data-line=\"510\" class=\"code-line\">\u3053\u306e\u30e1\u30c3\u30bb\u30fc\u30b8\u304c\u51fa\u529b\u3055\u308c\u3066\u3044\u308b\u306e\u306f\u4ee5\u4e0b\u306e\u7b87\u6240\u3067\u3059\u3002<br \/><a target=\"_blank\" 
href=\"https:\/\/github.com\/NVIDIA\/TransformerEngine\/blob\/e7227af98070ebfcdb08b7f0a99bb87abe7b8532\/transformer_engine\/pytorch\/attention\/dot_product_attention\/utils.py#L721-L729\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">https:\/\/github.com\/NVIDIA\/TransformerEngine\/blob\/e7227af98070ebfcdb08b7f0a99bb87abe7b8532\/transformer_engine\/pytorch\/attention\/dot_product_attention\/utils.py#L721-L729<\/a><\/p>\n<\/div>\n<\/aside>\n<h3 id=\"cudnn%E3%81%AEupdate\" data-line=\"516\" class=\"code-line\">\n cuDNN\u306eupdate<\/h3>\n<p data-line=\"518\" class=\"code-line\">\u3053\u308c\u3067\u5b8c\u4e86\u304b\u3068\u601d\u3044\u304d\u3084\u3001\u305d\u3046\u3067\u306f\u3042\u308a\u307e\u305b\u3093\u3002<br \/><strong>\u307e\u3060\u52d5\u304d\u307e\u305b\u3093\u3002<\/strong> \u4ee5\u4e0b\u306e\u3088\u3046\u306a\u30c7\u30d0\u30c3\u30b0\u30e1\u30c3\u30bb\u30fc\u30b8\u304c\u51fa\u307e\u3059\u3002<\/p>\n<div class=\"code-block-container\">\n<pre><code class=\"code-line\" data-line=\"521\">[DEBUG    | DotProductAttention]: Disabling FusedAttention as no backend supports the provided input\nDEBUG:DotProductAttention:Disabling FusedAttention as no backend supports the provided input\n[DEBUG    | DotProductAttention]: Available backends = {FlashAttention=False, FusedAttention=False, UnfusedDotProductAttention=False}\nDEBUG:DotProductAttention:Available backends = {FlashAttention=False, FusedAttention=False, UnfusedDotProductAttention=False}\n[DEBUG    | DotProductAttention]: Disabling FusedAttention as no backend supports the provided input\nDEBUG:DotProductAttention:Disabling FusedAttention as no backend supports the provided input\n<\/code><\/pre>\n<\/div>\n<p data-line=\"530\" class=\"code-line\"><code>Disabling FusedAttention as no backend supports the provided input<\/code>\u304c\u51fa\u308b\u7b87\u6240\u3092\u63a2\u3059\u3068\u4ee5\u4e0b\u306e\u5b9f\u88c5\u304c\u898b\u3064\u304b\u308a\u307e\u3059\u3002<\/p>\n<p data-line=\"532\" class=\"code-line\"><span 
class=\"embed-block zenn-embedded zenn-embedded-github\"><iframe id=\"zenn-embedded__9666ef2235239\" src=\"https:\/\/embed.zenn.studio\/github#zenn-embedded__9666ef2235239\" data-content=\"https%3A%2F%2Fgithub.com%2FNVIDIA%2FTransformerEngine%2Fblob%2Fe7227af98070ebfcdb08b7f0a99bb87abe7b8532%2Ftransformer_engine%2Fcommon%2Ffused_attn%2Ffused_attn.cpp%23L373-L376\" frameborder=\"0\" scrolling=\"no\" loading=\"lazy\"><\/iframe><\/span><a target=\"_blank\" href=\"https:\/\/github.com\/NVIDIA\/TransformerEngine\/blob\/e7227af98070ebfcdb08b7f0a99bb87abe7b8532\/transformer_engine\/common\/fused_attn\/fused_attn.cpp#L373-L376\" style=\"display:none\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">https:\/\/github.com\/NVIDIA\/TransformerEngine\/blob\/e7227af98070ebfcdb08b7f0a99bb87abe7b8532\/transformer_engine\/common\/fused_attn\/fused_attn.cpp#L373-L376<\/a><\/p>\n<p data-line=\"534\" class=\"code-line\">\u3064\u307e\u308a\u3001cuDNN\u306eversion\u304c91301(=9.13.1)\u672a\u6e80\u3067\u3042\u308b\u305f\u3081\u3001FusedAttention\u304c\u5229\u7528\u3067\u304d\u3066\u3044\u306a\u3044\u3068\u3044\u3046\u3053\u3068\u3067\u3059\u3002\u3082\u3046\u4e00\u5ea6\u5148\u7a0b\u306e<a target=\"_blank\" href=\"https:\/\/github.com\/NVIDIA\/TransformerEngine\/pull\/2148\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">PullRequest<\/a>\u3092\u898b\u308b\u3068Description\u306b\u4ee5\u4e0b\u306e\u3088\u3046\u306b\u3042\u308a\u307e\u3059\u3002<\/p>\n<blockquote data-line=\"536\" class=\"code-line\">\n<p data-line=\"536\" class=\"code-line\">FusedAttention backend for FP16\/BF16 and BSHD\/SBHD: cuDNN 9.13.1+ and cudnn-frontend 1.14.1<\/p>\n<\/blockquote>\n<p data-line=\"538\" class=\"code-line\">\u305d\u306e\u3046\u3048\u3067\u3001cuDNN\u306e<a target=\"_blank\" href=\"https:\/\/developer.nvidia.com\/cudnn-archive\" target=\"_blank\" rel=\"nofollow noopener 
noreferrer\">\u30ea\u30ea\u30fc\u30b9\u30ea\u30b9\u30c8<\/a>\u3092\u78ba\u8a8d\u3059\u308b\u30689.13.1\u306f\u3064\u3044\u6700\u8fd1\u51fa\u305f\u3053\u3068\u304c\u5224\u660e\u3057\u307e\u3057\u305f\u3002<\/p>\n<p data-line=\"540\" class=\"code-line\"><img decoding=\"async\" src=\"https:\/\/storage.googleapis.com\/zenn-user-upload\/a8e9a9f9f16b-20251104.png\" alt=\"\" class=\"md-img\" loading=\"lazy\"\/><\/p>\n<p data-line=\"542\" class=\"code-line\">Singularity\u5185\u90e8\u306ecuDNN version\u3092\u78ba\u8a8d\u3059\u308b\u3068\u4ee5\u4e0b\u306e\u3088\u3046\u306b9.13.0\u3067\u3042\u308b\u3053\u3068\u304c\u5206\u304b\u308a\u307e\u3059\u3002<\/p>\n<div class=\"code-block-container\">\n<pre class=\"language-bash\"><code class=\"language-bash code-line\" data-line=\"543\">Singularity<span class=\"token operator\">&gt;<\/span> <span class=\"token function\">ls<\/span> \/usr\/local\/cudnn\/lib64\/\nlibcudnn.so\t\tlibcudnn_adv_static.a\t  libcudnn_cnn_static_v9.a\t\t    libcudnn_engines_runtime_compiled.so\t   libcudnn_graph.so.9\t       libcudnn_heuristic.so.9.13.0    libcudnn_ops_static.a\nlibcudnn.so.9\t\tlibcudnn_adv_static_v9.a  libcudnn_engines_precompiled.so\t    libcudnn_engines_runtime_compiled.so.9\t   libcudnn_graph.so.9.13.0    libcudnn_heuristic_static.a     libcudnn_ops_static_v9.a\nlibcudnn.so.9.13.0\tlibcudnn_cnn.so\t\t  libcudnn_engines_precompiled.so.9\t    libcudnn_engines_runtime_compiled.so.9.13.0    libcudnn_graph_static.a     libcudnn_heuristic_static_v9.a\nlibcudnn_adv.so\t\tlibcudnn_cnn.so.9\t  libcudnn_engines_precompiled.so.9.13.0    libcudnn_engines_runtime_compiled_static.a\t   libcudnn_graph_static_v9.a  libcudnn_ops.so\nlibcudnn_adv.so.9\tlibcudnn_cnn.so.9.13.0\t  libcudnn_engines_precompiled_static.a     libcudnn_engines_runtime_compiled_static_v9.a  libcudnn_heuristic.so       libcudnn_ops.so.9\nlibcudnn_adv.so.9.13.0\tlibcudnn_cnn_static.a\t  libcudnn_engines_precompiled_static_v9.a  libcudnn_graph.so\t\t\t\t   libcudnn_heuristic.so.9     
libcudnn_ops.so.9.13.0\n<\/code><\/pre>\n<\/div>\n<p data-line=\"553\" class=\"code-line\">\u3064\u307e\u308a\u30019.13.0 &lt; 9.13.1 \u3067\u3042\u308b\u305f\u3081\u3001FusedAttention\u306e\u8981\u4ef6\u3092\u6e80\u305f\u3057\u3066\u3044\u307e\u305b\u3093\u3002<\/p>\n<h4 id=\"%E8%A7%A3%E6%B1%BA%E7%AD%96\" data-line=\"555\" class=\"code-line\">\n \u89e3\u6c7a\u7b56<\/h4>\n<p data-line=\"557\" class=\"code-line\">\u89e3\u6c7a\u7b56\u306b\u306f2\u3064\u306e\u65b9\u6cd5\u304c\u3042\u308a\u307e\u3059\u3002\u30b3\u30f3\u30c6\u30ca\u5185\u90e8\u306ecuDNN\u3092\u3069\u3046\u306b\u304b\u3057\u3066update\u3059\u308b\u65b9\u6cd5\u3001\u3082\u30461\u3064\u306fcuDNN 9.13.1\u4ee5\u964d\u304c\u3059\u3067\u306b\u5165\u3063\u3066\u3044\u308b\u74b0\u5883\u3067\u30b3\u30f3\u30c6\u30ca\u3092\u4f5c\u308a\u76f4\u3059\u3053\u3068\u3067\u3059\u3002<br \/>NeMo\u306e\u4f9d\u5b58\u95a2\u4fc2\u306f\u8907\u96d1\u3067\u3042\u308b\u305f\u3081cuDNN 9.13.1\u4ee5\u4e0a\u304c\u5165\u3063\u3066\u3044\u308bNGC PyTorch\u306e\u4e0a\u304b\u3089NeMo\u306e\u4f9d\u5b58\u95a2\u4fc2\u3092\u4f5c\u308b\u3053\u3068\u306f\u6642\u9593\u3092\u8981\u3059\u308b\u3053\u3068\u304c\u4e88\u60f3\u3055\u308c\u307e\u3059\u3002<br \/>\u305d\u3053\u3067\u3001\u30b3\u30f3\u30c6\u30ca\u5185\u306ecuDNN\u306eversion\u3092\u4e0a\u3052\u308b\u9078\u629e\u3092\u691c\u8a0e\u3059\u308b\u3053\u3068\u306b\u3057\u307e\u3057\u305f\u3002<\/p>\n<p data-line=\"561\" class=\"code-line\">\u3055\u3066\u3001cuDNN\u30929.13.0\u304b\u30899.14.0(\u57f7\u7b46\u6642\u306e\u6700\u65b0)\u306b\u7f6e\u304d\u63db\u3048\u305f\u5834\u5408\u3001cuDNN\u306b\u4f9d\u5b58\u3057\u3066\u3044\u308bPyTorch\u3001TransformerEngine\u306e\u518dbuild\u306f\u5fc5\u8981\u3067\u3057\u3087\u3046\u304b\uff1f\u4eee\u306b\u5fc5\u8981\u306a\u5834\u5408\u306f\u3001\u5b9f\u8cea\u518d\u5ea6\u3001\u30b3\u30f3\u30c6\u30ca\u3092\u4f5c\u308a\u76f4\u3059\u5fc5\u8981\u304c\u3042\u308b\u306e\u3067\u3001NGC PyTorch\u304b\u3089\u4f5c\u308a\u76f4\u3059\u65b9\u304c\u5b89\u4e0a\u304c\u308a\u3067\u3057\u3087\u3046\u3002<\/p>\n<p data-line=\"563\" 
class=\"code-line\">\u7b54\u3048\u306f\u3001<strong>\u5426<\/strong>\u3067\u3059\u3002<br \/>\u8a73\u7d30\u306f\u4ee5\u4e0b\u306eBlog\u306b\u3066\u66f8\u3044\u3066\u3044\u307e\u3059\u304c\u3001PyTorch, TransformerEngine\u306fcuDNN\u3092\u5171\u6709\u30e9\u30a4\u30d6\u30e9\u30ea\u3067\u5229\u7528\u3057\u3066\u3044\u308b\u305f\u3081\u3001\u30b3\u30f3\u30c6\u30ca\u5185\u306e\u65e2\u5b9a\u30d1\u30b9\u306b\u65b0\u3057\u3044cuDNN\u3092bind\u3059\u308b\u3068\u30e9\u30f3\u30bf\u30a4\u30e0\u304c\u65b0\u3057\u3044cuDNN\u3092\u53c2\u7167\u3059\u308b\u3053\u3068\u306b\u306a\u308b\u306e\u3067\u3001\u518d\u5ea6build\u3059\u308b\u3053\u3068\u304c\u4e0d\u8981\u3068\u306a\u3063\u3066\u3044\u307e\u3059\u3002<\/p>\n<p data-line=\"566\" class=\"code-line\"><span class=\"embed-block zenn-embedded zenn-embedded-card\"><iframe id=\"zenn-embedded__fc230f56bb48c\" src=\"https:\/\/embed.zenn.studio\/card#zenn-embedded__fc230f56bb48c\" data-content=\"https%3A%2F%2Fzenn.dev%2Fturing_motors%2Farticles%2F3a434d046bbf48\" frameborder=\"0\" scrolling=\"no\" loading=\"lazy\"><\/iframe><\/span><a target=\"_blank\" href=\"https:\/\/zenn.dev\/turing_motors\/articles\/3a434d046bbf48\" style=\"display:none\" target=\"_blank\">https:\/\/zenn.dev\/turing_motors\/articles\/3a434d046bbf48<\/a><\/p>\n<p data-line=\"569\" class=\"code-line\">\u305d\u306e\u305f\u3081\u4ee5\u4e0b\u306e\u3088\u3046\u306b\u3059\u308b\u3053\u3068\u3067\u5bfe\u51e6\u3067\u304d\u307e\u3059\u3002<\/p>\n<div class=\"code-block-container\">\n<pre class=\"language-bash\"><code class=\"language-bash code-line\" data-line=\"571\"><span class=\"token assign-left variable\">CUDNN_ROOT<\/span><span class=\"token operator\">=<\/span><span class=\"token string\">\"\/path\/to\/cudnn\/cudnn-linux-x86_64-9.14.0.64_cuda12-archive\"<\/span>\n\nsingularity <span class=\"token builtin class-name\">exec<\/span> <span class=\"token parameter variable\">--nv<\/span> <span class=\"token punctuation\">..<\/span><span class=\"token 
punctuation\">..<\/span> <span class=\"token punctuation\">\\<\/span>\n  <span class=\"token parameter variable\">--bind<\/span> <span class=\"token variable\">${CUDNN_ROOT}<\/span>\/lib:\/usr\/local\/cudnn\/lib64:ro <span class=\"token punctuation\">\\<\/span>\n  <span class=\"token parameter variable\">--bind<\/span> <span class=\"token variable\">${CUDNN_ROOT}<\/span>\/include:\/usr\/local\/cudnn\/include:ro <span class=\"token punctuation\">\\<\/span>\n  \/groups\/gch51639\/fujii\/container\/25.07.gpt_oss.fix.sif <span class=\"token punctuation\">\\<\/span>\n<\/code><\/pre>\n<\/div>\n<p data-line=\"580\" class=\"code-line\">\u4ee5\u4e0a\u306e\u5909\u66f4\u306b\u3088\u308a\u3001context length 32k, context parallel size = 4\u306b\u3066GPT-OSS\u306e\u7d99\u7d9a\u4e8b\u524d\u5b66\u7fd2(continual pre-training)\u3092\u884c\u3046\u3053\u3068\u304c\u51fa\u6765\u307e\u3057\u305f\u3002<br \/><img decoding=\"async\" src=\"https:\/\/storage.googleapis.com\/zenn-user-upload\/58f53e10c571-20251104.png\" alt=\"\" class=\"md-img\" loading=\"lazy\"\/><br \/>(\u5b66\u7fd2\u6642\u306eTraining Loss)<\/p>\n<aside class=\"msg message\"><span class=\"msg-symbol\">!<\/span><\/p>\n<div class=\"msg-content\">\n<p data-line=\"586\" class=\"code-line\">\u5909\u66f4\u3092\u884c\u3063\u305fsingularity sandbox\u3092<code>.sif<\/code>\u306b\u5909\u63db\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u304b\uff1f<\/p>\n<p data-line=\"588\" class=\"code-line\">\u306f\u3044\u3001\u305d\u306e\u901a\u308a\u3067\u3059\u3002\u4ee5\u4e0b\u306e\u30b3\u30de\u30f3\u30c9\u3067\u5909\u66f4\u3092\u884c\u3063\u305fsandbox<code>25.07.gpt_oss<\/code>\u3092<code>.sif<\/code>\u306b\u66f8\u304d\u51fa\u3057\u307e\u3057\u3087\u3046\u3002<\/p>\n<div class=\"code-block-container\">\n<pre class=\"language-bash\"><code class=\"language-bash code-line\" data-line=\"589\">singularity build <span class=\"token number\">25.07<\/span>.gpt_oss.sif <span class=\"token 
number\">25.07<\/span>.gpt_oss\n<\/code><\/pre>\n<\/div>\n<\/div>\n<\/aside>\n<h2 id=\"%E3%81%8A%E3%82%8F%E3%82%8A%E3%81%AB\" data-line=\"595\" class=\"code-line\">\n \u304a\u308f\u308a\u306b<\/h2>\n<p data-line=\"597\" class=\"code-line\">\u672c\u8a18\u4e8b\u3067\u306f\u3001gpt-oss\u3092NeMo\u3092\u5229\u7528\u3057\u3066\u5b66\u7fd2\u3059\u308b\u305f\u3081\u306e\u65b9\u6cd5\u306b\u3064\u3044\u3066\u89e3\u8aac\u3092\u884c\u3044\u307e\u3057\u305f\u3002<br \/>LLM\u306e\u5b66\u7fd2\u3068\u805e\u304f\u3068\u4e00\u898b\u83ef\u3084\u304b\u306b\u601d\u3048\u308b\u304b\u3082\u3057\u308c\u307e\u305b\u3093\u304c\u3001\u5b9f\u614b\u306f\u3053\u306e\u3088\u3046\u306a<strong>Software Engineering\u306e\u584a<\/strong>\u3067\u3042\u308a\u3001\u8ad6\u6587\u3067\u66f8\u304b\u308c\u3066\u3044\u308b\u3088\u3046\u306a\u3053\u3068\u3088\u308a\u3082\u5b9f\u88c5\u9762\u3067\u82e6\u52b4\u3057\u3066\u3044\u308b\u3053\u3068\u306e\u65b9\u304c\u591a\u3044\u306e\u304c\u5b9f\u614b\u3067\u3059\u3002<\/p>\n<p data-line=\"600\" class=\"code-line\"><a target=\"_blank\" href=\"https:\/\/huggingface.co\/spaces\/HuggingFaceTB\/smol-training-playbook#introduction\" target=\"_blank\" rel=\"nofollow noopener noreferrer\">huggingface smol-training-playbook<\/a>\u306b\u3082\u6b21\u306e\u3088\u3046\u306a\u6587\u8a00\u304c\u3042\u308b\u3088\u3046\u306b\u3001\u8ad6\u6587\u306b\u306f\u8f09\u3089\u306a\u3044\u591a\u6570\u306e\u8a66\u884c\u932f\u8aa4\u304cLLM\u3001VLM\u958b\u767a\u306e\u88cf\u3067\u306f\u884c\u308f\u308c\u3066\u3044\u307e\u3059\u3002<\/p>\n<blockquote data-line=\"601\" class=\"code-line\">\n<p data-line=\"601\" class=\"code-line\">The reality is messier, more iterative, and full of decisions that don\u2019t make it into the final paper.<\/p>\n<\/blockquote>\n<\/div>\n\n<br \/><a href=\"https:\/\/zenn.dev\/turing_motors\/articles\/81cf3128b22c63\">\u5143\u306e\u8a18\u4e8b\u3092\u78ba\u8a8d\u3059\u308b <\/a><\/p>\n","protected":false},"excerpt":{"rendered":"\u306f\u3058\u3081\u306b Turing 
CTO\u5ba4\u306b\u6240\u5c5e\u3057\u3066\u3044\u308b\u6771\u4eac\u79d1\u5b66\u5927\u5b66(Institute of Science Tokyo)\u306e\u85e4\u4e95\u3067\u3059\u3002\u672c\u8a18\u4e8b\u3067\u306f\u3001OpenAI\u304b\u30892025\u5e748\u6708\u306b\u30ea\u30ea\u30fc\u30b9\u3055\u308c\u305fgpt-oss\u3092NVIDIA NeM [&hellip;]","protected":false},"author":1,"featured_media":24368,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"_acf_changed":false,"footnotes":""},"categories":[4],"tags":[],"class_list":["post-24367","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-company-tec"],"acf":[],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v27.6 - https:\/\/yoast.com\/product\/yoast-seo-wordpress\/ -->\n<title>NVIDIA NeMo\u3092\u5229\u7528\u3057\u305fGPT-OSS\u306e\u5b66\u7fd2 - \u30dd\u30b1\u30b3\u30f3<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/zenn.dev\/turing_motors\/articles\/81cf3128b22c63\" \/>\n<meta property=\"og:locale\" content=\"ja_JP\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"NVIDIA NeMo\u3092\u5229\u7528\u3057\u305fGPT-OSS\u306e\u5b66\u7fd2 - \u30dd\u30b1\u30b3\u30f3\" \/>\n<meta property=\"og:description\" content=\"\u306f\u3058\u3081\u306b Turing CTO\u5ba4\u306b\u6240\u5c5e\u3057\u3066\u3044\u308b\u6771\u4eac\u79d1\u5b66\u5927\u5b66(Institute of Science Tokyo)\u306e\u85e4\u4e95\u3067\u3059\u3002\u672c\u8a18\u4e8b\u3067\u306f\u3001OpenAI\u304b\u30892025\u5e748\u6708\u306b\u30ea\u30ea\u30fc\u30b9\u3055\u308c\u305fgpt-oss\u3092NVIDIA NeM [&hellip;]\" \/>\n<meta property=\"og:url\" content=\"https:\/\/zenn.dev\/turing_motors\/articles\/81cf3128b22c63\" \/>\n<meta property=\"og:site_name\" content=\"\u30dd\u30b1\u30b3\u30f3\" \/>\n<meta property=\"article:published_time\" 
content=\"2025-11-19T19:44:46+00:00\" \/>\n<meta property=\"og:image\" content=\"https:\/\/pokecon.jp\/job\/wp-content\/uploads\/2025\/11\/1763581485_og-base-w1200-v2.png\" \/>\n\t<meta property=\"og:image:width\" content=\"1200\" \/>\n\t<meta property=\"og:image:height\" content=\"630\" \/>\n\t<meta property=\"og:image:type\" content=\"image\/png\" \/>\n<meta name=\"author\" content=\"info@pokecon.jp\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"\u57f7\u7b46\u8005\" \/>\n\t<meta name=\"twitter:data1\" content=\"info@pokecon.jp\" \/>\n\t<meta name=\"twitter:label2\" content=\"\u63a8\u5b9a\u8aad\u307f\u53d6\u308a\u6642\u9593\" \/>\n\t<meta name=\"twitter:data2\" content=\"11\u5206\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\\\/\\\/schema.org\",\"@graph\":[{\"@type\":\"Article\",\"@id\":\"https:\\\/\\\/zenn.dev\\\/turing_motors\\\/articles\\\/81cf3128b22c63#article\",\"isPartOf\":{\"@id\":\"https:\\\/\\\/pokecon.jp\\\/job\\\/24367\\\/\"},\"author\":{\"name\":\"info@pokecon.jp\",\"@id\":\"https:\\\/\\\/pokecon.jp\\\/job\\\/#\\\/schema\\\/person\\\/16c9f07b1ba984d165d9aee259bda997\"},\"headline\":\"NVIDIA NeMo\u3092\u5229\u7528\u3057\u305fGPT-OSS\u306e\u5b66\u7fd2\",\"datePublished\":\"2025-11-19T19:44:46+00:00\",\"mainEntityOfPage\":{\"@id\":\"https:\\\/\\\/pokecon.jp\\\/job\\\/24367\\\/\"},\"wordCount\":521,\"image\":{\"@id\":\"https:\\\/\\\/zenn.dev\\\/turing_motors\\\/articles\\\/81cf3128b22c63#primaryimage\"},\"thumbnailUrl\":\"https:\\\/\\\/pokecon.jp\\\/job\\\/wp-content\\\/uploads\\\/2025\\\/11\\\/1763581485_og-base-w1200-v2.png\",\"articleSection\":[\"\u4f01\u696d\u30c6\u30c3\u30af\"],\"inLanguage\":\"ja\"},{\"@type\":\"WebPage\",\"@id\":\"https:\\\/\\\/pokecon.jp\\\/job\\\/24367\\\/\",\"url\":\"https:\\\/\\\/zenn.dev\\\/turing_motors\\\/articles\\\/81cf3128b22c63\",\"name\":\"NVIDIA NeMo\u3092\u5229\u7528\u3057\u305fGPT-OSS\u306e\u5b66\u7fd2 
- \u30dd\u30b1\u30b3\u30f3\",\"isPartOf\":{\"@id\":\"https:\\\/\\\/pokecon.jp\\\/job\\\/#website\"},\"primaryImageOfPage\":{\"@id\":\"https:\\\/\\\/zenn.dev\\\/turing_motors\\\/articles\\\/81cf3128b22c63#primaryimage\"},\"image\":{\"@id\":\"https:\\\/\\\/zenn.dev\\\/turing_motors\\\/articles\\\/81cf3128b22c63#primaryimage\"},\"thumbnailUrl\":\"https:\\\/\\\/pokecon.jp\\\/job\\\/wp-content\\\/uploads\\\/2025\\\/11\\\/1763581485_og-base-w1200-v2.png\",\"datePublished\":\"2025-11-19T19:44:46+00:00\",\"author\":{\"@id\":\"https:\\\/\\\/pokecon.jp\\\/job\\\/#\\\/schema\\\/person\\\/16c9f07b1ba984d165d9aee259bda997\"},\"breadcrumb\":{\"@id\":\"https:\\\/\\\/zenn.dev\\\/turing_motors\\\/articles\\\/81cf3128b22c63#breadcrumb\"},\"inLanguage\":\"ja\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\\\/\\\/zenn.dev\\\/turing_motors\\\/articles\\\/81cf3128b22c63\"]}]},{\"@type\":\"ImageObject\",\"inLanguage\":\"ja\",\"@id\":\"https:\\\/\\\/zenn.dev\\\/turing_motors\\\/articles\\\/81cf3128b22c63#primaryimage\",\"url\":\"https:\\\/\\\/pokecon.jp\\\/job\\\/wp-content\\\/uploads\\\/2025\\\/11\\\/1763581485_og-base-w1200-v2.png\",\"contentUrl\":\"https:\\\/\\\/pokecon.jp\\\/job\\\/wp-content\\\/uploads\\\/2025\\\/11\\\/1763581485_og-base-w1200-v2.png\",\"width\":1200,\"height\":630},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\\\/\\\/zenn.dev\\\/turing_motors\\\/articles\\\/81cf3128b22c63#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u30db\u30fc\u30e0\",\"item\":\"https:\\\/\\\/pokecon.jp\\\/job\\\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"NVIDIA 
NeMo\u3092\u5229\u7528\u3057\u305fGPT-OSS\u306e\u5b66\u7fd2\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\\\/\\\/pokecon.jp\\\/job\\\/#website\",\"url\":\"https:\\\/\\\/pokecon.jp\\\/job\\\/\",\"name\":\"\u30dd\u30b1\u30b3\u30f3\",\"description\":\"\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\\\/\\\/pokecon.jp\\\/job\\\/?s={search_term_string}\"},\"query-input\":{\"@type\":\"PropertyValueSpecification\",\"valueRequired\":true,\"valueName\":\"search_term_string\"}}],\"inLanguage\":\"ja\"},{\"@type\":\"Person\",\"@id\":\"https:\\\/\\\/pokecon.jp\\\/job\\\/#\\\/schema\\\/person\\\/16c9f07b1ba984d165d9aee259bda997\",\"name\":\"info@pokecon.jp\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"ja\",\"@id\":\"https:\\\/\\\/secure.gravatar.com\\\/avatar\\\/2b0549cd9f7907c092ca5fbb283baf72337f235726e4b46fa39ec0b701ac2fe2?s=96&d=wavatar&r=g\",\"url\":\"https:\\\/\\\/secure.gravatar.com\\\/avatar\\\/2b0549cd9f7907c092ca5fbb283baf72337f235726e4b46fa39ec0b701ac2fe2?s=96&d=wavatar&r=g\",\"contentUrl\":\"https:\\\/\\\/secure.gravatar.com\\\/avatar\\\/2b0549cd9f7907c092ca5fbb283baf72337f235726e4b46fa39ec0b701ac2fe2?s=96&d=wavatar&r=g\",\"caption\":\"info@pokecon.jp\"},\"url\":\"https:\\\/\\\/pokecon.jp\\\/job\\\/author\\\/infopokecon-jp\\\/\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. 
-->","yoast_head_json":{"title":"NVIDIA NeMo\u3092\u5229\u7528\u3057\u305fGPT-OSS\u306e\u5b66\u7fd2 - \u30dd\u30b1\u30b3\u30f3","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/zenn.dev\/turing_motors\/articles\/81cf3128b22c63","og_locale":"ja_JP","og_type":"article","og_title":"NVIDIA NeMo\u3092\u5229\u7528\u3057\u305fGPT-OSS\u306e\u5b66\u7fd2 - \u30dd\u30b1\u30b3\u30f3","og_description":"\u306f\u3058\u3081\u306b Turing CTO\u5ba4\u306b\u6240\u5c5e\u3057\u3066\u3044\u308b\u6771\u4eac\u79d1\u5b66\u5927\u5b66(Institute of Science Tokyo)\u306e\u85e4\u4e95\u3067\u3059\u3002\u672c\u8a18\u4e8b\u3067\u306f\u3001OpenAI\u304b\u30892025\u5e748\u6708\u306b\u30ea\u30ea\u30fc\u30b9\u3055\u308c\u305fgpt-oss\u3092NVIDIA NeM [&hellip;]","og_url":"https:\/\/zenn.dev\/turing_motors\/articles\/81cf3128b22c63","og_site_name":"\u30dd\u30b1\u30b3\u30f3","article_published_time":"2025-11-19T19:44:46+00:00","og_image":[{"width":1200,"height":630,"url":"https:\/\/pokecon.jp\/job\/wp-content\/uploads\/2025\/11\/1763581485_og-base-w1200-v2.png","type":"image\/png"}],"author":"info@pokecon.jp","twitter_card":"summary_large_image","twitter_misc":{"\u57f7\u7b46\u8005":"info@pokecon.jp","\u63a8\u5b9a\u8aad\u307f\u53d6\u308a\u6642\u9593":"11\u5206"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"Article","@id":"https:\/\/zenn.dev\/turing_motors\/articles\/81cf3128b22c63#article","isPartOf":{"@id":"https:\/\/pokecon.jp\/job\/24367\/"},"author":{"name":"info@pokecon.jp","@id":"https:\/\/pokecon.jp\/job\/#\/schema\/person\/16c9f07b1ba984d165d9aee259bda997"},"headline":"NVIDIA 
NeMo\u3092\u5229\u7528\u3057\u305fGPT-OSS\u306e\u5b66\u7fd2","datePublished":"2025-11-19T19:44:46+00:00","mainEntityOfPage":{"@id":"https:\/\/pokecon.jp\/job\/24367\/"},"wordCount":521,"image":{"@id":"https:\/\/zenn.dev\/turing_motors\/articles\/81cf3128b22c63#primaryimage"},"thumbnailUrl":"https:\/\/pokecon.jp\/job\/wp-content\/uploads\/2025\/11\/1763581485_og-base-w1200-v2.png","articleSection":["\u4f01\u696d\u30c6\u30c3\u30af"],"inLanguage":"ja"},{"@type":"WebPage","@id":"https:\/\/pokecon.jp\/job\/24367\/","url":"https:\/\/zenn.dev\/turing_motors\/articles\/81cf3128b22c63","name":"NVIDIA NeMo\u3092\u5229\u7528\u3057\u305fGPT-OSS\u306e\u5b66\u7fd2 - \u30dd\u30b1\u30b3\u30f3","isPartOf":{"@id":"https:\/\/pokecon.jp\/job\/#website"},"primaryImageOfPage":{"@id":"https:\/\/zenn.dev\/turing_motors\/articles\/81cf3128b22c63#primaryimage"},"image":{"@id":"https:\/\/zenn.dev\/turing_motors\/articles\/81cf3128b22c63#primaryimage"},"thumbnailUrl":"https:\/\/pokecon.jp\/job\/wp-content\/uploads\/2025\/11\/1763581485_og-base-w1200-v2.png","datePublished":"2025-11-19T19:44:46+00:00","author":{"@id":"https:\/\/pokecon.jp\/job\/#\/schema\/person\/16c9f07b1ba984d165d9aee259bda997"},"breadcrumb":{"@id":"https:\/\/zenn.dev\/turing_motors\/articles\/81cf3128b22c63#breadcrumb"},"inLanguage":"ja","potentialAction":[{"@type":"ReadAction","target":["https:\/\/zenn.dev\/turing_motors\/articles\/81cf3128b22c63"]}]},{"@type":"ImageObject","inLanguage":"ja","@id":"https:\/\/zenn.dev\/turing_motors\/articles\/81cf3128b22c63#primaryimage","url":"https:\/\/pokecon.jp\/job\/wp-content\/uploads\/2025\/11\/1763581485_og-base-w1200-v2.png","contentUrl":"https:\/\/pokecon.jp\/job\/wp-content\/uploads\/2025\/11\/1763581485_og-base-w1200-v2.png","width":1200,"height":630},{"@type":"BreadcrumbList","@id":"https:\/\/zenn.dev\/turing_motors\/articles\/81cf3128b22c63#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u30db\u30fc\u30e0","item":"https:\/\/pokecon.jp\/job\/"},{"@type"
:"ListItem","position":2,"name":"NVIDIA NeMo\u3092\u5229\u7528\u3057\u305fGPT-OSS\u306e\u5b66\u7fd2"}]},{"@type":"WebSite","@id":"https:\/\/pokecon.jp\/job\/#website","url":"https:\/\/pokecon.jp\/job\/","name":"\u30dd\u30b1\u30b3\u30f3","description":"","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/pokecon.jp\/job\/?s={search_term_string}"},"query-input":{"@type":"PropertyValueSpecification","valueRequired":true,"valueName":"search_term_string"}}],"inLanguage":"ja"},{"@type":"Person","@id":"https:\/\/pokecon.jp\/job\/#\/schema\/person\/16c9f07b1ba984d165d9aee259bda997","name":"info@pokecon.jp","image":{"@type":"ImageObject","inLanguage":"ja","@id":"https:\/\/secure.gravatar.com\/avatar\/2b0549cd9f7907c092ca5fbb283baf72337f235726e4b46fa39ec0b701ac2fe2?s=96&d=wavatar&r=g","url":"https:\/\/secure.gravatar.com\/avatar\/2b0549cd9f7907c092ca5fbb283baf72337f235726e4b46fa39ec0b701ac2fe2?s=96&d=wavatar&r=g","contentUrl":"https:\/\/secure.gravatar.com\/avatar\/2b0549cd9f7907c092ca5fbb283baf72337f235726e4b46fa39ec0b701ac2fe2?s=96&d=wavatar&r=g","caption":"info@pokecon.jp"},"url":"https:\/\/pokecon.jp\/job\/author\/infopokecon-jp\/"}]}},"_links":{"self":[{"href":"https:\/\/pokecon.jp\/job\/wp-json\/wp\/v2\/posts\/24367","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/pokecon.jp\/job\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/pokecon.jp\/job\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/pokecon.jp\/job\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/pokecon.jp\/job\/wp-json\/wp\/v2\/comments?post=24367"}],"version-history":[{"count":1,"href":"https:\/\/pokecon.jp\/job\/wp-json\/wp\/v2\/posts\/24367\/revisions"}],"predecessor-version":[{"id":24369,"href":"https:\/\/pokecon.jp\/job\/wp-json\/wp\/v2\/posts\/24367\/revisions\/24369"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/pokecon.jp\/job\/wp-json\/wp\/v2\/media\/24368"
}],"wp:attachment":[{"href":"https:\/\/pokecon.jp\/job\/wp-json\/wp\/v2\/media?parent=24367"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/pokecon.jp\/job\/wp-json\/wp\/v2\/categories?post=24367"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/pokecon.jp\/job\/wp-json\/wp\/v2\/tags?post=24367"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}