ROCm · richagadgil · Oct 10, 2024 · Oct 10, 2024 · Oct 10, 2024 · Oct 11, 2024
@@ -89,7 +89,6 @@ add_library(migraphx
     propagate_constant.cpp
     promote_literals.cpp
     quantization.cpp
-    quantize_fp16.cpp
     quantize_int4.cpp
     quantize_8bits.cpp
     reduce_dims.cpp
@@ -115,6 +114,7 @@ add_library(migraphx
     split_single_dyn_dim.cpp
     target.cpp
     tmp_dir.cpp
+    truncate_float.cpp
     value.cpp
     verify_args.cpp
 )

@@ -234,6 +234,16 @@ void quantize_fp16_with_op_names(program& prog, std::vector<std::string>& names)
     migraphx::quantize_fp16(prog, names);
 }
 
+void quantize_bf16_with_op_names(program& prog, std::vector<std::string>& names)
+{
+    if(names.empty())
+    {
+        names = {"all"};
+    }
+
+    migraphx::quantize_bf16(prog, names);
+}
+
 struct quantize_int8_options
 {
     std::vector<parameter_map> calibration   = {};
@@ -2199,6 +2209,29 @@ extern "C" migraphx_status migraphx_quantize_fp16(migraphx_program_t prog)
     return api_error_result;
 }
 
+extern "C" migraphx_status migraphx_quantize_bf16_with_op_names(migraphx_program_t prog,
+                                                                migraphx_quantize_op_names_t name)
+{
+    auto api_error_result = migraphx::try_([&] {
+        if(prog == nullptr)
+            MIGRAPHX_THROW(migraphx_status_bad_param, "Bad parameter prog: Null pointer");
+        if(name == nullptr)
+            MIGRAPHX_THROW(migraphx_status_bad_param, "Bad parameter name: Null pointer");
+        migraphx::quantize_bf16_with_op_names((prog->object), (name->object));
+    });
+    return api_error_result;
+}
+
+extern "C" migraphx_status migraphx_quantize_bf16(migraphx_program_t prog)
+{
+    auto api_error_result = migraphx::try_([&] {
+        if(prog == nullptr)
+            MIGRAPHX_THROW(migraphx_status_bad_param, "Bad parameter prog: Null pointer");
+        migraphx::quantize_bf16((prog->object));
+    });
+    return api_error_result;
+}
+
 extern "C" migraphx_status
 migraphx_quantize_int8_options_destroy(migraphx_quantize_int8_options_t quantize_int8_options)
 {

@@ -35,6 +35,7 @@
 #define MIGRAPHX_SHAPE_VISIT_TYPES(m) \
     m(bool_type, bool) \
     m(half_type, half) \
+    m(bf16_type, bf16) \
     m(float_type, float) \
     m(double_type, double) \
     m(uint8_type, uint8_t) \
@@ -602,6 +603,11 @@ migraphx_quantize_fp16_with_op_names(migraphx_program_t prog, migraphx_quantize_
 
 MIGRAPHX_C_EXPORT migraphx_status migraphx_quantize_fp16(migraphx_program_t prog);
 
+MIGRAPHX_C_EXPORT migraphx_status
+migraphx_quantize_bf16_with_op_names(migraphx_program_t prog, migraphx_quantize_op_names_t name);
+
+MIGRAPHX_C_EXPORT migraphx_status migraphx_quantize_bf16(migraphx_program_t prog);
+
 MIGRAPHX_C_EXPORT migraphx_status
 migraphx_quantize_int8_options_destroy(migraphx_quantize_int8_options_t quantize_int8_options);
 

@@ -1484,6 +1484,18 @@ inline void quantize_fp16(const program& prog)
     call(&migraphx_quantize_fp16, prog.get_handle_ptr());
 }
 
+/// Quantize program to use bf16
+inline void quantize_bf16(const program& prog, const quantize_op_names& names)
+{
+    call(&migraphx_quantize_bf16_with_op_names, prog.get_handle_ptr(), names.get_handle_ptr());
+}
+
+/// Quantize program to use bf16
+inline void quantize_bf16(const program& prog)
+{
+    call(&migraphx_quantize_bf16, prog.get_handle_ptr());
+}
+
 /// Options to be passed when quantizing for int8
 struct quantize_int8_options : MIGRAPHX_HANDLE_BASE(quantize_int8_options)
 {

@@ -443,6 +443,14 @@ def quantize_op_names(h):
 api.add_function('migraphx_quantize_fp16',
                  api.params(prog='migraphx::program&'),
                  fname='migraphx::quantize_fp16')
+api.add_function('migraphx_quantize_bf16_with_op_names',
+                 api.params(prog='migraphx::program&',
+                            name='std::vector<std::string>&'),
+                 fname='migraphx::quantize_bf16_with_op_names')
+
+api.add_function('migraphx_quantize_bf16',
+                 api.params(prog='migraphx::program&'),
+                 fname='migraphx::quantize_bf16')
 
 
 @auto_handle()

@@ -482,6 +482,7 @@ struct compiler
     compiler_target ct;
     compile_options co;
     bool to_fp16 = false;
+    bool to_bf16 = false;
     bool to_fp8  = false;
     bool to_int8 = false;
     bool to_int4 = false;
@@ -506,6 +507,7 @@ struct compiler
            ap.help("Exhastively search for best tuning parameters for kernels"),
            ap.set_value(true));
         ap(to_fp16, {"--fp16"}, ap.help("Quantize for fp16"), ap.set_value(true));
+        ap(to_bf16, {"--bf16"}, ap.help("Quantize for bf16"), ap.set_value(true));
         ap(to_int8, {"--int8"}, ap.help("Quantize for int8"), ap.set_value(true));
         ap(to_fp8, {"--fp8"}, ap.help("Quantize for fp8"), ap.set_value(true));
         ap(to_int4, {"--int4-weights"}, ap.help("Quantize weights for int4"), ap.set_value(true));
@@ -555,6 +557,10 @@ struct compiler
         {
             quantize_fp16(p);
         }
+        if(to_bf16)
+        {
+            quantize_bf16(p);
+        }
         if(to_int8)
         {
             quantize_int8(p, t, {host_params(p)});

@@ -0,0 +1,105 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef MIGRAPHX_GUARD_RTGLIB_BF16_HPP
+#define MIGRAPHX_GUARD_RTGLIB_BF16_HPP
+
+#include <migraphx/half.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/float8.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+using bf16 = migraphx::generic_float<7, 8>;
+
+// bf16 is the 16-bit bfloat16 layout: 7 mantissa bits and 8 exponent bits, assuming
+// generic_float takes <mantissa bits, exponent bits> (cf. generic_float<10, 5> below).
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+namespace std {
+
+template <class T>
+struct common_type<migraphx::bf16, T> : std::common_type<float, T> // NOLINT
+{
+};
+
+template <class T>
+struct common_type<T, migraphx::bf16> : std::common_type<float, T> // NOLINT
+{
+};
+
+template <>
+struct common_type<migraphx::fp8::fp8e4m3fnuz, migraphx::bf16>
+{
+    using type = float;
+};
+
+template <>
+struct common_type<migraphx::bf16, migraphx::fp8::fp8e4m3fnuz>
+{
+    using type = float;
+};
+
+template <>
+struct common_type<migraphx::fp8::fp8e4m3fn, migraphx::bf16>
+{
+    using type = float;
+};
+
+template <>
+struct common_type<migraphx::bf16, migraphx::fp8::fp8e4m3fn>
+{
+    using type = float;
+};
+
+template <>
+struct common_type<migraphx::fp8::fp8e5m2, migraphx::bf16>
+{
+    using type = float;
+};
+
+template <>
+struct common_type<migraphx::bf16, migraphx::fp8::fp8e5m2>
+{
+    using type = float;
+};
+
+template <>
+struct common_type<migraphx::bf16, migraphx::bf16>
+{
+    using type = migraphx::bf16;
+};
+
+template <>
+struct common_type<migraphx::bf16, migraphx::generic_float<10, 5>>
+{
+    using type = float;
+};
+
+} // namespace std
+
+#endif
@@ -102,6 +102,12 @@ struct common_type<migraphx::half, migraphx::half>
     using type = migraphx::half;
 };
 
+template <>
+struct common_type<migraphx::half, migraphx::generic_float<7, 8>>
+{
+    using type = float;
+};
+
 } // namespace std
 
 #endif
@@ -41,6 +41,9 @@ struct program;
 MIGRAPHX_EXPORT void quantize_fp16(program& prog,
                                    const std::vector<std::string>& ins_names = {"all"});
 
+MIGRAPHX_EXPORT void quantize_bf16(program& prog,
+                                   const std::vector<std::string>& ins_names = {"all"});
+
 MIGRAPHX_EXPORT void quantize_int8(program& prog,
                                    const target& t,
                                    const std::vector<parameter_map>& calibration,

@@ -34,6 +34,7 @@
 #include <migraphx/functional.hpp>
 #include <migraphx/errors.hpp>
 #include <migraphx/half.hpp>
+#include <migraphx/bf16.hpp>
 #include <migraphx/float8.hpp>
 #include <migraphx/serialize.hpp>
 #include <migraphx/config.hpp>
@@ -52,6 +53,7 @@ struct MIGRAPHX_EXPORT shape
 #define MIGRAPHX_SHAPE_VISIT_TYPES(m) \
     m(bool_type, bool) \
     m(half_type, half) \
+    m(bf16_type, bf16) \
     m(float_type, float) \
     m(double_type, double) \
     m(uint8_type, uint8_t) \
@@ -65,7 +67,7 @@ struct MIGRAPHX_EXPORT shape
     m(fp8e4m3fnuz_type, migraphx::fp8::fp8e4m3fnuz) \
     m(fp8e4m3fn_type, migraphx::fp8::fp8e4m3fn) \
     m(fp8e5m2_type, migraphx::fp8::fp8e5m2)
-// clang-format on
+    // clang-format on
 
 #define MIGRAPHX_SHAPE_GENERATE_ENUM_TYPES(x, t) x,
     enum type_t

@@ -21,12 +21,13 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-#ifndef MIGRAPHX_GUARD_RTGLIB_QUANTIZE_FP16_HPP
-#define MIGRAPHX_GUARD_RTGLIB_QUANTIZE_FP16_HPP
+#ifndef MIGRAPHX_GUARD_RTGLIB_TRUNCATE_FLOAT_HPP
+#define MIGRAPHX_GUARD_RTGLIB_TRUNCATE_FLOAT_HPP
 
 #include <string>
 #include <vector>
 #include <migraphx/config.hpp>
+#include <migraphx/shape.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -35,12 +36,13 @@ struct program;
 struct module;
 
 /**
- * quantize a program to fp16
+ * quantize a program to the float type given by float_type (e.g. fp16 or bf16)
  */
-struct MIGRAPHX_EXPORT quantize_fp16_pass
+struct MIGRAPHX_EXPORT truncate_float_pass
 {
     std::vector<std::string> ins_names = {"all"};
-    std::string name() const { return "quantize_fp16"; }
+    shape::type_t float_type           = shape::float_type;
+    std::string name() const { return "truncate_float"; }
     void apply(module& m) const;
 };
 

@@ -27,6 +27,7 @@
 
 #include <type_traits>
 #include <migraphx/half.hpp>
+#include <migraphx/bf16.hpp>
 #include <migraphx/config.hpp>
 #include <migraphx/float8.hpp>
 
@@ -53,6 +54,10 @@ MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_floating_point, half)
 MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_signed, half)
 MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_arithmetic, half)
 
+MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_floating_point, bf16)
+MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_signed, bf16)
+MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_arithmetic, bf16)
+
 MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_floating_point, migraphx::fp8::fp8e4m3fnuz)
 MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_signed, migraphx::fp8::fp8e4m3fnuz)
 MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_arithmetic, migraphx::fp8::fp8e4m3fnuz)

@@ -610,6 +610,10 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
           &migraphx::quantize_fp16,
           py::arg("prog"),
           py::arg("ins_names") = std::vector<std::string>{"all"});
+    m.def("quantize_bf16",
+          &migraphx::quantize_bf16,
+          py::arg("prog"),
+          py::arg("ins_names") = std::vector<std::string>{"all"});
     m.def("quantize_int8",
           &migraphx::quantize_int8,
           py::arg("prog"),

@@ -24,7 +24,7 @@
 #include <migraphx/float_equal.hpp>
 #include <migraphx/instruction_ref.hpp>
 #include <migraphx/quantization.hpp>
-#include <migraphx/quantize_fp16.hpp>
+#include <migraphx/truncate_float.hpp>
 #include <migraphx/quantize_8bits.hpp>
 #include <migraphx/quantize_int4.hpp>
 #include <migraphx/simplify_reshapes.hpp>
@@ -69,7 +69,17 @@ void quantize_fp16(program& prog, const std::vector<std::string>& ins_names)
     run_passes(prog,
                {normalize_ops{},
                 optimize_module{{"quantizelinear", "dequantizelinear"}},
-                quantize_fp16_pass{ins_names},
+                truncate_float_pass{ins_names, shape::half_type},
+                optimize_module{{"quantizelinear", "dequantizelinear"}}},
+               quant_tracer());
+}
+
+void quantize_bf16(program& prog, const std::vector<std::string>& ins_names)
+{
+    run_passes(prog,
+               {normalize_ops{},
+                optimize_module{{"quantizelinear", "dequantizelinear"}},
+                truncate_float_pass{ins_names, shape::bf16_type},
                 optimize_module{{"quantizelinear", "dequantizelinear"}}},
                quant_tracer());
 }

@@ -59,6 +59,7 @@ rocblas_datatype get_type(shape::type_t type)
     case shape::double_type: return rocblas_datatype_f64_r;
     case shape::float_type: return rocblas_datatype_f32_r;
     case shape::half_type: return rocblas_datatype_f16_r;
+    case shape::bf16_type: return rocblas_datatype_bf16_r;
     case shape::int8_type: return rocblas_datatype_i8_r;
     case shape::uint8_type: return rocblas_datatype_u8_r;
     case shape::int32_type: return rocblas_datatype_i32_r;