50 lines
2.3 KiB
SYSTEMD
50 lines
2.3 KiB
SYSTEMD
diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
|
|
index 7f05c2ad166..1632b595c4c 100644
|
|
--- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
|
|
+++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
|
|
@@ -220,8 +220,12 @@ class Vectorized<BFloat16> {
|
|
Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const;
|
|
};
|
|
|
|
-inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
|
|
- const Vectorized<c10::BFloat16>& a) {
|
|
+#if defined(__GNUC__) && __GNUC__ == 14
|
|
+// Workaround: disable tree vectorization to avoid a gcc-14.2.0 internal compiler error (ICE, "during RTL pass: vregs") when compiling for SVE
|
|
+__attribute__((optimize("no-tree-vectorize")))
|
|
+#endif
|
|
+inline std::tuple<Vectorized<float>, Vectorized<float>>
|
|
+convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
|
|
static_assert(
|
|
Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
|
|
auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f));
|
|
diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp
|
|
index 52d5383e60f..00c9f4eb253 100644
|
|
--- a/aten/src/ATen/native/cpu/Activation.cpp
|
|
+++ b/aten/src/ATen/native/cpu/Activation.cpp
|
|
@@ -26,6 +26,10 @@ namespace at::native {
|
|
|
|
namespace {
|
|
|
|
+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
|
|
+// Workaround: disable tree vectorization to avoid a gcc-14.2.0 internal compiler error (ICE, "during RTL pass: expand") when compiling for NEON (aarch64 without SVE)
|
|
+__attribute__((optimize("no-tree-vectorize")))
|
|
+#endif
|
|
static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) {
|
|
if (at::isReducedFloatingType(input.scalar_type())) {
|
|
AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() {
|
|
diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp
|
|
index 8ef0741e77a..8c94decfff0 100644
|
|
--- a/aten/src/ATen/native/cpu/Unfold2d.cpp
|
|
+++ b/aten/src/ATen/native/cpu/Unfold2d.cpp
|
|
@@ -169,6 +169,10 @@ static void unfolded2d_acc_channels_last(
|
|
|
|
/* note: due to write issues, this one cannot be parallelized as well as
|
|
* unfolded2d_copy */
|
|
+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16)
|
|
+// Workaround: disable tree vectorization to avoid a gcc-14.2.0 internal compiler error (ICE, "during RTL pass: vregs") when compiling for SVE without BF16
|
|
+__attribute__((optimize("no-tree-vectorize")))
|
|
+#endif
|
|
void unfolded2d_acc_kernel(
|
|
ScalarType dtype,
|
|
void *finput_data,
|