1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
|
https://mail.kde.org/pipermail/distributions/2024-July/001511.html
https://github.com/xtensor-stack/xsimd/commit/96edf0340492fa9c080f5182b38358ca85baef5e
From 96edf0340492fa9c080f5182b38358ca85baef5e Mon Sep 17 00:00:00 2001
From: Dmitry Kazakov <dimula73@gmail.com>
Date: Tue, 28 May 2024 22:21:08 +0200
Subject: [PATCH] Fix detection of SSE/AVX/AVX512 when they are explicitly
disabled by OS
Some CPU vulnerability mitigations may disable AVX functionality
on the hardware level via the XCR0 register. We should check that
manually to verify that OS actually allows us to use this feature.
See https://bugs.kde.org/show_bug.cgi?id=484622
Fix #1025
---
include/xsimd/config/xsimd_cpuid.hpp | 91 ++++++++++++++++++++++------
1 file changed, 72 insertions(+), 19 deletions(-)
diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp
index f22089bac..6dda3be09 100644
--- a/include/xsimd/config/xsimd_cpuid.hpp
+++ b/include/xsimd/config/xsimd_cpuid.hpp
@@ -114,6 +114,35 @@ namespace xsimd
#endif
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
+
+ auto get_xcr0_low = []() noexcept // Reads XCR0 via XGETBV (index 0) and returns its low 32 bits.
+ {
+ uint32_t xcr0;
+
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+
+ xcr0 = (uint32_t)_xgetbv(0); // compiler intrinsic; the cast narrows the result to 32 bits
+
+#elif defined(__GNUC__)
+
+ __asm__(
+ "xorl %%ecx, %%ecx\n" // ECX = 0 selects XCR0 as the register to read
+ "xgetbv\n" // only the EAX part of the result is captured below
+ : "=a"(xcr0)
+ : // no inputs; ECX is zeroed inside the asm itself
+#if defined(__i386__)
+ : "ecx", "edx" // registers overwritten by the sequence above (32-bit names)
+#else
+ : "rcx", "rdx" // registers overwritten by the sequence above (64-bit names)
+#endif
+ );
+
+#else /* neither _MSC_VER >= 1400 nor a __GNUC__-compatible compiler */
+#error "_MSC_VER < 1400 is not supported"
+#endif /* compiler-specific XCR0 read */
+ return xcr0;
+ };
+
auto get_cpuid = [](int reg[4], int level, int count = 0) noexcept
{
@@ -148,19 +177,43 @@ namespace xsimd
get_cpuid(regs1, 0x1);
- sse2 = regs1[3] >> 26 & 1;
- sse3 = regs1[2] >> 0 & 1;
- ssse3 = regs1[2] >> 9 & 1;
- sse4_1 = regs1[2] >> 19 & 1;
- sse4_2 = regs1[2] >> 20 & 1;
- fma3_sse42 = regs1[2] >> 12 & 1;
+ // OS can explicitly disable the usage of SSE/AVX extensions
+ // by clearing the corresponding state bits in the XCR0 register
+ //
+ // https://docs.kernel.org/admin-guide/hw-vuln/gather_data_sampling.html
+
+ unsigned sse_state_os_enabled = 1;
+ unsigned avx_state_os_enabled = 1;
+ unsigned avx512_state_os_enabled = 1;
+
+ // OSXSAVE: A value of 1 indicates that the OS has set CR4.OSXSAVE[bit
+ // 18] to enable XSETBV/XGETBV instructions to access XCR0 and
+ // to support processor extended state management using
+ // XSAVE/XRSTOR.
+ bool osxsave = regs1[2] >> 27 & 1;
+ if (osxsave)
+ {
+
+ uint32_t xcr0 = get_xcr0_low();
+
+ sse_state_os_enabled = xcr0 >> 1 & 1;
+ avx_state_os_enabled = xcr0 >> 2 & sse_state_os_enabled;
+ avx512_state_os_enabled = xcr0 >> 6 & avx_state_os_enabled;
+ }
+
+ sse2 = regs1[3] >> 26 & sse_state_os_enabled;
+ sse3 = regs1[2] >> 0 & sse_state_os_enabled;
+ ssse3 = regs1[2] >> 9 & sse_state_os_enabled;
+ sse4_1 = regs1[2] >> 19 & sse_state_os_enabled;
+ sse4_2 = regs1[2] >> 20 & sse_state_os_enabled;
+ fma3_sse42 = regs1[2] >> 12 & sse_state_os_enabled;
- avx = regs1[2] >> 28 & 1;
+ avx = regs1[2] >> 28 & avx_state_os_enabled;
fma3_avx = avx && fma3_sse42;
int regs8[4];
get_cpuid(regs8, 0x80000001);
- fma4 = regs8[2] >> 16 & 1;
+ fma4 = regs8[2] >> 16 & avx_state_os_enabled;
// sse4a = regs[2] >> 6 & 1;
@@ -168,23 +221,23 @@ namespace xsimd
int regs7[4];
get_cpuid(regs7, 0x7);
- avx2 = regs7[1] >> 5 & 1;
+ avx2 = regs7[1] >> 5 & avx_state_os_enabled;
int regs7a[4];
get_cpuid(regs7a, 0x7, 0x1);
- avxvnni = regs7a[0] >> 4 & 1;
+ avxvnni = regs7a[0] >> 4 & avx_state_os_enabled;
fma3_avx2 = avx2 && fma3_sse42;
- avx512f = regs7[1] >> 16 & 1;
- avx512cd = regs7[1] >> 28 & 1;
- avx512dq = regs7[1] >> 17 & 1;
- avx512bw = regs7[1] >> 30 & 1;
- avx512er = regs7[1] >> 27 & 1;
- avx512pf = regs7[1] >> 26 & 1;
- avx512ifma = regs7[1] >> 21 & 1;
- avx512vbmi = regs7[2] >> 1 & 1;
- avx512vnni_bw = regs7[2] >> 11 & 1;
+ avx512f = regs7[1] >> 16 & avx512_state_os_enabled;
+ avx512cd = regs7[1] >> 28 & avx512_state_os_enabled;
+ avx512dq = regs7[1] >> 17 & avx512_state_os_enabled;
+ avx512bw = regs7[1] >> 30 & avx512_state_os_enabled;
+ avx512er = regs7[1] >> 27 & avx512_state_os_enabled;
+ avx512pf = regs7[1] >> 26 & avx512_state_os_enabled;
+ avx512ifma = regs7[1] >> 21 & avx512_state_os_enabled;
+ avx512vbmi = regs7[2] >> 1 & avx512_state_os_enabled;
+ avx512vnni_bw = regs7[2] >> 11 & avx512_state_os_enabled;
avx512vnni_vbmi = avx512vbmi && avx512vnni_bw;
#endif
}
|