blob: 193489781bf063870637c4d6136eadd79a717d05 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
|
# Copyright 1999-2025 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2
# Ebuild API level used by this file.
EAPI=8
# ROCm release this ebuild tracks; taken straight from the package version.
# NOTE(review): presumably consumed by the rocm eclass inherited below — confirm.
ROCM_VERSION=${PV}
# Python implementations the package declares itself compatible with.
PYTHON_COMPAT=( python3_{10..14} python3_13t )
inherit check-reqs cmake flag-o-matic multiprocessing python-r1 rocm
# googletest snapshot; it is only listed in SRC_URI behind the "test" USE flag
# and is handed to CMake in src_configure.
GTEST_COMMIT="b85864c64758dec007208e56af933fc3f52044ee"
GTEST_FILE="gtest-1.14.0_p20220421.tar.gz"
DESCRIPTION="High Performance Composable Kernel for AMD GPUs"
HOMEPAGE="https://github.com/ROCm/composable_kernel"
# Upstream release tarball (renamed to ${P}.tar.gz) plus the optional
# googletest tarball.
SRC_URI="https://github.com/ROCm/composable_kernel/archive/rocm-${PV}.tar.gz -> ${P}.tar.gz
test? ( https://github.com/google/googletest/archive/${GTEST_COMMIT}.tar.gz -> ${GTEST_FILE} )"
# Source directory inside WORKDIR.
S="${WORKDIR}/composable_kernel-rocm-${PV}"
LICENSE="MIT"
# The subslot is the first two components of the package version.
SLOT="0/$(ver_cut 1-2)"
KEYWORDS="~amd64"
IUSE="debug profiler test"
REQUIRED_USE="${ROCM_REQUIRED_USE} ${PYTHON_REQUIRED_USE}"
# Tests are restricted unless USE=test is enabled.
RESTRICT="!test? ( test )"
# hip is requested with the same slot/subslot string as this package.
RDEPEND="
dev-util/hip:${SLOT}
${PYTHON_DEPS}
"
DEPEND="${RDEPEND}"
BDEPEND="
dev-build/rocm-cmake
"
# Patch files shipped in FILESDIR. The version embedded in each file name is
# fixed and does not follow ${PV}.
PATCHES=(
"${FILESDIR}"/${PN}-6.1.1-no-git-no-hash.patch
"${FILESDIR}"/${PN}-6.3.0-conditional-kernels.patch
"${FILESDIR}"/${PN}-7.0.1-conditional-ckprofiler.patch
"${FILESDIR}"/${PN}-6.4.1-expand-isa.patch
"${FILESDIR}"/${PN}-7.0.1-libcxx-includes.patch
)
# Pre-build sanity checks shared by pkg_pretend and pkg_setup.
#
# - Warns when AMDGPU_TARGETS lists more than one entry.
# - Warns when the available RAM cannot sustain the job count reported by
#   makeopts_jobs (~2 GiB are budgeted per job) and suggests a lower -j value.
# - Sets CHECKREQS_MEMORY for the requested job count and calls the check-reqs
#   function named after the current phase (check-reqs_${EBUILD_PHASE_FUNC}).
#
# Returns immediately for binary merges.
#
# Globals read: MERGE_TYPE, AMDGPU_TARGETS, EBUILD_PHASE_FUNC
ck_check-reqs() {
	[[ ${MERGE_TYPE} == binary ]] && return

	# Deliberately unquoted: AMDGPU_TARGETS is a whitespace-separated list.
	# Declared local so the array does not leak into the ebuild environment.
	local -a targets=( ${AMDGPU_TARGETS} )
	if [[ ${#targets[@]} -gt 1 ]]; then
		ewarn "composable-kernel will be compiled for multiple GPU architectures,"
		ewarn "which will take a significant amount of time."
		ewarn "Please consider setting AMDGPU_TARGETS USE_EXPAND variable to a single architecture."
	fi

	# It takes ~2Gb of RAM per build thread
	local user_jobs=$(makeopts_jobs)
	# Column 7 of the "Mem:" row; the C locale keeps that row label untranslated.
	local available_memory_mb=$(LC_ALL=C free -m | awk '/Mem:/ {print $7}')
	local max_jobs=$(( available_memory_mb / 2048 ))
	# Never suggest fewer than one job.
	max_jobs=$(( max_jobs < 1 ? 1 : max_jobs ))
	local limited_jobs=$(( user_jobs < max_jobs ? user_jobs : max_jobs ))

	if [[ "${max_jobs}" -lt "${user_jobs}" ]]; then
		ewarn "${available_memory_mb} MB of free RAM is not enough for ${user_jobs} parallel build jobs (~2Gb per job)."
		ewarn "Please consider setting MAKEOPTS=\"-j${limited_jobs}\" for this package."
	fi

	# Local on purpose: only the check-reqs call below should see this value.
	local CHECKREQS_MEMORY=$((user_jobs*2048))M
	check-reqs_${EBUILD_PHASE_FUNC}
}
# pkg_pretend phase: delegate entirely to the shared requirement checks.
pkg_pretend() {
	ck_check-reqs
}
# pkg_setup phase: delegate entirely to the shared requirement checks.
pkg_setup() {
	ck_check-reqs
}
# Adjust the upstream CMake files in place, then run cmake_src_prepare.
src_prepare() {
	# Drop every line that mentions -Werror.
	sed -i -e '/-Werror/d' cmake/EnableCompilerWarnings.cmake || die

	# All deletions for the top-level CMakeLists.txt, applied in one pass.
	local -a cmakelists_deletions=(
		# don't build examples
		-e "/add_subdirectory(example)/d"
		# Flag -amdgpu-early-inline-all explodes memory consumption
		# https://github.com/llvm/llvm-project/issues/86332
		-e "/-amdgpu-early-inline-all/d"
		-e "/-amdgpu-function-calls/d"
	)
	sed -i "${cmakelists_deletions[@]}" CMakeLists.txt || die

	cmake_src_prepare
}
# Assemble the CMake arguments from the enabled USE flags and run
# cmake_src_configure.
src_configure() {
	rocm_use_hipcc

	# USE=debug selects the Debug build type; otherwise NDEBUG is appended to
	# the C and C++ flags and the Release build type is used.
	if use debug; then
		CMAKE_BUILD_TYPE="Debug"
	else
		append-cflags "-DNDEBUG"
		append-cxxflags "-DNDEBUG"
		CMAKE_BUILD_TYPE="Release"
	fi

	local mycmakeargs=(
		-DCMAKE_SKIP_RPATH=ON
		-DBUILD_DEV=OFF
		-DGPU_TARGETS="$(get_amdgpu_flags)"
		-DCMAKE_INSTALL_PREFIX="${EPREFIX}/usr"
		-DBUILD_TESTING=$(usex test ON OFF)
		-DCK_USE_PROFILER=$(usex profiler ON OFF)
		-Wno-dev
	)

	# Since 6.4.1 "fallback" DL kernels should be enabled manually...
	# The first enabled target in this list is enough to switch them on.
	local dl_arch
	for dl_arch in gfx1010 gfx1011 gfx1012 gfx1030 gfx1031; do
		if use "amdgpu_targets_${dl_arch}"; then
			mycmakeargs+=( -DDL_KERNELS=ON )
			break
		fi
	done

	# Point FetchContent at the googletest sources under WORKDIR.
	if use test; then
		mycmakeargs+=(
			-DFETCHCONTENT_SOURCE_DIR_GTEST="${WORKDIR}/googletest-${GTEST_COMMIT}"
		)
	fi

	# rocminfo call during configuration; should not happen
	# Bug: https://github.com/ROCm/composable_kernel/issues/2994
	rocm_add_sandbox -w
	addpredict /dev/random

	cmake_src_configure
}
# Run the CMake install, then install the ck4inductor Python module together
# with its header package-data once per enabled Python implementation.
src_install() {
	cmake_src_install

	# Per-implementation step, invoked through python_foreach_impl below.
	# shellcheck disable=SC2329
	installation() {
		python_domodule python/ck4inductor

		# install package-data manually, as there is no PEP517 compliance
		local -a package_data
		local inst_path location file saved_shopt

		# globstar makes ** recurse; nullglob drops a pattern that matches
		# nothing instead of handing the literal pattern to cp. The previous
		# state of both options is restored right after the expansion.
		saved_shopt=$(shopt -p globstar nullglob)
		shopt -s globstar nullglob
		package_data=(
			include/ck/**/*.hpp
			library/src/tensor_operation_instance/gpu/gemm_universal/**/*.hpp
		)
		eval "${saved_shopt}"

		inst_path="${D}$(python_get_sitedir)/ck4inductor"
		for file in "${package_data[@]}"; do
			# Recreate the file's relative directory below the module.
			location="${inst_path}/$(dirname "${file}")"
			# mkdir and cp are plain commands, so failures must be fatal explicitly.
			mkdir -p "${location}" || die "failed to create ${location}"
			cp "${file}" "${location}" || die "failed to copy ${file}"
		done
	}
	python_foreach_impl installation
}
# Call check_amdgpu, then run cmake_src_test with a single job.
src_test() {
	check_amdgpu

	# LD_LIBRARY_PATH is set for this one call only and points at the lib
	# directory under BUILD_DIR.
	LD_LIBRARY_PATH="${BUILD_DIR}/lib" cmake_src_test -j1
}
|