summaryrefslogtreecommitdiff
path: root/sci-libs/composable-kernel/composable-kernel-7.1.0.ebuild
blob: e2470251dd61f40bfdc1e92edc566e09a25b1124 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# Copyright 1999-2025 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

EAPI=8

# Both variables are set before `inherit`.
# NOTE(review): presumably consumed by the rocm and python-r1 eclasses - confirm.
ROCM_VERSION=${PV}
PYTHON_COMPAT=( python3_{10..14} python3_13t )

inherit check-reqs cmake flag-o-matic multiprocessing python-r1 rocm

# Pinned googletest snapshot; it is only fetched when USE=test (see SRC_URI)
# and is handed to CMake via FETCHCONTENT_SOURCE_DIR_GTEST in src_configure.
GTEST_COMMIT="b85864c64758dec007208e56af933fc3f52044ee"
GTEST_FILE="gtest-1.14.0_p20220421.tar.gz"

DESCRIPTION="High Performance Composable Kernel for AMD GPUs"
HOMEPAGE="https://github.com/ROCm/composable_kernel"
# The upstream tarball is renamed to ${P}.tar.gz on download.
SRC_URI="https://github.com/ROCm/composable_kernel/archive/rocm-${PV}.tar.gz -> ${P}.tar.gz
	test? ( https://github.com/google/googletest/archive/${GTEST_COMMIT}.tar.gz -> ${GTEST_FILE} )"
# Source directory uses the upstream project name and tag, not ${P}.
S="${WORKDIR}/composable_kernel-rocm-${PV}"

LICENSE="MIT"
# Subslot is the first two version components (major.minor).
SLOT="0/$(ver_cut 1-2)"
KEYWORDS="~amd64"

IUSE="debug profiler test"
REQUIRED_USE="${ROCM_REQUIRED_USE} ${PYTHON_REQUIRED_USE}"
# Tests are restricted unless USE=test is enabled.
RESTRICT="!test? ( test )"

# hip is pinned to the same slot/subslot value as this package.
RDEPEND="
	dev-util/hip:${SLOT}
	${PYTHON_DEPS}
"

DEPEND="${RDEPEND}"

BDEPEND="
	dev-build/rocm-cmake
"

# Patch file names carry the version they were first written for; several
# patches from older versions are reused here.
PATCHES=(
	"${FILESDIR}"/${PN}-6.1.1-no-git-no-hash.patch
	"${FILESDIR}"/${PN}-6.3.0-conditional-kernels.patch
	"${FILESDIR}"/${PN}-7.0.1-conditional-ckprofiler.patch
	"${FILESDIR}"/${PN}-7.0.1-libcxx-includes.patch
	"${FILESDIR}"/${PN}-7.1.0-expand-isa.patch
)

# Emit pre-build resource warnings and run the check-reqs memory check for
# the current phase.
#
# Globals read: MERGE_TYPE, AMDGPU_TARGETS, EBUILD_PHASE_FUNC
# Calls:        makeopts_jobs, free, awk, ewarn, check-reqs_<phase func>
# Does nothing when MERGE_TYPE is "binary".
ck_check-reqs() {
	[[ ${MERGE_TYPE} == binary ]] && return

	# Word splitting is intentional: AMDGPU_TARGETS is a whitespace-separated
	# list. Kept local so the array does not leak into the ebuild's global scope.
	local -a targets=( ${AMDGPU_TARGETS} )
	if [[ ${#targets[@]} -gt 1 ]]; then
		ewarn "composable-kernel will be compiled for multiple GPU architectures,"
		ewarn "which will take a significant amount of time."
		ewarn "Please consider setting AMDGPU_TARGETS USE_EXPAND variable to a single architecture."
	fi

	local user_jobs
	user_jobs=$(makeopts_jobs)

	# Column 7 of the "Mem:" row printed by `free -m`.
	local available_memory_mb
	available_memory_mb=$(free -m | awk '/Mem:/ {print $7}')

	# Only give MAKEOPTS advice when a numeric value was actually obtained;
	# an empty or malformed value would otherwise break the arithmetic below.
	if [[ ${available_memory_mb} =~ ^[0-9]+$ ]]; then
		# NOTE(review): this warning budgets 2048 MB per job while
		# CHECKREQS_MEMORY below budgets 3072 MB per job - confirm whether
		# the two figures are meant to differ.
		local max_jobs=$(( available_memory_mb / 2048 ))
		(( max_jobs >= 1 )) || max_jobs=1
		if (( max_jobs < user_jobs )); then
			ewarn "${available_memory_mb} MB of free RAM is not enough for ${user_jobs} parallel build jobs (~2Gb per job)."
			# In this branch max_jobs is the smaller of the two job counts.
			ewarn "Please consider setting MAKEOPTS=\"-j${max_jobs}\" for this package."
		fi
	fi

	# It takes ~3GB of RAM per build thread
	local CHECKREQS_MEMORY=$(( user_jobs * 3072 ))M
	check-reqs_${EBUILD_PHASE_FUNC}
}

# pkg_pretend phase: delegates to ck_check-reqs, which emits the resource
# warnings and calls the check-reqs function matching the current phase.
pkg_pretend() {
	ck_check-reqs
}

# pkg_setup phase: runs the same ck_check-reqs checks as pkg_pretend.
pkg_setup() {
	ck_check-reqs
}

# src_prepare phase: strip unwanted lines from the upstream CMake files, then
# hand over to cmake_src_prepare.
src_prepare() {
	# Delete every line that mentions -Werror.
	sed -i -e '/-Werror/d' cmake/EnableCompilerWarnings.cmake || die

	# All CMakeLists.txt deletions are applied in a single sed pass.
	local -a cmakelists_edits=(
		# don't build examples
		-e "/add_subdirectory(example)/d"
		# Flag -amdgpu-early-inline-all explodes memory consumption
		# https://github.com/llvm/llvm-project/issues/86332
		-e "/-amdgpu-early-inline-all/d"
		-e "/-amdgpu-function-calls/d"
	)
	sed -i "${cmakelists_edits[@]}" CMakeLists.txt || die

	cmake_src_prepare
}

# src_configure phase: select the build type from USE=debug, assemble the
# CMake arguments from USE flags, add sandbox exceptions and run
# cmake_src_configure.
src_configure() {
	rocm_use_hipcc

	if use debug; then
		CMAKE_BUILD_TYPE="Debug"
	else
		# Non-debug builds define NDEBUG for both C and C++.
		append-cflags "-DNDEBUG"
		append-cxxflags "-DNDEBUG"
		CMAKE_BUILD_TYPE="Release"
	fi

	local mycmakeargs=(
		-DCMAKE_SKIP_RPATH=ON
		-DBUILD_DEV=OFF
		-DGPU_TARGETS="$(get_amdgpu_flags)"
		-DCMAKE_INSTALL_PREFIX="${EPREFIX}/usr"
		-DBUILD_TESTING=$(usex test ON OFF)
		-DCK_USE_PROFILER=$(usex profiler ON OFF)
		-Wno-dev
	)

	# Since 6.4.1 "fallback" DL kernels should be enabled manually...
	# The first enabled target in this list is enough to switch them on.
	local dl_arch
	for dl_arch in gfx1010 gfx1011 gfx1012 gfx1030 gfx1031; do
		if use "amdgpu_targets_${dl_arch}"; then
			mycmakeargs+=(-DDL_KERNELS=ON)
			break
		fi
	done

	# Point FetchContent at the googletest snapshot unpacked in WORKDIR.
	if use test; then
		mycmakeargs+=(
			-DFETCHCONTENT_SOURCE_DIR_GTEST="${WORKDIR}/googletest-${GTEST_COMMIT}"
		)
	fi

	# rocminfo call during configuration; should not happen
	# Bug: https://github.com/ROCm/composable_kernel/issues/2994
	rocm_add_sandbox -w
	addpredict /dev/random

	cmake_src_configure
}

# src_install phase: run the CMake install, then install the ck4inductor
# Python module together with its header package data once per enabled
# Python implementation (via python_foreach_impl).
src_install() {
	cmake_src_install

	# Per-implementation helper invoked by python_foreach_impl below.
	# shellcheck disable=SC2329
	installation() {
		# Keep helper state local so nothing leaks into the ebuild's global scope.
		local -a package_data
		local inst_path file location

		python_domodule python/ck4inductor

		# install package-data manually, as there is no PEP517 compliance
		# globstar is enabled only while the ** patterns are expanded.
		shopt -s globstar
		package_data=(
			include/ck/**/*.hpp
			library/src/tensor_operation_instance/gpu/gemm_universal/**/*.hpp
		)
		shopt -u globstar

		# Headers keep their relative directory layout under ck4inductor/.
		inst_path="${D}$(python_get_sitedir)/ck4inductor"
		for file in "${package_data[@]}"; do
			location="${inst_path}/$(dirname "${file}")"
			# mkdir and cp are external commands: abort the build on failure
			# instead of silently installing an incomplete module.
			mkdir -p "${location}" || die "failed to create ${location}"
			cp "${file}" "${location}" || die "failed to install ${file}"
		done
	}
	python_foreach_impl installation
}

# src_test phase: call check_amdgpu, then run the CMake test suite with a
# single job, with LD_LIBRARY_PATH set to the build tree's lib directory for
# that call only.
src_test() {
	check_amdgpu

	local built_libdir="${BUILD_DIR}/lib"
	LD_LIBRARY_PATH="${built_libdir}" cmake_src_test -j1
}