// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2016 Thomas Gleixner.
 * Copyright (C) 2016-2017 Christoph Hellwig.
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/sort.h>
#include <linux/group_cpus.h>

#ifdef CONFIG_SMP

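/*
 * grp_spread_init_one - move @cpus_per_grp CPUs from @nmsk into @irqmsk
 *
 * CPUs are taken from @nmsk one at a time; after each pick, the SMT
 * siblings of that CPU which are still in @nmsk are taken next, so that
 * hyperthread siblings of the same core land in the same group.
 */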
static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
				unsigned int cpus_per_grp)
{
	const struct cpumask *siblmsk;
	int cpu, sibl;

	for ( ; cpus_per_grp > 0; ) {
		cpu = cpumask_first(nmsk);

		/* Should not happen, but I'm too lazy to think about it */
		if (cpu >= nr_cpu_ids)
			return;

		cpumask_clear_cpu(cpu, nmsk);
		cpumask_set_cpu(cpu, irqmsk);
		cpus_per_grp--;

		/* If the cpu has siblings, use them first */
		siblmsk = topology_sibling_cpumask(cpu);
		for (sibl = -1; cpus_per_grp > 0; ) {
			sibl = cpumask_next(sibl, siblmsk);
			if (sibl >= nr_cpu_ids)
				break;
			if (!cpumask_test_and_clear_cpu(sibl, nmsk))
				continue;
			cpumask_set_cpu(sibl, irqmsk);
			cpus_per_grp--;
		}
	}
}

static cpumask_var_t *alloc_node_to_cpumask(void)
{
	cpumask_var_t *masks;
	int node;

	masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
	if (!masks)
		return NULL;

	for (node = 0; node < nr_node_ids; node++) {
		if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
			goto out_unwind;
	}

	return masks;

out_unwind:
	while (--node >= 0)
		free_cpumask_var(masks[node]);
	kfree(masks);
	return NULL;
}

static void free_node_to_cpumask(cpumask_var_t *masks)
{
	int node;

	for (node = 0; node < nr_node_ids; node++)
		free_cpumask_var(masks[node]);
	kfree(masks);
}

static void build_node_to_cpumask(cpumask_var_t *masks)
{
	int cpu;

	for_each_possible_cpu(cpu)
		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}

static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
				const struct cpumask *mask, nodemask_t *nodemsk)
{
	int n, nodes = 0;

	/* Calculate the number of nodes in the supplied affinity mask */
	for_each_node(n) {
		if (cpumask_intersects(mask, node_to_cpumask[n])) {
			node_set(n, *nodemsk);
			nodes++;
		}
	}
	return nodes;
}

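/*
 * Per-node scratch data for group allocation: 'ncpus' carries the node's
 * active CPU count into alloc_nodes_groups(), 'ngroups' carries the number
 * of groups assigned to the node back out. The two are never live at the
 * same time, hence the union.
 */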
struct node_groups {
	unsigned id;

	union {
		unsigned ngroups;
		unsigned ncpus;
	};
};

static int ncpus_cmp_func(const void *l, const void *r)
{
	const struct node_groups *ln = l;
	const struct node_groups *rn = r;

	return ln->ncpus - rn->ncpus;
}

/*
 * Allocate a group count for each node, so that for each node:
 *
 * 1) the allocated number is >= 1
 *
 * 2) the allocated number is <= the number of active CPUs on this node
 *
 * The total number of allocated groups may be less than @numgrps when
 * the total number of active CPUs is less than @numgrps.
 *
 * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
 * for each node.
 */
static void alloc_nodes_groups(unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       const nodemask_t nodemsk,
			       struct cpumask *nmsk,
			       struct node_groups *node_groups)
{
	unsigned n, remaining_ncpus = 0;

	for (n = 0; n < nr_node_ids; n++) {
		node_groups[n].id = n;
		node_groups[n].ncpus = UINT_MAX;
	}

	for_each_node_mask(n, nodemsk) {
		unsigned ncpus;

		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
		ncpus = cpumask_weight(nmsk);

		if (!ncpus)
			continue;
		remaining_ncpus += ncpus;
		node_groups[n].ncpus = ncpus;
	}

	numgrps = min_t(unsigned, remaining_ncpus, numgrps);

	sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
	     ncpus_cmp_func, NULL);

	/*
	 * Allocate groups for each node according to the ratio of this
	 * node's nr_cpus to the remaining un-assigned ncpus. 'numgrps' is
	 * bigger than the number of active NUMA nodes. Always start the
	 * allocation from the node with the fewest CPUs.
	 *
	 * This guarantees that each active node gets allocated at least
	 * one group; the reasoning is simple: over-allocation only
	 * happens when a node is assigned a single group, so the other
	 * nodes can still be allocated >= 1 group each, since 'numgrps'
	 * is bigger than the number of NUMA nodes.
	 *
	 * The key invariant is that the number of groups allocated to
	 * each node is <= the CPU count of that node:
	 *
	 * 1) suppose there are two nodes: A and B
	 *	ncpu(X) is the CPU count of node X
	 *	grps(X) is the group count allocated to node X via this
	 *	algorithm
	 *
	 *	ncpu(A) <= ncpu(B)
	 *	ncpu(A) + ncpu(B) = N
	 *	grps(A) + grps(B) = G
	 *
	 *	grps(A) = max(1, round_down(G * ncpu(A) / N))
	 *	grps(B) = G - grps(A)
	 *
	 *	both N and G are integers, and 2 <= G <= N, suppose
	 *	G = N - delta, and 0 <= delta <= N - 2
	 *
	 * 2) obviously grps(A) <= ncpu(A) because:
	 *
	 *	if grps(A) is 1, then grps(A) <= ncpu(A) given
	 *	ncpu(A) >= 1
	 *
	 *	otherwise,
	 *		grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
	 *
	 * 3) prove that grps(B) <= ncpu(B):
	 *
	 *	if round_down(G * ncpu(A) / N) == 0, then grps(A) == 1 and
	 *	G * ncpu(A) < N, which implies ncpu(A) <= delta, so
	 *	grps(B) = G - 1 = N - delta - 1 <= ncpu(B) - 1, hence
	 *	grps(B) <= ncpu(B),
	 *
	 *	otherwise:
	 *
	 *	grps(A) =
	 *		round_down(G * ncpu(A) / N) =
	 *		round_down((N - delta) * ncpu(A) / N) =
	 *		round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
	 *		round_down((N * ncpu(A) - delta * N) / N) =
	 *		ncpu(A) - delta
	 *
	 *	then:
	 *
	 *	grps(A) - G >= ncpu(A) - delta - G
	 *	=>
	 *	G - grps(A) <= G + delta - ncpu(A)
	 *	=>
	 *	grps(B) <= N - ncpu(A)
	 *	=>
	 *	grps(B) <= ncpu(B)
	 *
	 * For three or more nodes, the same argument applies by treating
	 * the smallest node as A and the remaining nodes combined as B,
	 * which is exactly how this algorithm iterates: 'remaining_ncpus'
	 * and 'numgrps' are re-calculated after each node, so finally for
	 * each node X: grps(X) <= ncpu(X).
	 *
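	 * A quick numeric check of the two-node case (the CPU counts are
	 * illustrative, not taken from any particular topology): with
	 * ncpu(A) = 2, ncpu(B) = 3 (so N = 5) and G = 3 groups,
	 *
	 *	grps(A) = max(1, round_down(3 * 2 / 5)) = 1 <= 2 = ncpu(A)
	 *	grps(B) = 3 - 1 = 2 <= 3 = ncpu(B)
	 *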
	 */
	for (n = 0; n < nr_node_ids; n++) {
		unsigned ngroups, ncpus;

		if (node_groups[n].ncpus == UINT_MAX)
			continue;

		WARN_ON_ONCE(numgrps == 0);

		ncpus = node_groups[n].ncpus;
		ngroups = max_t(unsigned, 1,
				numgrps * ncpus / remaining_ncpus);
		WARN_ON_ONCE(ngroups > ncpus);

		node_groups[n].ngroups = ngroups;

		remaining_ncpus -= ncpus;
		numgrps -= ngroups;
	}
}

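/*
 * Spread @numgrps groups over the CPUs in @cpu_mask, keeping each group on
 * a single NUMA node where possible. Groups are filled starting at
 * @masks[startgrp] and wrap around to @masks[0] once the end of the array
 * is reached. Returns the number of groups that got CPUs assigned, or
 * -ENOMEM on allocation failure.
 */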
static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       struct cpumask *nmsk, struct cpumask *masks)
{
	unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
	unsigned int last_grp = numgrps;
	unsigned int curgrp = startgrp;
	nodemask_t nodemsk = NODE_MASK_NONE;
	struct node_groups *node_groups;

	if (cpumask_empty(cpu_mask))
		return 0;

	nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);

	/*
	 * If the number of nodes in the mask is greater than or equal to
	 * the number of groups, just spread the groups across the nodes.
	 */
	if (numgrps <= nodes) {
		for_each_node_mask(n, nodemsk) {
			/* Ensure that only CPUs which are in both masks are set */
			cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
			cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
			if (++curgrp == last_grp)
				curgrp = 0;
		}
		return numgrps;
	}

	node_groups = kcalloc(nr_node_ids,
			      sizeof(struct node_groups),
			      GFP_KERNEL);
	if (!node_groups)
		return -ENOMEM;

	/* allocate group number for each node */
	alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
			   nodemsk, nmsk, node_groups);
	for (i = 0; i < nr_node_ids; i++) {
		unsigned int ncpus, v;
		struct node_groups *nv = &node_groups[i];

		if (nv->ngroups == UINT_MAX)
			continue;

		/* Get the cpus on this node which are in the mask */
		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
		ncpus = cpumask_weight(nmsk);
		if (!ncpus)
			continue;

		WARN_ON_ONCE(nv->ngroups > ncpus);

		/* Account for rounding errors */
		extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);

		/* Spread allocated groups on CPUs of the current node */
		for (v = 0; v < nv->ngroups; v++, curgrp++) {
			cpus_per_grp = ncpus / nv->ngroups;

			/* Account for extra groups to compensate for rounding errors */
			if (extra_grps) {
				cpus_per_grp++;
				--extra_grps;
			}

			/*
			 * wrapping has to be considered given 'startgrp'
			 * may start anywhere
			 */
			if (curgrp >= last_grp)
				curgrp = 0;
			grp_spread_init_one(&masks[curgrp], nmsk,
					    cpus_per_grp);
		}
		done += nv->ngroups;
	}
	kfree(node_groups);
	return done;
}

/**
 * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
 * @numgrps: number of groups
 *
 * Return: cpumask array on success, NULL otherwise. Each array element
 * contains the CPUs assigned to that group.
 *
 * Try to put CPUs that are close from the viewpoint of CPU and NUMA
 * locality into the same group, and run a two-stage grouping:
 * 1) allocate present CPUs to these groups evenly first
 * 2) allocate the other possible CPUs to these groups evenly
 *
 * The resulting grouping is guaranteed to cover all CPUs, and no CPU is
 * assigned to more than one group.
 */
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
	cpumask_var_t *node_to_cpumask;
	cpumask_var_t nmsk, npresmsk;
	int ret = -ENOMEM;
	struct cpumask *masks = NULL;

	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
		return NULL;

	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
		goto fail_nmsk;

	node_to_cpumask = alloc_node_to_cpumask();
	if (!node_to_cpumask)
		goto fail_npresmsk;

	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		goto fail_node_to_cpumask;

	/* Stabilize the cpumasks */
	cpus_read_lock();
	build_node_to_cpumask(node_to_cpumask);

	/* group present CPUs first */
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  cpu_present_mask, nmsk, masks);
	if (ret < 0)
		goto fail_build_affinity;
	nr_present = ret;

	/*
	 * Allocate non-present CPUs starting from the next group to be
	 * handled. If the grouping of present CPUs already exhausted the
	 * group space, assign the non-present CPUs to the groups that
	 * have already been allocated.
	 */
	if (nr_present >= numgrps)
		curgrp = 0;
	else
		curgrp = nr_present;
	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret >= 0)
		nr_others = ret;

 fail_build_affinity:
	cpus_read_unlock();

	if (ret >= 0)
		WARN_ON(nr_present + nr_others < numgrps);

 fail_node_to_cpumask:
	free_node_to_cpumask(node_to_cpumask);

 fail_npresmsk:
	free_cpumask_var(npresmsk);

 fail_nmsk:
	free_cpumask_var(nmsk);
	if (ret < 0) {
		kfree(masks);
		return NULL;
	}
	return masks;
}
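
/*
 * Minimal usage sketch (the group count 'nr_queues' and the consumer
 * 'use_group_cpus()' below are hypothetical, for illustration only):
 *
 *	struct cpumask *masks = group_cpus_evenly(nr_queues);
 *	unsigned int i;
 *
 *	if (!masks)
 *		return -ENOMEM;
 *	for (i = 0; i < nr_queues; i++)
 *		use_group_cpus(i, &masks[i]);
 *	kfree(masks);
 */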
#else /* CONFIG_SMP */
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
	struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);

	if (!masks)
		return NULL;

	/* assign all CPUs (i.e. CPU 0) to the first group only */
	cpumask_copy(&masks[0], cpu_possible_mask);
	return masks;
}
#endif /* CONFIG_SMP */