/**************************************************************************
 *
 * Copyright (c) 2006-2007 Tungsten Graphics, Inc., Cedar Park, TX., USA
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
/*
 * Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com>
 */
#include <linux/cc_platform.h>
#include <linux/export.h>
#include <linux/highmem.h>
#include <linux/ioport.h>
#include <linux/iosys-map.h>
#include <xen/xen.h>

#include <drm/drm_cache.h>

/* A small bounce buffer that fits on the stack. */
#define MEMCPY_BOUNCE_SIZE 128

#if defined(CONFIG_X86)
#include <asm/smp.h>

/*
 * clflushopt is an unordered instruction which needs fencing with mfence or
 * sfence to avoid ordering issues. For drm_clflush_page this fencing happens
 * in the caller.
 */
static void
drm_clflush_page(struct page *page)
{
        uint8_t *page_virtual;
        unsigned int i;
        const int size = boot_cpu_data.x86_clflush_size;

        if (unlikely(page == NULL))
                return;

        page_virtual = kmap_atomic(page);
        for (i = 0; i < PAGE_SIZE; i += size)
                clflushopt(page_virtual + i);
        kunmap_atomic(page_virtual);
}

static void drm_cache_flush_clflush(struct page *pages[],
                                    unsigned long num_pages)
{
        unsigned long i;

        mb(); /* Full memory barrier used before CLFLUSH so it is ordered */
        for (i = 0; i < num_pages; i++)
                drm_clflush_page(*pages++);
        mb(); /* Also used after CLFLUSH so that all cache lines are flushed */
}
#endif

/**
 * drm_clflush_pages - Flush dcache lines of a set of pages.
 * @pages: List of pages to be flushed.
 * @num_pages: Number of pages in the array.
 *
 * Flush every data cache line entry that points to an address belonging
 * to a page in the array.
 */
void
drm_clflush_pages(struct page *pages[], unsigned long num_pages)
{

#if defined(CONFIG_X86)
        if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
                drm_cache_flush_clflush(pages, num_pages);
                return;
        }

        if (wbinvd_on_all_cpus())
                pr_err("Timed out waiting for cache flush\n");

#elif defined(__powerpc__)
        unsigned long i;

        for (i = 0; i < num_pages; i++) {
                struct page *page = pages[i];
                void *page_virtual;

                if (unlikely(page == NULL))
                        continue;

                page_virtual = kmap_atomic(page);
                flush_dcache_range((unsigned long)page_virtual,
                                   (unsigned long)page_virtual + PAGE_SIZE);
                kunmap_atomic(page_virtual);
        }
#else
        WARN_ONCE(1, "Architecture has no drm_cache.c support\n");
#endif
}
EXPORT_SYMBOL(drm_clflush_pages);
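
/*
 * Hedged usage sketch (editor's illustration, not part of the original file):
 * a driver that has filled a CPU-cached page array and is about to hand the
 * pages to a device reading over a non-coherent path could flush them like
 * this. The helper name my_dev_publish_pages() is hypothetical.
 *
 *	static void my_dev_publish_pages(struct page **pages,
 *					 unsigned long num_pages)
 *	{
 *		// Make CPU writes to the pages visible to the device.
 *		drm_clflush_pages(pages, num_pages);
 *	}
 */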

/**
 * drm_clflush_sg - Flush dcache lines pointing to a scatter-gather list.
 * @st: struct sg_table to flush.
 *
 * Flush every data cache line entry that points to an address in the
 * sg table.
 */
void
drm_clflush_sg(struct sg_table *st)
{
#if defined(CONFIG_X86)
        if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
                struct sg_page_iter sg_iter;

                mb(); /* CLFLUSH is ordered only by using memory barriers */
                for_each_sgtable_page(st, &sg_iter, 0)
                        drm_clflush_page(sg_page_iter_page(&sg_iter));
                mb(); /* Make sure that every cache line entry is flushed */

                return;
        }

        if (wbinvd_on_all_cpus())
                pr_err("Timed out waiting for cache flush\n");
#else
        WARN_ONCE(1, "Architecture has no drm_cache.c support\n");
#endif
}
EXPORT_SYMBOL(drm_clflush_sg);
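
/*
 * Hedged usage sketch (editor's illustration only): an object backed by a
 * struct sg_table that the CPU has just written can be flushed before device
 * access; "obj" and its "pages" member are hypothetical names.
 *
 *	// obj->pages is a struct sg_table * populated elsewhere.
 *	drm_clflush_sg(obj->pages);
 */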

/**
 * drm_clflush_virt_range - Flush dcache lines of a region
 * @addr: Initial kernel memory address.
 * @length: Region size.
 *
 * Flush every data cache line entry that points to an address in the
 * region requested.
 */
void
drm_clflush_virt_range(void *addr, unsigned long length)
{
#if defined(CONFIG_X86)
        if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
                const int size = boot_cpu_data.x86_clflush_size;
                void *end = addr + length;

                addr = (void *)(((unsigned long)addr) & -size);
                mb(); /* CLFLUSH is only ordered with a full memory barrier */
                for (; addr < end; addr += size)
                        clflushopt(addr);
                clflushopt(end - 1); /* force serialisation */
                mb(); /* Ensure that every data cache line entry is flushed */
                return;
        }

        if (wbinvd_on_all_cpus())
                pr_err("Timed out waiting for cache flush\n");
#else
        WARN_ONCE(1, "Architecture has no drm_cache.c support\n");
#endif
}
EXPORT_SYMBOL(drm_clflush_virt_range);
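
/*
 * Hedged usage sketch (editor's illustration only): after patching a small
 * part of a cached kernel mapping that a device will read, only the touched
 * range needs to be flushed. "vaddr", "offset", "cmds" and "cmds_len" are
 * hypothetical names.
 *
 *	memcpy(vaddr + offset, cmds, cmds_len);
 *	drm_clflush_virt_range(vaddr + offset, cmds_len);
 */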

bool drm_need_swiotlb(int dma_bits)
{
        struct resource *tmp;
        resource_size_t max_iomem = 0;

        /*
         * Xen paravirtual hosts require swiotlb regardless of requested dma
         * transfer size.
         *
         * NOTE: Really, what it requires is use of the dma_alloc_coherent
         * allocator used in ttm_dma_populate() instead of
         * ttm_populate_and_map_pages(), which bounce buffers so much in
         * Xen it leads to swiotlb buffer exhaustion.
         */
        if (xen_pv_domain())
                return true;

        /*
         * Enforce dma_alloc_coherent when memory encryption is active as well
         * for the same reasons as for Xen paravirtual hosts.
         */
        if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
                return true;

        for (tmp = iomem_resource.child; tmp; tmp = tmp->sibling)
                max_iomem = max(max_iomem, tmp->end);

        return max_iomem > ((u64)1 << dma_bits);
}
EXPORT_SYMBOL(drm_need_swiotlb);
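
/*
 * Hedged usage sketch (editor's illustration only): a TTM-based driver would
 * typically ask this once at init time, passing the width of its DMA mask,
 * and cache the answer; "use_dma_alloc" is a hypothetical name.
 *
 *	bool use_dma_alloc = drm_need_swiotlb(44); // device with 44-bit DMA
 */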

static void memcpy_fallback(struct iosys_map *dst,
                            const struct iosys_map *src,
                            unsigned long len)
{
        if (!dst->is_iomem && !src->is_iomem) {
                memcpy(dst->vaddr, src->vaddr, len);
        } else if (!src->is_iomem) {
                iosys_map_memcpy_to(dst, 0, src->vaddr, len);
        } else if (!dst->is_iomem) {
                memcpy_fromio(dst->vaddr, src->vaddr_iomem, len);
        } else {
                /*
                 * Bounce size is not performance tuned, but using a
                 * bounce buffer like this is significantly faster than
                 * resorting to ioreadxx() + iowritexx().
                 */
                char bounce[MEMCPY_BOUNCE_SIZE];
                void __iomem *_src = src->vaddr_iomem;
                void __iomem *_dst = dst->vaddr_iomem;

                while (len >= MEMCPY_BOUNCE_SIZE) {
                        memcpy_fromio(bounce, _src, MEMCPY_BOUNCE_SIZE);
                        memcpy_toio(_dst, bounce, MEMCPY_BOUNCE_SIZE);
                        _src += MEMCPY_BOUNCE_SIZE;
                        _dst += MEMCPY_BOUNCE_SIZE;
                        len -= MEMCPY_BOUNCE_SIZE;
                }
                if (len) {
                        /* Copy only the remaining tail, not a full bounce. */
                        memcpy_fromio(bounce, _src, len);
                        memcpy_toio(_dst, bounce, len);
                }
        }
}
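
/*
 * Worked example of the bounce loop above (editor's note, assuming
 * MEMCPY_BOUNCE_SIZE == 128): for len == 300 the while loop runs twice,
 * copying bytes 0-127 and 128-255 through the on-stack bounce buffer, and
 * the trailing if () copies the remaining 44 bytes, so exactly 300 bytes
 * cross the buffer.
 */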

#ifdef CONFIG_X86

static DEFINE_STATIC_KEY_FALSE(has_movntdqa);

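/*
 * Note (added for clarity): @len below is a count of 16-byte chunks, not
 * bytes; the caller, __drm_memcpy_from_wc(), passes len >> 4. The first loop
 * streams four chunks (64 bytes) per iteration through xmm0-xmm3, the second
 * loop drains the remaining chunks one at a time.
 */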
static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
{
        kernel_fpu_begin();

        while (len >= 4) {
                asm("movntdqa (%0), %%xmm0\n"
                    "movntdqa 16(%0), %%xmm1\n"
                    "movntdqa 32(%0), %%xmm2\n"
                    "movntdqa 48(%0), %%xmm3\n"
                    "movaps %%xmm0, (%1)\n"
                    "movaps %%xmm1, 16(%1)\n"
                    "movaps %%xmm2, 32(%1)\n"
                    "movaps %%xmm3, 48(%1)\n"
                    :: "r" (src), "r" (dst) : "memory");
                src += 64;
                dst += 64;
                len -= 4;
        }
        while (len--) {
                asm("movntdqa (%0), %%xmm0\n"
                    "movaps %%xmm0, (%1)\n"
                    :: "r" (src), "r" (dst) : "memory");
                src += 16;
                dst += 16;
        }

        kernel_fpu_end();
}

/*
 * __drm_memcpy_from_wc copies @len bytes from @src to @dst using
 * non-temporal instructions where available. Note that all arguments
 * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
 * of 16.
 */
static void __drm_memcpy_from_wc(void *dst, const void *src, unsigned long len)
{
        if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
                memcpy(dst, src, len);
        else if (likely(len))
                __memcpy_ntdqa(dst, src, len >> 4);
}

/**
 * drm_memcpy_from_wc - Perform the fastest available memcpy from a source
 * that may be WC.
 * @dst: The destination pointer
 * @src: The source pointer
 * @len: The size of the area to transfer in bytes
 *
 * Tries an arch-optimized memcpy suited for prefetched reads out of a WC
 * region, and if no such beast is available, falls back to a normal memcpy.
 */
void drm_memcpy_from_wc(struct iosys_map *dst,
                        const struct iosys_map *src,
                        unsigned long len)
{
        if (WARN_ON(in_interrupt())) {
                memcpy_fallback(dst, src, len);
                return;
        }

        if (static_branch_likely(&has_movntdqa)) {
                __drm_memcpy_from_wc(dst->is_iomem ?
                                     (void __force *)dst->vaddr_iomem :
                                     dst->vaddr,
                                     src->is_iomem ?
                                     (void const __force *)src->vaddr_iomem :
                                     src->vaddr,
                                     len);
                return;
        }

        memcpy_fallback(dst, src, len);
}
EXPORT_SYMBOL(drm_memcpy_from_wc);
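
/*
 * Hedged usage sketch (editor's illustration only): copying the contents of a
 * write-combined BO mapping into a cached shadow buffer. "bo_wc_mmio",
 * "shadow" and "size" are hypothetical names.
 *
 *	struct iosys_map src, dst;
 *
 *	iosys_map_set_vaddr_iomem(&src, bo_wc_mmio);	// WC/IO source
 *	iosys_map_set_vaddr(&dst, shadow);		// cached destination
 *	drm_memcpy_from_wc(&dst, &src, size);
 */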

/*
 * drm_memcpy_init_early - One-time initialization of the WC memcpy code
 */
void drm_memcpy_init_early(void)
{
        /*
         * Some hypervisors (e.g. KVM) don't support emulation of VEX-prefixed
         * instructions. So don't enable movntdqa in hypervisor guests.
         */
        if (static_cpu_has(X86_FEATURE_XMM4_1) &&
            !boot_cpu_has(X86_FEATURE_HYPERVISOR))
                static_branch_enable(&has_movntdqa);
}
#else
void drm_memcpy_from_wc(struct iosys_map *dst,
                        const struct iosys_map *src,
                        unsigned long len)
{
        WARN_ON(in_interrupt());

        memcpy_fallback(dst, src, len);
}
EXPORT_SYMBOL(drm_memcpy_from_wc);

void drm_memcpy_init_early(void)
{
}
#endif /* CONFIG_X86 */