Added the IBM Cell SDK 2.x software_cache to Bullet/Extras. It can optionally be enabled for the BulletMultiThreaded Cell version.
See USE_SOFTWARE_CACHE in Bullet\src\BulletMultiThreaded\SpuNarrowPhaseCollisionTask\SpuGatheringCollisionTask.cpp. It improves Bullet's midphase collision detection (triangle/vertex fetching). The license is CommonPublicLicense-1.0; see the included license docs.
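For context, a minimal sketch of the kind of fetch the midphase task can route through the cache added below; this is hypothetical, not the actual Bullet code, and the umbrella header name spe_cache.h and the helper spu_fetch_index are assumptions:

#include <spu_intrinsics.h>
#include "spe_cache.h"   /* assumed umbrella header for the software cache */

/* Read one 32-bit triangle index from main storage through the cache.
 * A hit is a plain local-store load; a miss triggers a single
 * cache-line DMA fill before the load. */
static inline unsigned int spu_fetch_index(unsigned int ea)
{
    return __spe_cache_rd(unsigned int, ea);
}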
Extras/software_cache/cache/include/nway-opt.h (vendored, normal file, 153 lines)
@@ -0,0 +1,153 @@
/* --------------------------------------------------------------- */
/* PLEASE DO NOT MODIFY THIS SECTION                                */
/* This prolog section is automatically generated.                  */
/*                                                                  */
/* (C) Copyright 2001,2006,                                         */
/* International Business Machines Corporation,                     */
/*                                                                  */
/* All Rights Reserved.                                             */
/* --------------------------------------------------------------- */
/* PROLOG END TAG zYx                                               */
/* nway-opt.h
 *
 * Copyright (C) 2006 IBM Corp.
 *
 * "Optimized" lookup operations for n-way set associative
 * software managed cache.
 */
#include <spu_intrinsics.h>

#ifndef __SPE_CACHE_NWAY_OPT_H_
#define __SPE_CACHE_NWAY_OPT_H_

/* __spe_cache_rd
 * Look up and return data from the cache. If the data
 * is not currently in cache then transfer it from main
 * storage.
 *
 * This code uses a conditional branch to the cache miss
 * handler in the event that the requested data is not
 * in the cache. A branch hint is used to avoid paying
 * the branch stall penalty.
 */
#define __spe_cache_rd(type, ea)                                \
({                                                              \
    int set, idx, lnum, byte;                                   \
    type ret;                                                   \
    _spe_cache_nway_lookup_(ea, set, idx);                      \
    if (unlikely(idx < 0)) {                                    \
        idx = _spe_cache_miss_(ea, set, -1);                    \
        spu_writech(22, SPE_CACHE_SET_TAGMASK(set));            \
        spu_mfcstat(MFC_TAG_UPDATE_ALL);                        \
    }                                                           \
    lnum = _spe_cacheline_num_(set, idx);                       \
    byte = _spe_cacheline_byte_offset_(ea);                     \
    ret = *((type *) (&spe_cache_mem[lnum + byte]));            \
    ret;                                                        \
})

/**
 * __spe_cache_rd_x4
 * Fetch four data elements from the cache.
 *
 * This code uses one conditional branch in
 * the event that any of the four elements
 * are missing.
 *
 * On a miss, light weight locking is used to
 * avoid casting out entries that were found.
 * Further, we wait just once for the transfers,
 * allowing for parallel [rather than serial]
 * transfers.
 */
#define __spe_cache_rd_x4(type, ea_x4)                          \
({                                                              \
    vector unsigned int missing;                                \
    unsigned int ms;                                            \
    vector unsigned int cindex;                                 \
    unsigned int d0, d1, d2, d3;                                \
    vector unsigned int s_x4;                                   \
    vector signed int i_x4;                                     \
    vector unsigned int ibyte, iline;                           \
    vector unsigned int ret;                                    \
    unsigned int idx0, idx1, idx2, idx3;                        \
                                                                \
    _spe_cache_nway_lookup_x4(ea_x4, s_x4, i_x4);               \
    missing = spu_rlmask((vector unsigned int)i_x4, -8);        \
    ms = spu_extract(spu_gather(missing), 0);                   \
                                                                \
    ibyte = _spe_cacheline_byte_offset_x4(ea_x4);               \
                                                                \
    iline = _spe_cacheline_num_x4(s_x4,                         \
                                  (vector unsigned int)i_x4);   \
                                                                \
    cindex = spu_add(iline, ibyte);                             \
                                                                \
    idx0 = spu_extract(cindex, 0);                              \
    idx1 = spu_extract(cindex, 1);                              \
    idx2 = spu_extract(cindex, 2);                              \
    idx3 = spu_extract(cindex, 3);                              \
                                                                \
    d0 = *((type *) (&spe_cache_mem[idx0]));                    \
    d1 = *((type *) (&spe_cache_mem[idx1]));                    \
    d2 = *((type *) (&spe_cache_mem[idx2]));                    \
    d3 = *((type *) (&spe_cache_mem[idx3]));                    \
                                                                \
    ret = _load_vec_uint4(d0, d1, d2, d3);                      \
                                                                \
    if (unlikely(ms)) {                                         \
        int b0 = spu_extract(ibyte, 0);                         \
        int b1 = spu_extract(ibyte, 1);                         \
        int b2 = spu_extract(ibyte, 2);                         \
        int b3 = spu_extract(ibyte, 3);                         \
        int lnum0;                                              \
        int lnum1;                                              \
        int lnum2;                                              \
        int lnum3;                                              \
        int s0 = spu_extract(s_x4, 0);                          \
        int s1 = spu_extract(s_x4, 1);                          \
        int s2 = spu_extract(s_x4, 2);                          \
        int s3 = spu_extract(s_x4, 3);                          \
        int i0 = spu_extract(i_x4, 0);                          \
        int i1 = spu_extract(i_x4, 1);                          \
        int i2 = spu_extract(i_x4, 2);                          \
        int i3 = spu_extract(i_x4, 3);                          \
        unsigned int ea0 = spu_extract(ea_x4, 0);               \
        unsigned int ea1 = spu_extract(ea_x4, 1);               \
        unsigned int ea2 = spu_extract(ea_x4, 2);               \
        unsigned int ea3 = spu_extract(ea_x4, 3);               \
        int avail = -1;                                         \
                                                                \
        avail &= ~(((i0 < 0) ? 0 : (1 << i0)) |                 \
                   ((i1 < 0) ? 0 : (1 << i1)) |                 \
                   ((i2 < 0) ? 0 : (1 << i2)) |                 \
                   ((i3 < 0) ? 0 : (1 << i3)));                 \
                                                                \
        i0 = _spe_cache_miss_(ea0, s0, avail);                  \
        avail &= ~(1 << i0);                                    \
        i1 = _spe_cache_miss_(ea1, s1, avail);                  \
        avail &= ~(1 << i1);                                    \
        i2 = _spe_cache_miss_(ea2, s2, avail);                  \
        avail &= ~(1 << i2);                                    \
        i3 = _spe_cache_miss_(ea3, s3, avail);                  \
                                                                \
        lnum0 = _spe_cacheline_num_(s0, i0);                    \
        lnum1 = _spe_cacheline_num_(s1, i1);                    \
        lnum2 = _spe_cacheline_num_(s2, i2);                    \
        lnum3 = _spe_cacheline_num_(s3, i3);                    \
                                                                \
        /* Wait once for all four line fills; combine the tag   \
         * masks of the four sets involved.                   */\
        spu_writech(22, SPE_CACHE_SET_TAGMASK(s0) |             \
                        SPE_CACHE_SET_TAGMASK(s1) |             \
                        SPE_CACHE_SET_TAGMASK(s2) |             \
                        SPE_CACHE_SET_TAGMASK(s3));             \
        spu_mfcstat(MFC_TAG_UPDATE_ALL);                        \
                                                                \
        d0 = *((type *) (&spe_cache_mem[lnum0 + b0]));          \
        d1 = *((type *) (&spe_cache_mem[lnum1 + b1]));          \
        d2 = *((type *) (&spe_cache_mem[lnum2 + b2]));          \
        d3 = *((type *) (&spe_cache_mem[lnum3 + b3]));          \
                                                                \
        ret = _load_vec_uint4(d0, d1, d2, d3);                  \
    }                                                           \
    ret;                                                        \
})

#endif /* __SPE_CACHE_NWAY_OPT_H_ */
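The x4 variant pays off when four reads are independent, such as gathering several triangle indices in one shot. A minimal usage sketch, assuming the same umbrella header; gather4 and the address arithmetic are illustrative assumptions, not taken from Bullet:

#include <spu_intrinsics.h>
#include "spe_cache.h"   /* assumed umbrella header */

/* Fetch four 32-bit values at base + the four byte offsets in ofs.
 * Misses are filled with overlapping DMAs, so a four-way miss
 * waits once rather than four times. */
static inline vector unsigned int
gather4(unsigned int base, vector unsigned int ofs)
{
    vector unsigned int ea_x4 = spu_add(spu_splats(base), ofs);
    return __spe_cache_rd_x4(unsigned int, ea_x4);
}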