root / volk / spu_lib / spu_memcpy_unaligned.c @ cacfd391
History | View | Annotate | Download (10.3 kB)
#include <libvector/libvector_memcpy_unaligned.h>
#include <spu_intrinsics.h>
| 3 | |
| 4 | void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes){ |
| 5 | //loop iterator i
|
| 6 | int i = 0; |
| 7 | void* retval = target;
|
| 8 | |
| 9 | |
| 10 | //put the target and source addresses into qwords
|
| 11 | vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0}; |
| 12 | vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0}; |
| 13 | |
| 14 | //create shuffle masks
|
| 15 | |
| 16 | //shuffle mask building blocks:
|
| 17 | //all from the first vector
|
| 18 | vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
| 19 | 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}; |
| 20 | //all from the second vector
|
| 21 | vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, |
| 22 | 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; |
| 23 | |
| 24 | |
| 25 | |
| 26 | //gamma: second half of the second, first half of the first, break at (unsigned int)src%16
|
| 27 | vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16)); |
| 28 | vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp); |
| 29 | vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp); |
| 30 | vector unsigned char cmp_res = spu_or(gt_res, eq_res); |
| 31 | vector unsigned char sixteen_uchar = spu_splats((unsigned char)16); |
| 32 | vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res); |
| 33 | vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, |
| 34 | (vector unsigned int)oneup); |
| 35 | shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16); |
| 36 | |
| 37 | |
| 38 | |
| 39 | |
| 40 | vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16)); |
| 41 | vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16)); |
| 42 | |
| 43 | //alpha: first half of first, second half of second, break at (unsigned int)target%16
|
| 44 | src_cmp = spu_splats((unsigned char)((unsigned int)target%16)); |
| 45 | gt_res = spu_cmpgt(oneup, src_cmp); |
| 46 | eq_res = spu_cmpeq(oneup, src_cmp); |
| 47 | cmp_res = spu_or(gt_res, eq_res); |
| 48 | phase_change = spu_and(sixteen_uchar, cmp_res); |
| 49 | vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change, |
| 50 | (vector unsigned int)oneup); |
| 51 | |
| 52 | //delta: first half of first, first half of second, break at (unsigned int)target%16
|
| 53 | vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha); |
| 54 | //epsilon: second half of second, second half of first, break at (unsigned int)target%16
|
| 55 | vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha); |
| 56 | //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
|
| 57 | vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); |
| 58 | |
| 59 | //beta: first half of first, second half of second, break at num_bytes%16
|
| 60 | src_cmp = spu_splats((unsigned char)(num_bytes%16)); |
| 61 | gt_res = spu_cmpgt(oneup, src_cmp); |
| 62 | eq_res = spu_cmpeq(oneup, src_cmp); |
| 63 | cmp_res = spu_or(gt_res, eq_res); |
| 64 | phase_change = spu_and(sixteen_uchar, cmp_res); |
| 65 | vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change, |
| 66 | (vector unsigned int)oneup); |
| 67 | |
| 68 | |
| 69 | |
| 70 | |
| 71 | |
| 72 | |
| 73 | qword src_past; |
| 74 | qword src_present; |
| 75 | qword tgt_past; |
| 76 | qword tgt_present; |
| 77 | |
| 78 | qword in_temp; |
| 79 | qword out_temp0; |
| 80 | qword out_temp1; |
| 81 | |
| 82 | src_past = si_lqd((qword)address_counter_src, 0);
|
| 83 | tgt_past = si_lqd((qword)address_counter_tgt, 0);
|
| 84 | |
| 85 | for(i = 0; i < num_bytes/16; ++i) { |
| 86 | |
| 87 | src_present = si_lqd((qword)address_counter_src, 16);
|
| 88 | tgt_present = si_lqd((qword)address_counter_tgt, 16);
|
| 89 | |
| 90 | in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma); |
| 91 | |
| 92 | out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta); |
| 93 | out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon); |
| 94 | |
| 95 | si_stqd(out_temp0, (qword)address_counter_tgt, 0);
|
| 96 | si_stqd(out_temp1, (qword)address_counter_tgt, 16);
|
| 97 | |
| 98 | tgt_past = out_temp1; |
| 99 | src_past = src_present; |
| 100 | address_counter_src = spu_add(address_counter_src, 16);
|
| 101 | address_counter_tgt = spu_add(address_counter_tgt, 16);
|
| 102 | |
| 103 | |
| 104 | } |
| 105 | |
| 106 | src_present = si_lqd((qword)address_counter_src, 16);
|
| 107 | tgt_present = si_lqd((qword)address_counter_tgt, 16);
|
| 108 | |
| 109 | |
| 110 | in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma); |
| 111 | qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta); |
| 112 | qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta); |
| 113 | |
| 114 | |
| 115 | |
| 116 | out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta); |
| 117 | out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon); |
| 118 | |
| 119 | si_stqd(out_temp0, (qword)address_counter_tgt, 0);
|
| 120 | si_stqd(out_temp1, (qword)address_counter_tgt, 16);
|
| 121 | |
| 122 | return retval;
|
| 123 | } |
| 124 | |
| 125 | |
| 126 | |
| 127 | /*
|
| 128 | void* mcpy(void* target, void* src, size_t num_bytes){
|
| 129 | //loop iterator i |
| 130 | int i = 0; |
| 131 | void* retval = src; |
| 132 | |
| 133 | //put the target and source addresses into qwords |
| 134 | vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
|
| 135 | vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0};
|
| 136 | |
| 137 | //create shuffle masks |
| 138 | |
| 139 | //shuffle mask building blocks: |
| 140 | //all from the first vector |
| 141 | vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
|
| 142 | 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}; |
| 143 | //all from the second vector |
| 144 | vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
|
| 145 | 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; |
| 146 | |
| 147 | |
| 148 | |
| 149 | //gamma: second half of the second, first half of the first, break at src%16 |
| 150 | vector unsigned char src_cmp = spu_splats((unsigned char)(src%16)); |
| 151 | vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp); |
| 152 | vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp); |
| 153 | vector unsigned char cmp_res = spu_or(gt_res, eq_res); |
| 154 | vector unsigned char sixteen_uchar = spu_splats((unsigned char)16); |
| 155 | vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res); |
| 156 | vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, |
| 157 | (vector unsigned int)oneup); |
| 158 | shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, src%16); |
| 159 | |
| 160 | |
| 161 | |
| 162 | |
| 163 | vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -(target%16)); |
| 164 | vector unsigned char tgt_first = spu_rlqwbyte(oneup, -(target%16)); |
| 165 | |
| 166 | //alpha: first half of first, second half of second, break at target%16 |
| 167 | src_cmp = spu_splats((unsigned char)(target%16)); |
| 168 | gt_res = spu_cmpgt(oneup, src_cmp); |
| 169 | eq_res = spu_cmpeq(oneup, src_cmp); |
| 170 | cmp_res = spu_or(gt_res, eq_res); |
| 171 | phase_change = spu_and(sixteen_uchar, cmp_res); |
| 172 | vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change, |
| 173 | (vector unsigned int)oneup); |
| 174 | |
| 175 | //delta: first half of first, first half of second, break at target%16 |
| 176 | vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha); |
| 177 | //epsilon: second half of second, second half of first, break at target%16 |
| 178 | vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha); |
| 179 | //zeta: second half of second, first half of first, break at 16 - target%16 |
| 180 | vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, target%16); |
| 181 | |
| 182 | //beta: first half of first, second half of second, break at num_bytes%16 |
| 183 | src_cmp = spu_splats((unsigned char)(num_bytes%16)); |
| 184 | gt_res = spu_cmpgt(oneup, src_cmp); |
| 185 | eq_res = spu_cmpeq(oneup, src_cmp); |
| 186 | cmp_res = spu_or(gt_res, eq_res); |
| 187 | phase_change = spu_and(sixteen_uchar, cmp_res); |
| 188 | vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change, |
| 189 | (vector unsigned int)oneup); |
| 190 | |
| 191 | |
| 192 | printf("num_bytesmod16 %d\n", num_bytes%16);
|
| 193 | printf("beta %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n",
|
| 194 | spu_extract((vector unsigned char) shuffle_mask_beta, 0), |
| 195 | spu_extract((vector unsigned char) shuffle_mask_beta, 1), |
| 196 | spu_extract((vector unsigned char) shuffle_mask_beta, 2), |
| 197 | spu_extract((vector unsigned char) shuffle_mask_beta, 3), |
| 198 | spu_extract((vector unsigned char) shuffle_mask_beta, 4), |
| 199 | spu_extract((vector unsigned char) shuffle_mask_beta, 5), |
| 200 | spu_extract((vector unsigned char) shuffle_mask_beta, 6), |
| 201 | spu_extract((vector unsigned char) shuffle_mask_beta, 7), |
| 202 | spu_extract((vector unsigned char) shuffle_mask_beta, 8), |
| 203 | spu_extract((vector unsigned char) shuffle_mask_beta, 9), |
| 204 | spu_extract((vector unsigned char) shuffle_mask_beta, 10), |
| 205 | spu_extract((vector unsigned char) shuffle_mask_beta, 11), |
| 206 | spu_extract((vector unsigned char) shuffle_mask_beta, 12), |
| 207 | spu_extract((vector unsigned char) shuffle_mask_beta, 13), |
| 208 | spu_extract((vector unsigned char) shuffle_mask_beta, 14), |
| 209 | spu_extract((vector unsigned char) shuffle_mask_beta, 15)); |
| 210 | |
| 211 | |
| 212 | |
| 213 | |
| 214 | |
| 215 | |
| 216 | |
| 217 | qword src_past; |
| 218 | qword src_present; |
| 219 | qword tgt_past; |
| 220 | qword tgt_present; |
| 221 | |
| 222 | qword in_temp; |
| 223 | qword out_temp0; |
| 224 | qword out_temp1; |
| 225 | |
| 226 | src_past = si_lqd((qword)address_counter_src, 0); |
| 227 | tgt_past = si_lqd((qword)address_counter_tgt, 0); |
| 228 | |
| 229 | for(i = 0; i < num_bytes/16; ++i) {
|
| 230 | |
| 231 | src_present = si_lqd((qword)address_counter_src, 16); |
| 232 | tgt_present = si_lqd((qword)address_counter_tgt, 16); |
| 233 | |
| 234 | in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma); |
| 235 | |
| 236 | out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta); |
| 237 | out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon); |
| 238 | |
| 239 | si_stqd(out_temp0, (qword)address_counter_tgt, 0); |
| 240 | si_stqd(out_temp1, (qword)address_counter_tgt, 16); |
| 241 | |
| 242 | tgt_past = out_temp1; |
| 243 | src_past = src_present; |
| 244 | address_counter_src = spu_add(address_counter_src, 16); |
| 245 | address_counter_tgt = spu_add(address_counter_tgt, 16); |
| 246 | |
| 247 | |
| 248 | } |
| 249 | |
| 250 | src_present = si_lqd((qword)address_counter_src, 16); |
| 251 | tgt_present = si_lqd((qword)address_counter_tgt, 16); |
| 252 | |
| 253 | |
| 254 | in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma); |
| 255 | qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta); |
| 256 | qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta); |
| 257 | |
| 258 | |
| 259 | |
| 260 | out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta); |
| 261 | out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon); |
| 262 | |
| 263 | si_stqd(out_temp0, (qword)address_counter_tgt, 0); |
| 264 | si_stqd(out_temp1, (qword)address_counter_tgt, 16); |
| 265 | |
| 266 | return retval; |
| 267 | |
| 268 | } |
| 269 | */ |
| 270 | /*
|
| 271 | int main(){
|
| 272 | |
| 273 | unsigned char pooh[48]; |
| 274 | unsigned char bear[48]; |
| 275 | |
| 276 | int i = 0; |
| 277 | for(i = 0; i < 48; ++i){
|
| 278 | pooh[i] = i; |
| 279 | bear[i] = i; |
| 280 | } |
| 281 | |
| 282 | spu_mcpy(&pooh[9],&bear[3], 15); |
| 283 | |
| 284 | for(i = 0; i < 48; ++i) {
|
| 285 | printf("%d, ", pooh[i]);
|
| 286 | } |
| 287 | printf("\n");
|
| 288 | } |
| 289 | |
| 290 | */ |