// path: root/utils/PerfectShuffle/PerfectShuffle.cpp
// blob: d39414eede949118ca903f7c22ef02fc08bb6d1d (plain)
//===-- PerfectShuffle.cpp - Perfect Shuffle Generator --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file computes an optimal sequence of instructions for doing all shuffles
// of two 4-element vectors.  With a release build and when configured to emit
// an altivec instruction table, this takes about 30s to run on a 2.7Ghz
// PowerPC G5.
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <vector>
struct Operator;

// Masks are 4-nibble hex numbers.  A value of 0-7 in any nibble means that the
// element is taken from that position of the two input vectors (0-3 select
// from the LHS, 4-7 from the RHS).  A value of 8 means the entry is undefined.

// Mask manipulation functions.
/// MakeMask - Pack four element selectors into one 16-bit mask.  V0 lands in
/// the most significant nibble and V3 in the least significant one.
static inline unsigned short MakeMask(unsigned V0, unsigned V1,
                                      unsigned V2, unsigned V3) {
  // Shift the accumulated nibbles up by one position for every new selector.
  unsigned Packed = V0;
  Packed = (Packed << 4) | V1;
  Packed = (Packed << 4) | V2;
  Packed = (Packed << 4) | V3;
  return Packed;
}

/// getMaskElt - Extract selector number Elt from Mask.  Element 0 is the most
/// significant nibble of the 16-bit mask and element 3 the least significant.
static unsigned getMaskElt(unsigned Mask, unsigned Elt) {
  const unsigned ShiftAmt = 4 * (3 - Elt);
  const unsigned Shifted = Mask >> ShiftAmt;
  return Shifted & 0xF;
}

/// setMaskElt - Return a copy of Mask in which selector number Elt (0 is the
/// most significant nibble) has been replaced by NewVal.  NewVal is not
/// masked, so it is expected to fit in a single nibble.
static unsigned setMaskElt(unsigned Mask, unsigned Elt, unsigned NewVal) {
  const unsigned ShiftAmt = 4 * (3 - Elt);
  // Clear the old nibble, then drop the new value into the hole.
  const unsigned Cleared = Mask & ~(0xF << ShiftAmt);
  return Cleared | (NewVal << ShiftAmt);
}

/// isValidMask - Return true if every nibble of Mask is a legal selector, i.e.
/// 0-7 (an input element) or 8 (undef).  Nibble values 9-15 are rejected.
static bool isValidMask(unsigned short Mask) {
  // Walk the nibbles from least to most significant; once the remaining bits
  // are all zero the leftover nibbles are 0 and therefore valid.
  for (unsigned Rest = Mask; Rest != 0; Rest >>= 4)
    if ((Rest & 0xF) > 8)
      return false;
  return true;
}

/// hasUndefElements - Return true if at least one nibble of the mask has its
/// top bit set, which for a valid mask means the element is undef (8).
///
static bool hasUndefElements(unsigned short Mask) {
  for (unsigned Rest = Mask; Rest != 0; Rest >>= 4)
    if (Rest & 0x8)
      return true;
  return false;
}

/// isOnlyLHSMask - Return true if no element of this mask selects from the RHS
/// input (selectors 4-7, which are the ones with bit 2 set).  Undef elements
/// (8) do not count as RHS references.
static bool isOnlyLHSMask(unsigned short Mask) {
  const unsigned RHSBits = Mask & 0x4444;
  return !RHSBits;
}

/// getLHSOnlyMask - Given a mask that refers to its LHS and RHS, modify it to
/// refer to the LHS only (for when one argument value is passed into the same
/// function twice).  This is done by clearing bit 2 of every nibble, which
/// maps selectors 4-7 onto 0-3 and leaves 0-3 and undef (8) untouched.
///
/// NOTE: this helper is currently compiled out by the '#if 0' below and has no
/// callers in this file.
#if 0
static unsigned short getLHSOnlyMask(unsigned short Mask) {
  return Mask & 0xBBBB;  // Keep only LHS and Undefs.
}
#endif

/// getCompressedMask - Turn a 16-bit uncompressed mask (one 4-bit nibble per
/// element) into its compressed form: the four selectors read as the digits of
/// a base-9 number, element 0 being the most significant digit.  The largest
/// result, for <u,u,u,u>, is 6560, so the value fits in 13 bits.
static unsigned getCompressedMask(unsigned short Mask) {
  unsigned Packed = 0;
  for (unsigned Elt = 0; Elt != 4; ++Elt)
    Packed = Packed * 9 + getMaskElt(Mask, Elt);
  return Packed;
}

/// PrintMask - Write the mask to OS in the form "<a,b,c,d>", where each
/// element is printed as a single character: its digit, or 'u' for undef (8).
static void PrintMask(unsigned i, std::ostream &OS) {
  OS << "<";
  for (unsigned Elt = 0; Elt != 4; ++Elt) {
    if (Elt != 0)
      OS << ",";
    const unsigned Sel = getMaskElt(i, Elt);
    OS << static_cast<char>(Sel == 8 ? 'u' : ('0' + Sel));
  }
  OS << ">";
}

/// ShuffleVal - This represents a shufflevector operation.
struct ShuffleVal {
  unsigned Cost;  // Number of instrs used to generate this value.
  Operator *Op;   // The Operation used to generate this value.
  unsigned short Arg0, Arg1;  // Input operands for this value.

  ShuffleVal() : Cost(1000000) {}
};


/// ShufTab - This is the actual shuffle table that we are trying to generate.
/// It is indexed directly by the 16-bit uncompressed mask, so it has one slot
/// for every possible unsigned short value, including the invalid masks that
/// are never filled in.
///
static ShuffleVal ShufTab[65536];

/// TheOperators - All of the operators that this target supports.  Every
/// Operator adds itself to this list from its constructor and removes itself
/// in its destructor.  The operators are global objects defined further down
/// in this file, so this vector has to be defined before any of them.
static std::vector<Operator*> TheOperators;

/// Operator - This is a vector operation that is available for use.  Each
/// operator is described by the shuffle mask it applies to its two inputs.
/// Constructing an Operator registers it in TheOperators; destroying it
/// removes it again, which requires destruction in reverse order of
/// construction.
struct Operator {
  unsigned short ShuffleMask;  // Result mask when applied to (LHS, RHS).
  unsigned short OpNum;        // Opcode number emitted into the table.
  const char *Name;            // Name used in the printed output.
  unsigned Cost;               // Cost charged for using this operation.

  /// Create the operator and append it to TheOperators.
  Operator(unsigned short shufflemask, const char *name, unsigned opnum,
           unsigned cost = 1)
    : ShuffleMask(shufflemask), OpNum(opnum), Name(name), Cost(cost) {
    TheOperators.push_back(this);
  }

  /// Unregister the operator.  It must be the most recently registered one.
  ~Operator() {
    assert(TheOperators.back() == this);
    TheOperators.pop_back();
  }

  /// isOnlyLHSOperator - Return true if this operation never reads its second
  /// input.
  bool isOnlyLHSOperator() const {
    return isOnlyLHSMask(ShuffleMask);
  }

  const char *getName() const { return Name; }
  unsigned getCost() const { return Cost; }

  /// getTransformedMask - Return the mask obtained by applying this operation
  /// to two values that are themselves described by LHSMask and RHSMask.
  unsigned short getTransformedMask(unsigned short LHSMask,
                                    unsigned RHSMask) const {
    // Extract the elements from LHSMask and RHSMask, as appropriate.  Nibble
    // i of the result is chosen by nibble i of ShuffleMask.
    unsigned Result = 0;
    for (unsigned i = 0; i != 4; ++i) {
      unsigned SrcElt = (ShuffleMask >> (4*i)) & 0xF;
      unsigned ResElt;
      if (SrcElt < 4)
        ResElt = getMaskElt(LHSMask, SrcElt);
      else if (SrcElt < 8)
        ResElt = getMaskElt(RHSMask, SrcElt-4);
      else {
        assert(SrcElt == 8 && "Bad src elt!");
        ResElt = 8;
      }
      Result |= ResElt << (4*i);
    }
    return Result;
  }

private:
  // A copy would never be registered in TheOperators, yet its destructor
  // would still try to unregister it, so copying is disallowed.
  Operator(const Operator &);             // DO NOT IMPLEMENT
  Operator &operator=(const Operator &);  // DO NOT IMPLEMENT
};

/// getZeroCostOpName - Return "LHS" or "RHS" for a table entry that stands for
/// one of the two shuffle inputs.  The input is identified through the Arg0
/// recorded in the entry.  Any other entry is a programming error and aborts.
static const char *getZeroCostOpName(unsigned short Op) {
  const unsigned short Source = ShufTab[Op].Arg0;
  if (Source == 0x0123)
    return "LHS";
  if (Source == 0x4567)
    return "RHS";

  assert(0 && "bad zero cost operation");
  abort();
}

static void PrintOperation(unsigned ValNo, unsigned short Vals[]) {
  unsigned short ThisOp = Vals[ValNo];
  std::cerr << "t" << ValNo;
  PrintMask(ThisOp, std::cerr);
  std::cerr << " = " << ShufTab[ThisOp].Op->getName() << "(";

  if (ShufTab[ShufTab[ThisOp].Arg0].Cost == 0) {
    std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg0);
    PrintMask(ShufTab[ThisOp].Arg0, std::cerr);
  } else {
    // Figure out what tmp # it is.
    for (unsigned i = 0; ; ++i)
      if (Vals[i] == ShufTab[ThisOp].Arg0) {
        std::cerr << "t" << i;
        break;
      }
  }

  if (!ShufTab[Vals[ValNo]].Op->isOnlyLHSOperator()) {
    std::cerr << ", ";
    if (ShufTab[ShufTab[ThisOp].Arg1].Cost == 0) {
      std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg1);
      PrintMask(ShufTab[ThisOp].Arg1, std::cerr);
    } else {
      // Figure out what tmp # it is.
      for (unsigned i = 0; ; ++i)
        if (Vals[i] == ShufTab[ThisOp].Arg1) {
          std::cerr << "t" << i;
          break;
        }
    }
  }
  std::cerr << ")  ";
}

/// getNumEntered - Count the table slots that have been filled in so far, i.e.
/// whose cost is below 100 and thus no longer the "unreachable" sentinel.
static unsigned getNumEntered() {
  unsigned Established = 0;
  for (unsigned Mask = 0; Mask < 65536; ++Mask)
    if (ShufTab[Mask].Cost < 100)
      ++Established;
  return Established;
}

/// EvaluateOps - Append to Vals every not-yet-listed operation needed to
/// compute Elt, operands before users, and bump NumVals accordingly.  Values
/// already present in Vals are shared, so NumVals ends up being the number of
/// distinct instructions required.  Vals is not bounds-checked here; the
/// caller must provide an array that is large enough.
static void EvaluateOps(unsigned short Elt, unsigned short Vals[],
                        unsigned &NumVals) {
  // The shuffle inputs (and anything else of cost zero) need no instruction.
  if (ShufTab[Elt].Cost == 0)
    return;

  // If this value has already been evaluated, it is free.  FIXME: match undefs.
  for (unsigned Idx = 0; Idx != NumVals; ++Idx)
    if (Vals[Idx] == Elt)
      return;

  // Otherwise schedule the operands first, then the value itself.
  const unsigned First = ShufTab[Elt].Arg0;
  const unsigned Second = ShufTab[Elt].Arg1;
  if (ShufTab[First].Cost != 0)
    EvaluateOps(First, Vals, NumVals);
  if (First != Second && ShufTab[Second].Cost != 0)
    EvaluateOps(Second, Vals, NumVals);

  Vals[NumVals] = Elt;
  ++NumVals;
}


/// main - Compute the shuffle table and print it.
///
/// The table is seeded with the two shuffle inputs at cost zero and then
/// refined until an iteration makes no more changes: every iteration first
/// propagates costs into masks that contain undefs, then tries every
/// registered operator on every pair of already-reachable masks.  Progress
/// messages go to std::cerr; the cost histogram and the C source of
/// PerfectShuffleTable go to std::cout.
int main() {
  // Seed the table with accesses to the LHS and RHS.  Arg1 of the seeds is
  // not assigned here and keeps its initial value.
  ShufTab[0x0123].Cost = 0;
  ShufTab[0x0123].Op = 0;
  ShufTab[0x0123].Arg0 = 0x0123;
  ShufTab[0x4567].Cost = 0;
  ShufTab[0x4567].Op = 0;
  ShufTab[0x4567].Arg0 = 0x4567;

  // Seed the first-level of shuffles, shuffles whose inputs are the input to
  // the vectorshuffle operation.
  bool MadeChange = true;
  unsigned OpCount = 0;
  while (MadeChange) {
    MadeChange = false;
    ++OpCount;
    std::cerr << "Starting iteration #" << OpCount << " with "
              << getNumEntered() << " entries established.\n";

    // Scan the table for two reasons: First, compute the maximum cost of any
    // operation left in the table.  Second, make sure that values with undefs
    // have the cheapest alternative that they match.
    //
    // The loop bound 0x8889 is one past 0x8888 (<u,u,u,u>), the largest valid
    // mask.
    unsigned MaxCost = ShufTab[0].Cost;
    for (unsigned i = 1; i != 0x8889; ++i) {
      if (!isValidMask(i)) continue;
      if (ShufTab[i].Cost > MaxCost)
        MaxCost = ShufTab[i].Cost;

      // If this value has an undef, make it be computed the cheapest possible
      // way of any of the things that it matches.
      if (hasUndefElements(i)) {
        // This code is a little bit tricky, so here's the idea: consider some
        // permutation, like 7u4u.  To compute the lowest cost for 7u4u, we
        // need to take the minimum cost of all of 7[0-8]4[0-8], 81 entries.  If
        // there are 3 undefs, the number rises to 729 entries we have to scan,
        // and for the 4 undef case, we have to scan the whole table.
        //
        // Instead of doing this huge amount of scanning, we process the table
        // entries *in order*, and use the fact that 'u' is 8, larger than any
        // valid index.  Given an entry like 7u4u then, we only need to scan
        // 7[0-7]4u - 8 entries.  We can get away with this, because we already
        // know that each of 704u, 714u, 724u, etc contain the minimum value of
        // all of the 704[0-8], 714[0-8] and 724[0-8] entries respectively.

        // Pick the leftmost undef element; that is the one we expand.
        unsigned UndefIdx;
        if (i & 0x8000)
          UndefIdx = 0;
        else if (i & 0x0800)
          UndefIdx = 1;
        else if (i & 0x0080)
          UndefIdx = 2;
        else if (i & 0x0008)
          UndefIdx = 3;
        else
          abort();

        unsigned MinVal  = i;
        unsigned MinCost = ShufTab[i].Cost;

        // Scan the 8 entries.
        for (unsigned j = 0; j != 8; ++j) {
          unsigned NewElt = setMaskElt(i, UndefIdx, j);
          if (ShufTab[NewElt].Cost < MinCost) {
            MinCost = ShufTab[NewElt].Cost;
            MinVal = NewElt;
          }
        }

        // If we found something cheaper than what was here before, use it.
        // The whole entry is copied, so the undef mask is computed exactly
        // like the concrete mask it was matched against.
        if (i != MinVal) {
          MadeChange = true;
          ShufTab[i] = ShufTab[MinVal];
        }
      }
    }

    for (unsigned LHS = 0; LHS != 0x8889; ++LHS) {
      if (!isValidMask(LHS)) continue;
      // Skip masks that have not been reached yet.
      if (ShufTab[LHS].Cost > 1000) continue;

      // If nothing involving this operand could possibly be cheaper than what
      // we already have, don't consider it.
      if (ShufTab[LHS].Cost + 1 >= MaxCost)
        continue;

      for (unsigned opnum = 0, e = TheOperators.size(); opnum != e; ++opnum) {
        Operator *Op = TheOperators[opnum];

        // Evaluate op(LHS,LHS)
        unsigned ResultMask = Op->getTransformedMask(LHS, LHS);

        unsigned Cost = ShufTab[LHS].Cost + Op->getCost();
        if (Cost < ShufTab[ResultMask].Cost) {
          ShufTab[ResultMask].Cost = Cost;
          ShufTab[ResultMask].Op = Op;
          ShufTab[ResultMask].Arg0 = LHS;
          ShufTab[ResultMask].Arg1 = LHS;
          MadeChange = true;
        }

        // If this is a two input instruction, include the op(x,y) cases.  If
        // this is a one input instruction, skip this.
        if (Op->isOnlyLHSOperator()) continue;

        for (unsigned RHS = 0; RHS != 0x8889; ++RHS) {
          if (!isValidMask(RHS)) continue;
          if (ShufTab[RHS].Cost > 1000) continue;

          // If nothing involving this operand could possibly be cheaper than
          // what we already have, don't consider it.
          if (ShufTab[RHS].Cost + 1 >= MaxCost)
            continue;


          // Evaluate op(LHS,RHS)
          unsigned ResultMask = Op->getTransformedMask(LHS, RHS);

          // Cheap rejection before the more expensive exact cost computation:
          // skip results whose current cost is already no higher than the
          // iteration number or than the cost of either operand.
          if (ShufTab[ResultMask].Cost <= OpCount ||
              ShufTab[ResultMask].Cost <= ShufTab[LHS].Cost ||
              ShufTab[ResultMask].Cost <= ShufTab[RHS].Cost)
            continue;

          // Figure out the cost to evaluate this, knowing that CSE's only need
          // to be evaluated once.  Vals has a fixed capacity of 30 entries and
          // EvaluateOps does not check it.
          unsigned short Vals[30];
          unsigned NumVals = 0;
          EvaluateOps(LHS, Vals, NumVals);
          EvaluateOps(RHS, Vals, NumVals);

          unsigned Cost = NumVals + Op->getCost();
          if (Cost < ShufTab[ResultMask].Cost) {
            ShufTab[ResultMask].Cost = Cost;
            ShufTab[ResultMask].Op = Op;
            ShufTab[ResultMask].Arg0 = LHS;
            ShufTab[ResultMask].Arg1 = RHS;
            MadeChange = true;
          }
        }
      }
    }
  }

  std::cerr << "Finished Table has " << getNumEntered()
            << " entries established.\n";

  // Slots 0-8 count the valid masks with exactly that cost; slot 9 collects
  // everything more expensive.
  unsigned CostArray[10] = { 0 };

  // Compute a cost histogram.
  for (unsigned i = 0; i != 65536; ++i) {
    if (!isValidMask(i)) continue;
    if (ShufTab[i].Cost > 9)
      ++CostArray[9];
    else
      ++CostArray[ShufTab[i].Cost];
  }

  for (unsigned i = 0; i != 9; ++i)
    if (CostArray[i])
      std::cout << "// " << CostArray[i] << " entries have cost " << i << "\n";
  if (CostArray[9])
    std::cout << "// " << CostArray[9] << " entries have higher cost!\n";


  // Build up the table to emit.  There is one row per valid mask (9^4 = 6561
  // of them), emitted in increasing mask order, followed by a final 0 entry.
  std::cout << "\n// This table is 6561*4 = 26244 bytes in size.\n";
  std::cout << "static const unsigned PerfectShuffleTable[6561+1] = {\n";

  for (unsigned i = 0; i != 0x8889; ++i) {
    if (!isValidMask(i)) continue;

    // CostSat - The cost of this operation saturated to two bits.  Costs 0
    // and 1 both encode as 0, and every cost of 4 or more encodes as 3.
    unsigned CostSat = ShufTab[i].Cost;
    if (CostSat > 4) CostSat = 4;
    if (CostSat == 0) CostSat = 1;
    --CostSat;  // Cost is now between 0-3.

    // Entries without an operator (the plain inputs) are emitted as opcode 0.
    unsigned OpNum = ShufTab[i].Op ? ShufTab[i].Op->OpNum : 0;
    assert(OpNum < 16 && "Too few bits to encode operation!");

    unsigned LHS = getCompressedMask(ShufTab[i].Arg0);
    unsigned RHS = getCompressedMask(ShufTab[i].Arg1);

    // Encode this as 2 bits of saturated cost, 4 bits of opcodes, 13 bits of
    // LHS, and 13 bits of RHS = 32 bits.
    unsigned Val = (CostSat << 30) | (OpNum << 26) | (LHS << 13) | RHS;

    // The trailing comment on each row shows the mask, its unsaturated cost,
    // the operation and its operand(s) in human-readable form.
    std::cout << "  " << std::setw(10) << Val << "U, // ";
    PrintMask(i, std::cout);
    std::cout << ": Cost " << ShufTab[i].Cost;
    std::cout << " " << (ShufTab[i].Op ? ShufTab[i].Op->getName() : "copy");
    std::cout << " ";
    if (ShufTab[ShufTab[i].Arg0].Cost == 0) {
      std::cout << getZeroCostOpName(ShufTab[i].Arg0);
    } else {
      PrintMask(ShufTab[i].Arg0, std::cout);
    }

    if (ShufTab[i].Op && !ShufTab[i].Op->isOnlyLHSOperator()) {
      std::cout << ", ";
      if (ShufTab[ShufTab[i].Arg1].Cost == 0) {
        std::cout << getZeroCostOpName(ShufTab[i].Arg1);
      } else {
        PrintMask(ShufTab[i].Arg1, std::cout);
      }
    }
    std::cout << "\n";
  }
  std::cout << "  0\n};\n";

  // Debugging aid, disabled by default: dump the full instruction sequence
  // for every reachable mask to std::cerr.
  if (0) {
    // Print out the table.
    for (unsigned i = 0; i != 0x8889; ++i) {
      if (!isValidMask(i)) continue;
      if (ShufTab[i].Cost < 1000) {
        PrintMask(i, std::cerr);
        std::cerr << " - Cost " << ShufTab[i].Cost << " - ";

        unsigned short Vals[30];
        unsigned NumVals = 0;
        EvaluateOps(i, Vals, NumVals);

        for (unsigned j = 0, e = NumVals; j != e; ++j)
          PrintOperation(j, Vals);
        std::cerr << "\n";
      }
    }
  }
}


#if defined(GENERATE_ALTIVEC)

///===---------------------------------------------------------------------===//
/// AltiVec operator set.  Everything in this section is specific to AltiVec;
/// each global object below registers one operator when it is constructed, in
/// the order in which the objects are defined.
///===---------------------------------------------------------------------===//

// These opcode numbers have to stay in sync with the ones in the PPC backend.
enum {
  OP_COPY = 0,   // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  OP_VMRGHW,
  OP_VMRGLW,
  OP_VSPLTISW0,
  OP_VSPLTISW1,
  OP_VSPLTISW2,
  OP_VSPLTISW3,
  OP_VSLDOI4,
  OP_VSLDOI8,
  OP_VSLDOI12
};

// Merge high: interleaves elements 0 and 1 of both inputs, <0,4,1,5>.
struct vmrghw : public Operator {
  vmrghw() : Operator(0x0415, "vmrghw", OP_VMRGHW) {}
};
vmrghw the_vmrghw;

// Merge low: interleaves elements 2 and 3 of both inputs, <2,6,3,7>.
struct vmrglw : public Operator {
  vmrglw() : Operator(0x2637, "vmrglw", OP_VMRGLW) {}
};
vmrglw the_vmrglw;

// Splat: replicates LHS element Idx into all four result elements.
template<unsigned Idx>
struct vspltisw : public Operator {
  vspltisw(const char *Label, unsigned Opcode)
    : Operator(MakeMask(Idx, Idx, Idx, Idx), Label, Opcode) {}
};

vspltisw<0> the_vspltisw0("vspltisw0", OP_VSPLTISW0);
vspltisw<1> the_vspltisw1("vspltisw1", OP_VSPLTISW1);
vspltisw<2> the_vspltisw2("vspltisw2", OP_VSPLTISW2);
vspltisw<3> the_vspltisw3("vspltisw3", OP_VSPLTISW3);

// Shift: selects four consecutive elements of the LHS:RHS concatenation,
// starting at element Start.
template<unsigned Start>
struct vsldoi : public Operator {
  vsldoi(const char *Label, unsigned Opcode)
    : Operator(MakeMask(Start % 8, (Start + 1) % 8,
                        (Start + 2) % 8, (Start + 3) % 8),
               Label, Opcode) {}
};

vsldoi<1> the_vsldoi1("vsldoi4" , OP_VSLDOI4);
vsldoi<2> the_vsldoi2("vsldoi8" , OP_VSLDOI8);
vsldoi<3> the_vsldoi3("vsldoi12", OP_VSLDOI12);

#endif

// Target selection.  NEON is the default: it is enabled whenever no target was
// requested on the compiler command line.  The macro is only defined here if
// it is not defined already, so building with -DGENERATE_NEON does not cause a
// macro redefinition, and building with -DGENERATE_ALTIVEC no longer pulls in
// the NEON operators as well.
#if !defined(GENERATE_ALTIVEC) && !defined(GENERATE_NEON)
#define GENERATE_NEON
#endif

// The two operator sets cannot coexist: both declare OP_COPY, and the table
// is meant to be generated for a single target.
#if defined(GENERATE_ALTIVEC) && defined(GENERATE_NEON)
#error "Define only one of GENERATE_ALTIVEC and GENERATE_NEON."
#endif

#ifdef GENERATE_NEON

///===---------------------------------------------------------------------===//
/// The NEON instruction definitions.  Each global object below registers one
/// operator when it is constructed, in the order in which the objects are
/// defined.
///===---------------------------------------------------------------------===//

// Opcode numbers as emitted into the generated table.  They have to fit in
// four bits (main asserts OpNum < 16).
enum {
  OP_COPY = 0,   // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  OP_VREV,
  OP_VDUP0,
  OP_VDUP1,
  OP_VDUP2,
  OP_VDUP3,
  OP_VEXT1,
  OP_VEXT2,
  OP_VEXT3,
  OP_VUZPL, // VUZP, left result
  OP_VUZPR, // VUZP, right result
  OP_VZIPL, // VZIP, left result
  OP_VZIPR, // VZIP, right result
  OP_VTRNL, // VTRN, left result
  OP_VTRNR  // VTRN, right result
};

// Swaps the elements of each adjacent pair of the LHS: <1,0,3,2>.
struct vrev : public Operator {
  vrev() : Operator(0x1032, "vrev", OP_VREV) {}
} the_vrev;

// Replicates LHS element Elt into all four result elements.
template<unsigned Elt>
struct vdup : public Operator {
  vdup(const char *N, unsigned Opc)
    : Operator(MakeMask(Elt, Elt, Elt, Elt), N, Opc) {}
};

vdup<0> the_vdup0("vdup0", OP_VDUP0);
vdup<1> the_vdup1("vdup1", OP_VDUP1);
vdup<2> the_vdup2("vdup2", OP_VDUP2);
vdup<3> the_vdup3("vdup3", OP_VDUP3);

// Selects four consecutive elements of the LHS:RHS concatenation, starting at
// element N.
template<unsigned N>
struct vext : public Operator {
  vext(const char *Name, unsigned Opc)
    : Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), Name, Opc) {
  }
};

vext<1> the_vext1("vext1", OP_VEXT1);
vext<2> the_vext2("vext2", OP_VEXT2);
vext<3> the_vext3("vext3", OP_VEXT3);

// The operators below are each registered with a cost of 2 instead of the
// default cost of 1.

// Even-numbered elements of both inputs: <0,2,4,6>.
struct vuzpl : public Operator {
  vuzpl() : Operator(0x0246, "vuzpl", OP_VUZPL, 2) {}
} the_vuzpl;

// Odd-numbered elements of both inputs: <1,3,5,7>.
struct vuzpr : public Operator {
  vuzpr() : Operator(0x1357, "vuzpr", OP_VUZPR, 2) {}
} the_vuzpr;

// Interleaves the low halves of both inputs: <0,4,1,5>.
struct vzipl : public Operator {
  vzipl() : Operator(0x0415, "vzipl", OP_VZIPL, 2) {}
} the_vzipl;

// Interleaves the high halves of both inputs: <2,6,3,7>.
struct vzipr : public Operator {
  vzipr() : Operator(0x2637, "vzipr", OP_VZIPR, 2) {}
} the_vzipr;

// Even-numbered elements of each input, paired up: <0,4,2,6>.
struct vtrnl : public Operator {
  vtrnl() : Operator(0x0426, "vtrnl", OP_VTRNL, 2) {}
} the_vtrnl;

// Odd-numbered elements of each input, paired up: <1,5,3,7>.
struct vtrnr : public Operator {
  vtrnr() : Operator(0x1537, "vtrnr", OP_VTRNR, 2) {}
} the_vtrnr;

#endif