Here are the results sorted by number of masks calculated at the same time.
Each mask needs at most 7 operations if calculated separately:
a01 = b0 & b1
a23 = b2 & b3
r01 = b0 | b1
r23 = b2 | b3
m0 = ~(r01 | r23) // 4 ops
m1 = (a01 | r23) ^ (r01 | a23) // 7 ops
m2 = (r01 & r23) ^ (a01 | a23) // 7 ops
m3 = (r01 & r23) & (a01 ^ a23) // 7 ops
m4 = a01 & a23 // 3 ops
There are lots of common sub-expressions here, so if you need to know any pair of masks at once, you need at most 10 operations (less with m0
or m4
).
But calculation of some pairs may be optimized even further:
// m2,m3 in 9 ops
t1 = r01 & r23
t2 = a01 ^ a23
m2 = t1 ^ (a23 | t2)
m3 = t1 & t2
// m1,m3 in 9 ops
t1 = r01 ^ r23
t2 = a01 ^ a23
t3 = t1 ^ t2
m1 = t1 & t3
m3 = t2 & t3
Same approach works for mask triples. Only one triple (m1
,m2
,m3
) may be computed faster, in 11 operations, with "sort the bits" approach, which is optimal.
If you need 4 or 5 masks at once, I believe, "sort the bits" approach would give optimal result.
Some more optimizations are possible if we allow one more operation (NAND). Actually, last 3 lines of the last code snippet may be replaced with 2 NANDs:
// m1,m3 in 8 ops (NAND is a single op)
t1 = r01 ^ r23
t2 = a01 ^ a23
m1 = t1 & ~t2
m3 = ~t1 & t2
And (m1,m2,m3) triple also could be optimized with NANDs:
// m1,m2,m3 in 10 ops (NAND is a single op)
x01 = b0 ^ b1
x23 = b2 ^ b3
a01 = b0 & b1
a23 = b2 & b3
t1 = x01 ^ x23
t2 = a01 ^ a23
m1 = t1 & ~t2
m2 = ~t1 & (x23 ^ t2)
m3 = t1 & t2
Add one more operation m4 = a01 & a23
to get all masks except m0
in 11 operations.
These results where obtained with help of exhaustive search code (see below). This code uses some simplifying assumptions to be able to run fast enough. These assumptions are not obvious, which makes this code not a very good tool to prove optimality of the results. At least the results for separate masks are optimal, which is proven by code from other answer. Optimality of results for mask pairs and triples are "proven" by my code, which means that most likely you cannot do it faster.
Here is the code (C++14 or C++11 plus binary literals):
/* Search for minimal logical expression (using &|^ operations) for bits set
* exactly N times (in a group of 4 bits).
*
* Uses brute force approach to get one or two expressions for one or two
* values of N at once. To make it possible getting results in reasonable time
* some simplifications were made:
* - first 4 operations pre-defined: &| or ^& or ^| for 2 pairs of input values
* - number of ops limited, if no result found within limit, print "impossible"
* - no attempts to perform operation on 2 expr with the same left and the same
* right parts
* - unused nodes are not allowed (to minimize number of duplicate attempts)
* - no attempt to use "not" (except for "m0")
*
* Also these optimizations were tried (with no significant effect):
* - no more than 2 different ops on same pair of sub-expressions
* - do not apply same op on same pair of sub-expressions more than once
*
* operation set may be extended by "nand" (kNAnd option)
*/
#include <algorithm>
#include <array>
#include <iostream>
#include <bitset>
#include <thread>
#include <mutex>
#include <cassert>
using namespace std;
enum {
kMaxSize = 17,
kNTargets = 5,
kNInputs = 4,
kNil = 255,
kNAnd = 0,
};
enum Op {
OpAnd = kNInputs,
OpOr,
OpXor,
OpNAndL,
OpNAndR,
};
array<const char*, kNInputs + 3> g_op_str {
"b0", "b1", "b2", "b3",
" & ", " | ", " ^ ",
};
array<unsigned, kNTargets> g_target_masks {
0b0111111111111111, // gives correct result only after additional "not"
0b0111100000000000,
0b0000011111100000,
0b0000000000011110,
0b0000000000000001,
};
// 0111122222233334
array<unsigned, kNInputs> g_literal_vals {
0b0100011100001111,
0b0010010011010111,
0b0001001010111011,
0b0000100101111101,
};
unsigned g_targets = 0;
unsigned g_score_limit = 0;
mutex g_print_mutex;
template<typename C, typename T>
ptrdiff_t findIndex(C c, T t)
{
auto it = find(begin(c), end(c), t);
return it - begin(c);
}
struct DAGNode
{
unsigned value;
uint8_t op;
bool l_not;
bool r_not;
uint8_t left;
uint8_t right;
uint8_t use_cnt;
void clear()
{
use_cnt = 0;
}
void setLit(const uint8_t lit, const unsigned v)
{
value = v;
op = lit;
l_not = false;
r_not = false;
left = kNil;
right = kNil;
use_cnt = 0;
}
};
struct DAG
{
array<DAGNode, kMaxSize> nodes;
unsigned size;
unsigned score;
void print(ostream& out, size_t ind)
{
auto& node = nodes[ind];
if (node.op < kNInputs)
{
out << g_op_str[node.op];
}
else
{
out << '(';
if (node.l_not) out << '~';
print(out, node.left);
out << g_op_str[node.op];
if (node.r_not) out << '~';
print(out, node.right);
out << ')';
}
}
void printAll(ostream& out)
{
for (size_t i = 2 * kNInputs; i < size; ++i)
{
auto& node = nodes[i];
auto ind = findIndex(g_target_masks, node.value);
if ((1 << ind) & g_targets)
{
out << 'm' << static_cast<char>('0' + ind) << " = ";
print(out, i);
out << '\n';
}
}
}
};
bool operator < (const DAG& l, const DAG& r)
{
return l.score < r.score;
}
class Find
{
using SPA = array<uint8_t, (kMaxSize - kNInputs) * (kMaxSize - kNInputs)>;
using EDA = bitset<(kMaxSize - kNInputs) * (kMaxSize - kNInputs) * 5>;
SPA same_pair_;
EDA dup_op_;
DAG dag_;
DAG best_;
unsigned ops_;
unsigned targets_;
unsigned unused_;
class UseCnt
{
unsigned& unused_;
uint8_t& use_cnt_;
public:
UseCnt(unsigned& unused, uint8_t& use_cnt)
: unused_(unused)
, use_cnt_(use_cnt)
{
if (!use_cnt_)
--unused_;
++use_cnt_;
}
~UseCnt()
{
--use_cnt_;
if (!use_cnt_)
++unused_;
}
};
class PairLim
{
uint8_t& counter_;
public:
PairLim(SPA& spa, size_t l, size_t r)
: counter_(spa[(kMaxSize - kNInputs) * l + r])
{
++counter_;
}
bool exceeded()
{
return counter_ > 2;
}
~PairLim()
{
--counter_;
}
};
class DupLim
{
EDA& eda_;
size_t ind_;
public:
DupLim(EDA& eda, size_t l, size_t r, size_t op)
: eda_(eda)
, ind_(5 * ((kMaxSize - kNInputs) * l + r) + op - kNInputs)
{
eda_.flip(ind_);
}
bool used()
{
return !eda_.test(ind_);
}
~DupLim()
{
eda_.flip(ind_);
}
};
unsigned getPos(uint8_t l)
{
return dag_.nodes[l].value;
}
bool tryNode(uint8_t l, uint8_t r, uint8_t op)
{
//DupLim dl(dup_op_, l, r, op);
//if (dl.used())
// return false;
addNode(l, r, op);
auto node = dag_.nodes[dag_.size - 1];
const auto ind = findIndex(g_target_masks, node.value);
const auto m = (1 << ind) & targets_;
if (m)
{
++node.use_cnt;
--unused_;
if (targets_ == m)
{
best_ = dag_;
--dag_.size;
--dag_.score;
return true;
}
targets_ &= ~m;
}
search();
if (!m)
{
--unused_;
}
targets_ |= m;
--dag_.size;
--dag_.score;
return false;
}
public:
Find()
: ops_(kNInputs)
, targets_(g_targets)
, unused_(0)
{
dag_.score = 0;
dag_.size = kNInputs;
best_.score = g_score_limit;
best_.size = 0;
for (int i = 0; i < kNInputs; ++i)
dag_.nodes[i].setLit(static_cast<uint8_t>(i), g_literal_vals[i]);
fill(begin(same_pair_), end(same_pair_), 0);
}
void addNode(const uint8_t l, const uint8_t r, uint8_t op)
{
auto& node = dag_.nodes[dag_.size];
switch (op)
{
case OpAnd:
node.value = getPos(l) & getPos(r);
break;
case OpOr:
node.value = getPos(l) | getPos(r);
break;
case OpXor:
node.value = getPos(l) ^ getPos(r);
break;
case OpNAndL:
node.value = ~getPos(l) & getPos(r);
break;
case OpNAndR:
node.value = getPos(l) & ~getPos(r);
break;
default:
assert(false);
}
node.op = op;
node.l_not = false;
node.r_not = false;
node.left = l;
node.right = r;
node.use_cnt = 0;
if (op == OpNAndL)
{
node.l_not = true;
node.op = OpAnd;
}
else if (op == OpNAndR)
{
node.r_not = true;
node.op = OpAnd;
}
++dag_.size;
++dag_.score;
++unused_;
}
void search()
{
if (dag_.score >= best_.score)
return;
for (uint8_t i_r = kNTargets; i_r < dag_.size; ++i_r)
{
UseCnt uc_r(unused_, dag_.nodes[i_r].use_cnt);
if (unused_ > 2 * (best_.score - dag_.score) - 1)
continue;
for (uint8_t i_l = kNInputs; i_l < i_r; ++i_l)
{
UseCnt uc_l(unused_, dag_.nodes[i_l].use_cnt);
if (unused_ > 2 * (best_.score - dag_.score) - 2)
continue;
if (dag_.nodes[i_l].left == dag_.nodes[i_r].left &&
dag_.nodes[i_l].right == dag_.nodes[i_r].right
)
continue;
PairLim pl(same_pair_, i_l, i_r);
if (pl.exceeded())
continue;
if (tryNode(i_l, i_r, OpAnd))
return;
if (tryNode(i_l, i_r, OpOr))
return;
if (tryNode(i_l, i_r, OpXor))
return;
if (kNAnd)
{
if (tryNode(i_l, i_r, OpNAndL))
return;
if (tryNode(i_l, i_r, OpNAndR))
return;
}
}
}
}
void print(ostream& out, const char* name)
{
if (best_.score < g_score_limit)
{
out << name << " ops = " << best_.score << '\n';
best_.printAll(out);
out << '\n';
}
else
{
out << name << " impossible\n\n";
}
}
void process(ostream& out, const char* name)
{
search();
lock_guard<mutex> lk(g_print_mutex);
print(out, name);
}
};
unsigned readTargets(char* str)
{
unsigned num = 0;
for (; *str; ++str)
{
if (*str >= '0' && *str <= '4')
{
g_targets |= 1 << (*str - '0');
++num;
}
}
return num;
}
void usage()
{
cerr << "Usage: bitcnt [0-4]*\n"
"example: bitcnt 23 (to find targets m2,m3)\n";
exit(1);
}
int main(int argc, char **argv) {
if (argc > 1)
g_score_limit = 6 + 2 * readTargets(argv[1]);
else
usage();
// set score_limit to 10 for m1,m2,m3 (with nand), time ≈ 1h
array<Find, 3> finders;
finders[0].addNode(0, 1, OpAnd);
finders[0].addNode(0, 1, OpOr);
finders[0].addNode(2, 3, OpAnd);
finders[0].addNode(2, 3, OpOr);
finders[1].addNode(0, 1, OpAnd);
finders[1].addNode(0, 1, OpXor);
finders[1].addNode(2, 3, OpAnd);
finders[1].addNode(2, 3, OpXor);
finders[2].addNode(0, 1, OpXor);
finders[2].addNode(0, 1, OpOr);
finders[2].addNode(2, 3, OpXor);
finders[2].addNode(2, 3, OpOr);
auto t0 = thread([&finders]{finders[0].process(cout, "&|");});
auto t1 = thread([&finders]{finders[1].process(cout, "&^");});
auto t2 = thread([&finders]{finders[2].process(cout, "^|");});
t0.join(); t1.join(); t2.join();
}
First version of this answer is still correct but obsoleted by recent results:
m2
in 10 ops:
x1 = b1 ^ b2;
x2 = b3 ^ b4;
m2 = (x1 & x2) | (((b1 & b2) ^ (b3 & b4)) & ~(x1 | x2));
m1
in 8 ops:
m1 = (b1 ^ b2 ^ b3 ^ b4) & ~((b1 & b2) | (b3 & b4));
to get the masks of those bits that are set exactly 'n' of the given bitvectors
– Anhanhaltb1 ^ b2
). Is using temporary variables possible? – Lhasa