[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAG_fn=VrPJj6YowHNki5RGAAs8qvwZpUVN4K9qw=cf4aW7Qw9A@mail.gmail.com>
Date: Fri, 4 Aug 2023 18:07:00 +0200
From: Alexander Potapenko <glider@...gle.com>
To: Yury Norov <yury.norov@...il.com>
Cc: catalin.marinas@....com, will@...nel.org, pcc@...gle.com,
andreyknvl@...il.com, andriy.shevchenko@...ux.intel.com,
linux@...musvillemoes.dk, linux-kernel@...r.kernel.org,
linux-arm-kernel@...ts.infradead.org, eugenis@...gle.com,
syednwaris@...il.com, william.gray@...aro.org,
Arnd Bergmann <arnd@...db.de>
Subject: Re: [PATCH v4 1/5] lib/bitmap: add bitmap_{set,get}_value()
> space >= nbits <=>
> BITS_PER_LONG - offset >= nbits <=>
> offset + nbits <= BITS_PER_LONG
>
> > map[index] &= (fit ? (~(GENMASK(nbits - 1, 0) << offset)) :
>
> So here GENMASK(nbits + offset - 1, offset) is at max:
> GENMASK(BITS_PER_LONG - 1, offset). And it never overflows, which is my
> point. Does it make sense?
It indeed does. Perhaps pulling offset inside GENMASK is not a bug
after all (a simple test does not show any difference between their
behavior.
But `GENMASK(nbits - 1 + offset, offset)` blows up the code (see below).
My guess is that this happens because the compiler fails to reuse the
value of `GENMASK(nbits - 1, 0)` used to clamp the value to write, and
calculates `GENMASK(nbits - 1 + offset, offset)` from scratch.
>
> > ~BITMAP_FIRST_WORD_MASK(start));
>
> As I said, ~BITMAP_FIRST_WORD_MASK() is the same as BITMAP_LAST_WORD_MASK()
> and vise-versa.
Surprisingly, ~BITMAP_FIRST_WORD_MASK() generates better code than
BITMAP_LAST_WORD_MASK().
>
> > map[index] |= value << offset;
> > if (fit)
> > return;
> >
> > map[index + 1] &= ~BITMAP_LAST_WORD_MASK(start + nbits);
OTOH I managed to shave three more bytes off by replacing
~BITMAP_LAST_WORD_MASK with a BITMAP_FIRST_WORD_MASK here.
> > map[index + 1] |= (value >> space);
> > }
I'll post the implementations together with the disassembly below.
I used some Clang 17.0.0 version that is a couple months behind
upstream, but that still produces sustainably shorter code (~48 bytes
less) than the trunk GCC on Godbolt.
1. Original implementation of bitmap_write() from this patch - 164
bytes (interestingly, it's 157 bytes with Clang 14.0.6)
==================================================================
void bitmap_write(unsigned long *map, unsigned long value,
unsigned long start, unsigned long nbits)
{
size_t index = BIT_WORD(start);
unsigned long offset = start % BITS_PER_LONG;
unsigned long space = BITS_PER_LONG - offset;
if (unlikely(!nbits))
return;
value &= GENMASK(nbits - 1, 0);
if (space >= nbits) {
map[index] &= ~(GENMASK(nbits - 1, 0) << offset);
map[index] |= value << offset;
return;
}
map[index] &= ~BITMAP_FIRST_WORD_MASK(start);
map[index] |= value << offset;
map[index + 1] &= ~BITMAP_LAST_WORD_MASK(start + nbits);
map[index + 1] |= (value >> space);
}
==================================================================
0000000000001200 <bitmap_write>:
1200: 41 57 push %r15
1202: 41 56 push %r14
1204: 53 push %rbx
1205: 48 85 c9 test %rcx,%rcx
1208: 74 7b je 1285 <bitmap_write+0x85>
120a: 48 89 c8 mov %rcx,%rax
120d: 49 89 d2 mov %rdx,%r10
1210: 49 c1 ea 06 shr $0x6,%r10
1214: 41 89 d1 mov %edx,%r9d
1217: f6 d9 neg %cl
1219: 49 c7 c7 ff ff ff ff mov $0xffffffffffffffff,%r15
1220: 49 d3 ef shr %cl,%r15
1223: 41 83 e1 3f and $0x3f,%r9d
1227: 41 b8 40 00 00 00 mov $0x40,%r8d
122d: 4c 21 fe and %r15,%rsi
1230: 48 89 f3 mov %rsi,%rbx
1233: 44 89 c9 mov %r9d,%ecx
1236: 48 d3 e3 shl %cl,%rbx
1239: 4d 29 c8 sub %r9,%r8
123c: 49 c7 c3 ff ff ff ff mov $0xffffffffffffffff,%r11
1243: 4e 8b 34 d7 mov (%rdi,%r10,8),%r14
1247: 49 39 c0 cmp %rax,%r8
124a: 73 3f jae 128b <bitmap_write+0x8b>
124c: 49 c7 c7 ff ff ff ff mov $0xffffffffffffffff,%r15
1253: 44 89 c9 mov %r9d,%ecx
1256: 49 d3 e7 shl %cl,%r15
1259: 49 f7 d7 not %r15
125c: 4d 21 fe and %r15,%r14
125f: 49 09 de or %rbx,%r14
1262: 4e 89 34 d7 mov %r14,(%rdi,%r10,8)
1266: 01 c2 add %eax,%edx
1268: f6 da neg %dl
126a: 89 d1 mov %edx,%ecx
126c: 49 d3 eb shr %cl,%r11
126f: 49 f7 d3 not %r11
1272: 44 89 c1 mov %r8d,%ecx
1275: 48 d3 ee shr %cl,%rsi
1278: 4e 23 5c d7 08 and 0x8(%rdi,%r10,8),%r11
127d: 4c 09 de or %r11,%rsi
1280: 4a 89 74 d7 08 mov %rsi,0x8(%rdi,%r10,8)
1285: 5b pop %rbx
1286: 41 5e pop %r14
1288: 41 5f pop %r15
128a: c3 ret
128b: 44 89 c9 mov %r9d,%ecx
128e: 49 d3 e7 shl %cl,%r15
1291: 49 f7 d7 not %r15
1294: 4d 21 fe and %r15,%r14
1297: 49 09 de or %rbx,%r14
129a: 4e 89 34 d7 mov %r14,(%rdi,%r10,8)
129e: 5b pop %rbx
129f: 41 5e pop %r14
12a1: 41 5f pop %r15
12a3: c3 ret
==================================================================
2. Your implementation, my_bitmap_write() - 152 bytes (used to be
slightly worse with Clang 14.0.6 - 159 bytes):
==================================================================
void my_bitmap_write(unsigned long *map, unsigned long value,
unsigned long start, unsigned long nbits)
{
unsigned long w, end;
if (unlikely(nbits == 0))
return;
value &= GENMASK(nbits - 1, 0);
map += BIT_WORD(start);
start %= BITS_PER_LONG;
end = start + nbits - 1;
w = *map & (end < BITS_PER_LONG ? ~GENMASK(end, start) :
BITMAP_LAST_WORD_MASK(start));
*map = w | (value << start);
if (end < BITS_PER_LONG)
return;
w = *++map & BITMAP_LAST_WORD_MASK(end + 1 - BITS_PER_LONG);
*map = w | (value >> (BITS_PER_LONG - start));
}
==================================================================
0000000000001160 <my_bitmap_write>:
1160: 48 85 c9 test %rcx,%rcx
1163: 0f 84 8e 00 00 00 je 11f7 <my_bitmap_write+0x97>
1169: 49 89 c9 mov %rcx,%r9
116c: f6 d9 neg %cl
116e: 48 d3 e6 shl %cl,%rsi
1171: 48 d3 ee shr %cl,%rsi
1174: 49 89 d2 mov %rdx,%r10
1177: 49 c1 ea 06 shr $0x6,%r10
117b: 89 d0 mov %edx,%eax
117d: 83 e0 3f and $0x3f,%eax
1180: 4e 8d 04 08 lea (%rax,%r9,1),%r8
1184: 4a 8d 0c 08 lea (%rax,%r9,1),%rcx
1188: 48 ff c9 dec %rcx
118b: 4e 8b 0c d7 mov (%rdi,%r10,8),%r9
118f: 48 83 f9 3f cmp $0x3f,%rcx
1193: 77 29 ja 11be <my_bitmap_write+0x5e>
1195: 41 f6 d8 neg %r8b
1198: 48 c7 c2 ff ff ff ff mov $0xffffffffffffffff,%rdx
119f: 44 89 c1 mov %r8d,%ecx
11a2: 48 d3 ea shr %cl,%rdx
11a5: 89 c1 mov %eax,%ecx
11a7: 48 d3 ea shr %cl,%rdx
11aa: 48 d3 e2 shl %cl,%rdx
11ad: 48 d3 e6 shl %cl,%rsi
11b0: 48 f7 d2 not %rdx
11b3: 49 21 d1 and %rdx,%r9
11b6: 4c 09 ce or %r9,%rsi
11b9: 4a 89 34 d7 mov %rsi,(%rdi,%r10,8)
11bd: c3 ret
11be: f6 da neg %dl
11c0: 89 d1 mov %edx,%ecx
11c2: 49 d3 e1 shl %cl,%r9
11c5: 49 d3 e9 shr %cl,%r9
11c8: 48 89 f2 mov %rsi,%rdx
11cb: 89 c1 mov %eax,%ecx
11cd: 48 d3 e2 shl %cl,%rdx
11d0: 4c 09 ca or %r9,%rdx
11d3: 4a 89 14 d7 mov %rdx,(%rdi,%r10,8)
11d7: 4a 8b 54 d7 08 mov 0x8(%rdi,%r10,8),%rdx
11dc: 41 f6 d8 neg %r8b
11df: 44 89 c1 mov %r8d,%ecx
11e2: 48 d3 e2 shl %cl,%rdx
11e5: 48 d3 ea shr %cl,%rdx
11e8: f6 d8 neg %al
11ea: 89 c1 mov %eax,%ecx
11ec: 48 d3 ee shr %cl,%rsi
11ef: 48 09 d6 or %rdx,%rsi
11f2: 4a 89 74 d7 08 mov %rsi,0x8(%rdi,%r10,8)
11f7: c3 ret
==================================================================
3. My improved version built on top of yours and mentioned above under
the name bitmap_write_new() - 116 bytes:
==================================================================
void bitmap_write_new(unsigned long *map, unsigned long value,
unsigned long start, unsigned long nbits)
{
unsigned long offset;
unsigned long space;
size_t index;
bool fit;
if (unlikely(!nbits))
return;
value &= GENMASK(nbits - 1, 0);
offset = start % BITS_PER_LONG;
space = BITS_PER_LONG - offset;
index = BIT_WORD(start);
fit = space >= nbits;
map[index] &= (fit ? (~(GENMASK(nbits - 1, 0) << offset)) :
~BITMAP_FIRST_WORD_MASK(start));
map[index] |= value << offset;
if (fit)
return;
map[index + 1] &= BITMAP_FIRST_WORD_MASK(start + nbits);
map[index + 1] |= (value >> space);
}
==================================================================
00000000000012b0 <bitmap_write_new>:
12b0: 48 85 c9 test %rcx,%rcx
12b3: 74 6e je 1323 <bitmap_write_new+0x73>
12b5: 48 89 c8 mov %rcx,%rax
12b8: f6 d9 neg %cl
12ba: 49 c7 c2 ff ff ff ff mov $0xffffffffffffffff,%r10
12c1: 49 d3 ea shr %cl,%r10
12c4: 49 c7 c3 ff ff ff ff mov $0xffffffffffffffff,%r11
12cb: 4c 21 d6 and %r10,%rsi
12ce: 89 d1 mov %edx,%ecx
12d0: 83 e1 3f and $0x3f,%ecx
12d3: 41 b8 40 00 00 00 mov $0x40,%r8d
12d9: 49 29 c8 sub %rcx,%r8
12dc: 49 89 d1 mov %rdx,%r9
12df: 49 c1 e9 06 shr $0x6,%r9
12e3: 49 39 c0 cmp %rax,%r8
12e6: 4d 0f 42 d3 cmovb %r11,%r10
12ea: 49 d3 e2 shl %cl,%r10
12ed: 49 f7 d2 not %r10
12f0: 4e 23 14 cf and (%rdi,%r9,8),%r10
12f4: 49 89 f3 mov %rsi,%r11
12f7: 49 d3 e3 shl %cl,%r11
12fa: 4d 09 d3 or %r10,%r11
12fd: 4e 89 1c cf mov %r11,(%rdi,%r9,8)
1301: 49 39 c0 cmp %rax,%r8
1304: 73 1d jae 1323 <bitmap_write_new+0x73>
1306: 01 d0 add %edx,%eax
1308: 4a 8b 54 cf 08 mov 0x8(%rdi,%r9,8),%rdx
130d: 89 c1 mov %eax,%ecx
130f: 48 d3 ea shr %cl,%rdx
1312: 48 d3 e2 shl %cl,%rdx
1315: 44 89 c1 mov %r8d,%ecx
1318: 48 d3 ee shr %cl,%rsi
131b: 48 09 d6 or %rdx,%rsi
131e: 4a 89 74 cf 08 mov %rsi,0x8(%rdi,%r9,8)
1323: c3 ret
==================================================================
4. The version of bitmap_write_new() that uses `(GENMASK(nbits - 1 +
offset, offset)` - 146 bytes:
==================================================================
void bitmap_write_new_shift(unsigned long *map, unsigned long value,
unsigned long start, unsigned long nbits)
{
unsigned long offset;
unsigned long space;
size_t index;
bool fit;
if (unlikely(!nbits))
return;
value &= GENMASK(nbits - 1, 0);
offset = start % BITS_PER_LONG;
space = BITS_PER_LONG - offset;
index = BIT_WORD(start);
fit = space >= nbits;
map[index] &= (fit ? ~(GENMASK(nbits - 1 + offset, offset)) :
~BITMAP_FIRST_WORD_MASK(start));
map[index] |= value << offset;
if (fit)
return;
map[index + 1] &= BITMAP_FIRST_WORD_MASK(start + nbits);
map[index + 1] |= (value >> space);
}
==================================================================
0000000000001330 <bitmap_write_new_shift>:
1330: 48 85 c9 test %rcx,%rcx
1333: 74 6a je 139f <bitmap_write_new_shift+0x6f>
1335: 48 89 c8 mov %rcx,%rax
1338: f6 d9 neg %cl
133a: 48 d3 e6 shl %cl,%rsi
133d: 48 d3 ee shr %cl,%rsi
1340: 41 89 d0 mov %edx,%r8d
1343: 41 83 e0 3f and $0x3f,%r8d
1347: 41 b9 40 00 00 00 mov $0x40,%r9d
134d: 4d 29 c1 sub %r8,%r9
1350: 49 89 d2 mov %rdx,%r10
1353: 49 c1 ea 06 shr $0x6,%r10
1357: 49 c7 c3 ff ff ff ff mov $0xffffffffffffffff,%r11
135e: 44 89 c1 mov %r8d,%ecx
1361: 49 d3 e3 shl %cl,%r11
1364: 49 39 c1 cmp %rax,%r9
1367: 73 37 jae 13a0 <bitmap_write_new_shift+0x70>
1369: 53 push %rbx
136a: 49 f7 d3 not %r11
136d: 4e 23 1c d7 and (%rdi,%r10,8),%r11
1371: 48 89 f3 mov %rsi,%rbx
1374: 44 89 c1 mov %r8d,%ecx
1377: 48 d3 e3 shl %cl,%rbx
137a: 4c 09 db or %r11,%rbx
137d: 4a 89 1c d7 mov %rbx,(%rdi,%r10,8)
1381: 01 d0 add %edx,%eax
1383: 4a 8b 54 d7 08 mov 0x8(%rdi,%r10,8),%rdx
1388: 89 c1 mov %eax,%ecx
138a: 48 d3 ea shr %cl,%rdx
138d: 48 d3 e2 shl %cl,%rdx
1390: 44 89 c9 mov %r9d,%ecx
1393: 48 d3 ee shr %cl,%rsi
1396: 48 09 d6 or %rdx,%rsi
1399: 4a 89 74 d7 08 mov %rsi,0x8(%rdi,%r10,8)
139e: 5b pop %rbx
139f: c3 ret
13a0: 44 01 c0 add %r8d,%eax
13a3: f6 d8 neg %al
13a5: 89 c1 mov %eax,%ecx
13a7: 49 d3 e3 shl %cl,%r11
13aa: 49 d3 eb shr %cl,%r11
13ad: 49 f7 d3 not %r11
13b0: 44 89 c1 mov %r8d,%ecx
13b3: 48 d3 e6 shl %cl,%rsi
13b6: 4e 23 1c d7 and (%rdi,%r10,8),%r11
13ba: 4c 09 de or %r11,%rsi
13bd: 4a 89 34 d7 mov %rsi,(%rdi,%r10,8)
13c1: c3 ret
==================================================================
For posterity, I am also attaching the C file containing the four
implementations together with some code testing them.
PS. I won't be actively working on the patch series till the end of
August, so feel free to ignore this letter until then.
View attachment "bitmap_write.c" of type "text/x-csrc" (4039 bytes)
Powered by blists - more mailing lists