gcc 4.3.2 vectorizes access to volatile array
Till Straumann
strauman at slac.stanford.edu
Mon Jun 22 20:03:07 UTC 2009
That's roughly the same that 4.3.3 produces.
I had not quoted the full assembly code but just
the essential part that is executed when
source and destination are 4-byte aligned
and are more than 4 bytes apart.
Otherwise (not longword-aligned) the
(correct) code labeled '.L5' is executed.
-- Till
Andrew Haley wrote:
> H.J. Lu wrote:
>
>> On Mon, Jun 22, 2009 at 11:14 AM, Till
>> Straumann<strauman at slac.stanford.edu> wrote:
>>
>>> Andrew Haley wrote:
>>>
>>>> Till Straumann wrote:
>>>>
>>>>
>>>>> gcc-4.3.2 seems to produce bad code when
>>>>> accessing an array of small 'volatile'
>>>>> objects -- it may try to access multiple
>>>>> such objects in a 'parallel' fashion.
>>>>> E.g., instead of reading two consecutive
>>>>> 'volatile short's sequentially it reads
>>>>> a single 32-bit longword. This may crash
>>>>> e.g., when accessing a memory-mapped device
>>>>> which allows only 16-bit accesses.
>>>>>
>>>>> If I compile this code fragment
>>>>>
>>>>> void volarrcpy(short *d, volatile short *s, int n)
>>>>> {
>>>>> int i;
>>>>> for (i=0; i<n; i++)
>>>>> d[i] = s[i];
>>>>> }
>>>>>
>>>>>
>>>>> with '-O3' (the critical option seems to be '-ftree-vectorize')
>>>>> then gcc-4.3.2 produces quite complicated code
>>>>> but the essential section is (powerpc)
>>>>>
>>>>> .L7:
>>>>> lhz 0,0(11)
>>>>> addi 11,11,2
>>>>> lwzx 0,4,9
>>>>> stwx 0,3,9
>>>>> addi 9,9,4
>>>>> bdnz .L7
>>>>>
>>>>> or i386
>>>>>
>>>>> .L7:
>>>>> movw (%ecx), %ax
>>>>> movl (%esi,%edx,4), %eax
>>>>> movl %eax, (%ebx,%edx,4)
>>>>> incl %edx
>>>>> addl $2, %ecx
>>>>> cmpl %edx, -20(%ebp)
>>>>> ja .L7
>>>>>
>>>>>
>>>>> Disassembled back into C-code, this reads
>>>>>
>>>>> uint32_t *dst_l = (uint32_t*)d;
>>>>> uint32_t *src_l = (uint32_t*)s;
>>>>>
>>>>> for (i=0; i<n/2; i++) {
>>>>> d[i] = s[i];
>>>>> dst_l[i] = src_l[i];
>>>>> }
>>>>>
>>>>> This code seems neither optimal nor correct.
>>>>> Besides reading half of the locations twice
>>>>> -- which violates the semantics of volatile
>>>>> objects -- accessing such objects in a 'vectorized'
>>>>> way (in this case: instead of reading
>>>>> two adjacent short addresses gcc emits
>>>>> a single 32-bit read) seems illegal to me.
>>>>>
>>>>> Similar behavior seems to be present in 4.3.3.
>>>>>
>>>>> Does anybody have some insight? Should I file
>>>>> a bug report?
>>>>>
>>>>>
>>>> I can't reproduce this with "GCC: (GNU) 4.3.3 20081110 (prerelease)"
>>>>
>>>> .L8:
>>>> movzwl (%ecx), %eax
>>>> addl $1, %ebx
>>>> addl $2, %ecx
>>>> movw %ax, (%edx)
>>>> addl $2, %edx
>>>> cmpl %ebx, 16(%ebp)
>>>> jg .L8
>>>>
>>>> I think you should upgrade.
>>>>
>>>> Andrew.
>>>>
>>>>
>>> OK, try this then:
>>>
>>> void
>>> c(char *d, volatile char *s)
>>> {
>>> int i;
>>> for ( i=0; i<32; i++ )
>>> d[i]=s[i];
>>> }
>>>
>>>
>>> (gcc --version: gcc (Ubuntu 4.3.3-5ubuntu4) 4.3.3)
>>>
>> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>
>> That may be too old. Gcc 4.3.4 revision 148680
>> generates:
>>
>> .L5:
>> leaq (%rsi,%rdx), %rax
>> movzbl (%rax), %eax
>> movb %al, (%rdi,%rdx)
>> addq $1, %rdx
>> cmpq $32, %rdx
>> jne .L5
>>
>
> 4.4.0 20090307 generates truly bizarre code, though:
>
> gcc -m32 -c -S -O3 x.c
>
> c:
> pushl %ebp
> movl %esp, %ebp
> pushl %ebx
> movl 12(%ebp), %edx
> movl 8(%ebp), %ebx
> movl %edx, %ecx
> orl %ebx, %ecx
> andl $3, %ecx
> leal 4(%ebx), %eax
> je .L10
> .L2:
> xorl %eax, %eax
> .p2align 4,,7
> .p2align 3
> .L5:
> leal (%edx,%eax), %ecx
> movzbl (%ecx), %ecx
> movb %cl, (%ebx,%eax)
> addl $1, %eax
> cmpl $32, %eax
> jne .L5
> popl %ebx
> popl %ebp
> ret
> .p2align 4,,7
> .p2align 3
> .L10:
> leal 4(%edx), %ecx
> cmpl %ecx, %ebx
> jbe .L11
> .L7:
> movzbl (%edx), %ecx
> movl (%edx), %ecx
> movl %ecx, (%ebx)
> movzbl 1(%edx), %ecx
> movl 4(%edx), %ecx
> movl %ecx, 4(%ebx)
> movzbl 2(%edx), %ecx
> movl 8(%edx), %ecx
> movl %ecx, 4(%eax)
> movzbl 3(%edx), %ecx
> movl 12(%edx), %ecx
> movl %ecx, 8(%eax)
> movzbl 4(%edx), %ecx
> movl 16(%edx), %ecx
> movl %ecx, 12(%eax)
> movzbl 5(%edx), %ecx
> movl 20(%edx), %ecx
> movl %ecx, 16(%eax)
> movzbl 6(%edx), %ebx
> leal 24(%edx), %ecx
> movl 24(%edx), %ebx
> movl %ebx, 20(%eax)
> movzbl 7(%edx), %edx
> movl 4(%ecx), %edx
> movl %edx, 24(%eax)
> popl %ebx
> popl %ebp
> ret
> .p2align 4,,7
> .p2align 3
> .L11:
> cmpl %edx, %eax
> jae .L2
> jmp .L7
>
More information about the users
mailing list