Theoretically you are right, and that is how it should behave: the moment your program writes more than 5 bytes into the buffer, the behaviour is undefined. In practice, however, the compiler usually pads the stack frame so that it stays aligned to a certain boundary for performance and ABI reasons, and the required alignment varies from architecture to architecture. Because of that padding, you do not see a problem for every input that is longer than 5 bytes.
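The disassembly of your program is displayed below. Check out the sub $0x20,%rsp
instruction, which allocates 32 bytes (0x20) of memory on the stack for this function. The buffer passed to strcpy starts at -0x10(%rbp) and the stack canary is stored at -0x8(%rbp), so 8 bytes lie between the start of the buffer and the canary.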
(gdb) disassemble main
Dump of assembler code for function main(int, char**):
0x00000000004008b0 <+0>: push %rbp
0x00000000004008b1 <+1>: mov %rsp,%rbp
=> 0x00000000004008b4 <+4>: sub $0x20,%rsp
0x00000000004008b8 <+8>: mov %edi,-0x14(%rbp)
0x00000000004008bb <+11>: mov %rsi,-0x20(%rbp)
0x00000000004008bf <+15>: mov %fs:0x28,%rax
0x00000000004008c8 <+24>: mov %rax,-0x8(%rbp)
0x00000000004008cc <+28>: xor %eax,%eax
0x00000000004008ce <+30>: addq $0x8,-0x20(%rbp)
0x00000000004008d3 <+35>: mov -0x20(%rbp),%rax
0x00000000004008d7 <+39>: mov (%rax),%rdx
0x00000000004008da <+42>: lea -0x10(%rbp),%rax
0x00000000004008de <+46>: mov %rdx,%rsi
0x00000000004008e1 <+49>: mov %rax,%rdi
0x00000000004008e4 <+52>: callq 0x400770 <strcpy@plt>
0x00000000004008e9 <+57>: lea -0x10(%rbp),%rax
0x00000000004008ed <+61>: mov %rax,%rdi
0x00000000004008f0 <+64>: callq 0x400710 <puts@plt>
0x00000000004008f5 <+69>: mov $0x0,%eax
0x00000000004008fa <+74>: mov -0x8(%rbp),%rcx
0x00000000004008fe <+78>: xor %fs:0x28,%rcx
0x0000000000400907 <+87>: je 0x400918 <main(int, char**)+104>
0x0000000000400909 <+89>: jmp 0x400913 <main(int, char**)+99>
0x000000000040090b <+91>: mov %rax,%rdi
0x000000000040090e <+94>: callq 0x400790 <_Unwind_Resume@plt>
0x0000000000400913 <+99>: callq 0x400760 <__stack_chk_fail@plt>
0x0000000000400918 <+104>: leaveq
0x0000000000400919 <+105>: retq
-fmudflap
or Valgrind. – Scyphus