本帖最后由 andeyqi 于 2018-8-31 11:16 编辑
查看STM32F4标准库RTC代码时(STM32F4xx_DSP_StdPeriph_Lib_V1.8.0),发现RTC_ByteToBcd2函数写得不够简洁,代码片段如下。- 02723 /**
- 02724 * @brief Converts a 2 digit decimal to BCD format.
- 02725 * @param Value: Byte to be converted.
- 02726 * @retval Converted byte
- 02727 */
- 02728 static uint8_t RTC_ByteToBcd2(uint8_t Value)
- 02729 {
- 02730 uint8_t bcdhigh = 0;
- 02731
- 02732 while (Value >= 10)
- 02733 {
- 02734 bcdhigh++;
- 02735 Value -= 10;
- 02736 }
- 02737
- 02738 return ((uint8_t)(bcdhigh << 4) | Value);
- 02739 }
复制代码 代码很容易理解,第一反应是感觉代码写得不够简洁,效果和如下的代码是相同的。- static uint8_t RTC_ByteToBcd2(uint8_t Value)
- {
- return (((Value/10) << 4) + (Value %10));
- }
复制代码
后来就在想,为什么库函数的代码写得如此不够简洁,一定有其自身的道理。仔细比较了上下两个函数的区别,发现库函数规避了除法的使用。记得之前看过的编码规范上有要求:能用移位操作来实现的运算就不要使用除法来实现,以提高代码的运行效率。下面反汇编如上两个函数,用来验证上述的想法。
- static uint8_t RTC_ByteToBcd2(uint8_t Value)
- {
- 8000108: b580 push {r7, lr}
- 800010a: b084 sub sp, #16
- 800010c: af00 add r7, sp, #0
- 800010e: 0002 movs r2, r0
- 8000110: 1dfb adds r3, r7, #7
- 8000112: 701a strb r2, [r3, #0]
- uint8_t bcdhigh = 0;
- 8000114: 230f movs r3, #15
- 8000116: 18fb adds r3, r7, r3
- 8000118: 2200 movs r2, #0
- 800011a: 701a strb r2, [r3, #0]
- while (Value >= 10)
- 800011c: e00b b.n 8000136 <RTC_ByteToBcd2_1+0x2e>
- {
- bcdhigh++;
- 800011e: 230f movs r3, #15
- 8000120: 18fb adds r3, r7, r3
- 8000122: 781a ldrb r2, [r3, #0]
- 8000124: 230f movs r3, #15
- 8000126: 18fb adds r3, r7, r3
- 8000128: 3201 adds r2, #1
- 800012a: 701a strb r2, [r3, #0]
- Value -= 10;
- 800012c: 1dfb adds r3, r7, #7
- 800012e: 1dfa adds r2, r7, #7
- 8000130: 7812 ldrb r2, [r2, #0]
- 8000132: 3a0a subs r2, #10
- 8000134: 701a strb r2, [r3, #0]
- while (Value >= 10)
- 8000136: 1dfb adds r3, r7, #7
- 8000138: 781b ldrb r3, [r3, #0]
- 800013a: 2b09 cmp r3, #9
- 800013c: d8ef bhi.n 800011e <RTC_ByteToBcd2_1+0x16>
- }
- return ((uint8_t)(bcdhigh << 4) | Value);
- 800013e: 230f movs r3, #15
- 8000140: 18fb adds r3, r7, r3
- 8000142: 781b ldrb r3, [r3, #0]
- 8000144: 011b lsls r3, r3, #4
- 8000146: b2da uxtb r2, r3
- 8000148: 1dfb adds r3, r7, #7
- 800014a: 781b ldrb r3, [r3, #0]
- 800014c: 4313 orrs r3, r2
- 800014e: b2db uxtb r3, r3
- }
- 8000150: 0018 movs r0, r3
- 8000152: 46bd mov sp, r7
- 8000154: b004 add sp, #16
- 8000156: bd80 pop {r7, pc}
复制代码
- static uint8_t RTC_ByteToBcd2_2(uint8_t Value)
- {
- 8000158: b580 push {r7, lr}
- 800015a: b082 sub sp, #8
- 800015c: af00 add r7, sp, #0
- 800015e: 0002 movs r2, r0
- 8000160: 1dfb adds r3, r7, #7
- 8000162: 701a strb r2, [r3, #0]
- return ((Value/10) << 4 | (Value & 0X0F));
- 8000164: 1dfb adds r3, r7, #7
- 8000166: 781b ldrb r3, [r3, #0]
- 8000168: 210a movs r1, #10
- 800016a: 0018 movs r0, r3
- 800016c: f000 f89a bl 80002a4 <__udivsi3>
- 8000170: 0003 movs r3, r0
- 8000172: b2db uxtb r3, r3
- 8000174: 011b lsls r3, r3, #4
- 8000176: b25a sxtb r2, r3
- 8000178: 1dfb adds r3, r7, #7
- 800017a: 781b ldrb r3, [r3, #0]
- 800017c: b25b sxtb r3, r3
- 800017e: 210f movs r1, #15
- 8000180: 400b ands r3, r1
- 8000182: b25b sxtb r3, r3
- 8000184: 4313 orrs r3, r2
- 8000186: b25b sxtb r3, r3
- 8000188: b2db uxtb r3, r3
- }
- 800018a: 0018 movs r0, r3
- 800018c: 46bd mov sp, r7
- 800018e: b002 add sp, #8
- 8000190: bd80 pop {r7, pc}
复制代码
乍一看,好像方法2的代码量是要少于库函数里的实现的(注意:此处反汇编的方法2低位用的是 Value & 0X0F,与前文给出的 Value % 10 并不等价,例如 Value 为 10 时前者得到 10,后者得到 0),但发现方法2有如下代码
bl 80002a4 <__udivsi3>
该函数是除法实现的相关代码,实现如下
- 080002a4 <__udivsi3>:
- 80002a4: 2200 movs r2, #0
- 80002a6: 0843 lsrs r3, r0, #1
- 80002a8: 428b cmp r3, r1
- 80002aa: d374 bcc.n 8000396 <__udivsi3+0xf2>
- 80002ac: 0903 lsrs r3, r0, #4
- 80002ae: 428b cmp r3, r1
- 80002b0: d35f bcc.n 8000372 <__udivsi3+0xce>
- 80002b2: 0a03 lsrs r3, r0, #8
- 80002b4: 428b cmp r3, r1
- 80002b6: d344 bcc.n 8000342 <__udivsi3+0x9e>
- 80002b8: 0b03 lsrs r3, r0, #12
- 80002ba: 428b cmp r3, r1
- 80002bc: d328 bcc.n 8000310 <__udivsi3+0x6c>
- 80002be: 0c03 lsrs r3, r0, #16
- 80002c0: 428b cmp r3, r1
- 80002c2: d30d bcc.n 80002e0 <__udivsi3+0x3c>
- 80002c4: 22ff movs r2, #255 ; 0xff
- 80002c6: 0209 lsls r1, r1, #8
- 80002c8: ba12 rev r2, r2
- 80002ca: 0c03 lsrs r3, r0, #16
- 80002cc: 428b cmp r3, r1
- 80002ce: d302 bcc.n 80002d6 <__udivsi3+0x32>
- 80002d0: 1212 asrs r2, r2, #8
- 80002d2: 0209 lsls r1, r1, #8
- 80002d4: d065 beq.n 80003a2 <__udivsi3+0xfe>
- 80002d6: 0b03 lsrs r3, r0, #12
- 80002d8: 428b cmp r3, r1
- 80002da: d319 bcc.n 8000310 <__udivsi3+0x6c>
- 80002dc: e000 b.n 80002e0 <__udivsi3+0x3c>
- 80002de: 0a09 lsrs r1, r1, #8
- 80002e0: 0bc3 lsrs r3, r0, #15
- 80002e2: 428b cmp r3, r1
- 80002e4: d301 bcc.n 80002ea <__udivsi3+0x46>
- 80002e6: 03cb lsls r3, r1, #15
- 80002e8: 1ac0 subs r0, r0, r3
- 80002ea: 4152 adcs r2, r2
- 80002ec: 0b83 lsrs r3, r0, #14
- 80002ee: 428b cmp r3, r1
- 80002f0: d301 bcc.n 80002f6 <__udivsi3+0x52>
- 80002f2: 038b lsls r3, r1, #14
- 80002f4: 1ac0 subs r0, r0, r3
- 80002f6: 4152 adcs r2, r2
- 80002f8: 0b43 lsrs r3, r0, #13
- 80002fa: 428b cmp r3, r1
- 80002fc: d301 bcc.n 8000302 <__udivsi3+0x5e>
- 80002fe: 034b lsls r3, r1, #13
- 8000300: 1ac0 subs r0, r0, r3
- 8000302: 4152 adcs r2, r2
- 8000304: 0b03 lsrs r3, r0, #12
- 8000306: 428b cmp r3, r1
- 8000308: d301 bcc.n 800030e <__udivsi3+0x6a>
- 800030a: 030b lsls r3, r1, #12
- 800030c: 1ac0 subs r0, r0, r3
- 800030e: 4152 adcs r2, r2
- 8000310: 0ac3 lsrs r3, r0, #11
- 8000312: 428b cmp r3, r1
- 8000314: d301 bcc.n 800031a <__udivsi3+0x76>
- 8000316: 02cb lsls r3, r1, #11
- 8000318: 1ac0 subs r0, r0, r3
- 800031a: 4152 adcs r2, r2
- 800031c: 0a83 lsrs r3, r0, #10
- 800031e: 428b cmp r3, r1
- 8000320: d301 bcc.n 8000326 <__udivsi3+0x82>
- 8000322: 028b lsls r3, r1, #10
- 8000324: 1ac0 subs r0, r0, r3
- 8000326: 4152 adcs r2, r2
- 8000328: 0a43 lsrs r3, r0, #9
- 800032a: 428b cmp r3, r1
- 800032c: d301 bcc.n 8000332 <__udivsi3+0x8e>
- 800032e: 024b lsls r3, r1, #9
- 8000330: 1ac0 subs r0, r0, r3
- 8000332: 4152 adcs r2, r2
- 8000334: 0a03 lsrs r3, r0, #8
- 8000336: 428b cmp r3, r1
- 8000338: d301 bcc.n 800033e <__udivsi3+0x9a>
- 800033a: 020b lsls r3, r1, #8
- 800033c: 1ac0 subs r0, r0, r3
- 800033e: 4152 adcs r2, r2
- 8000340: d2cd bcs.n 80002de <__udivsi3+0x3a>
- 8000342: 09c3 lsrs r3, r0, #7
- 8000344: 428b cmp r3, r1
- 8000346: d301 bcc.n 800034c <__udivsi3+0xa8>
- 8000348: 01cb lsls r3, r1, #7
- 800034a: 1ac0 subs r0, r0, r3
- 800034c: 4152 adcs r2, r2
- 800034e: 0983 lsrs r3, r0, #6
- 8000350: 428b cmp r3, r1
- 8000352: d301 bcc.n 8000358 <__udivsi3+0xb4>
- 8000354: 018b lsls r3, r1, #6
- 8000356: 1ac0 subs r0, r0, r3
- 8000358: 4152 adcs r2, r2
- 800035a: 0943 lsrs r3, r0, #5
- 800035c: 428b cmp r3, r1
- 800035e: d301 bcc.n 8000364 <__udivsi3+0xc0>
- 8000360: 014b lsls r3, r1, #5
- 8000362: 1ac0 subs r0, r0, r3
- 8000364: 4152 adcs r2, r2
- 8000366: 0903 lsrs r3, r0, #4
- 8000368: 428b cmp r3, r1
- 800036a: d301 bcc.n 8000370 <__udivsi3+0xcc>
- 800036c: 010b lsls r3, r1, #4
- 800036e: 1ac0 subs r0, r0, r3
- 8000370: 4152 adcs r2, r2
- 8000372: 08c3 lsrs r3, r0, #3
- 8000374: 428b cmp r3, r1
- 8000376: d301 bcc.n 800037c <__udivsi3+0xd8>
- 8000378: 00cb lsls r3, r1, #3
- 800037a: 1ac0 subs r0, r0, r3
- 800037c: 4152 adcs r2, r2
- 800037e: 0883 lsrs r3, r0, #2
- 8000380: 428b cmp r3, r1
- 8000382: d301 bcc.n 8000388 <__udivsi3+0xe4>
- 8000384: 008b lsls r3, r1, #2
- 8000386: 1ac0 subs r0, r0, r3
- 8000388: 4152 adcs r2, r2
- 800038a: 0843 lsrs r3, r0, #1
- 800038c: 428b cmp r3, r1
- 800038e: d301 bcc.n 8000394 <__udivsi3+0xf0>
- 8000390: 004b lsls r3, r1, #1
- 8000392: 1ac0 subs r0, r0, r3
- 8000394: 4152 adcs r2, r2
- 8000396: 1a41 subs r1, r0, r1
- 8000398: d200 bcs.n 800039c <__udivsi3+0xf8>
- 800039a: 4601 mov r1, r0
- 800039c: 4152 adcs r2, r2
- 800039e: 4610 mov r0, r2
- 80003a0: 4770 bx lr
- 80003a2: e7ff b.n 80003a4 <__udivsi3+0x100>
- 80003a4: b501 push {r0, lr}
- 80003a6: 2000 movs r0, #0
- 80003a8: f000 f806 bl 80003b8 <__aeabi_idiv0>
- 80003ac: bd02 pop {r1, pc}
- 80003ae: 46c0 nop ; (mov r8, r8)
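复制代码 这么一比较的话,库函数的代码效率明显要比自己实现的简洁代码高。(需要注意,这一结论的前提是编译目标没有硬件除法指令,除法需要调用 __udivsi3 软件实现,如上面的反汇编所示;在支持硬件除法指令 UDIV 的内核(如 STM32F4 所用的 Cortex-M4)上,除法不会调用该函数,结论需要重新验证。)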
总结:
库函数的代码实现虽然不够简洁,但充分考虑了代码的运行效率。
|
感谢 打赏支持
的确,有好的硬件,还得有高效的程序。
所谓DSP库,就是把代码优化到效率最高。
如果不用库,自己胡写的话,就算是TI的DSP处理器的高性能都没得发挥。
大佬说的很有道理。