Mandelbrot Generator
Posté le 15/06/2019 10:38
Hello allô
Default 1x zoom takes 7sec
Max zoom takes around 5-10min
It has a max zoom of 2^50: over one Quadrillion!
Going over 2^48 can be rather buggy
This is because numbers are limited to the 8 byte double variables
Attached file is both SH4 and SH3 compatible:
MANDEL.G1A
This does need the 'MonochromeLib' libs the code comes with it now
Controls
[-] Zoom out
[+] Zoom in
[F1] Hide/show HUD which contains Cords, Zoom level and Max Iterations. (Heads Up Display)
[F2] Changes colours of camera rectangle: Black, White & Inverted
[AC] Resets screen back to default state
[EXE] Draw set
[EXIT] Stop drawing the Mandelbrot (If it's taking too long)
[MENU] Return to the menu screen
[REPLAY] Move camera rectangle around (Arrow Keys: [LEFT], [RIGHT], [UP], [DOWN])
How can I optimize this code to run faster or zoom in further?
#include "fxlib.h"
#include "stdio.h"
#define TRUE 1
#define FALSE 0
#define ML_vram_adress (*(sc_cpv)sc0135)
typedef enum { ML_TRANSPARENT = -1, ML_WHITE, ML_BLACK, ML_XOR, ML_CHECKER } ML_Color;
typedef char* (*sc_cpv)(void);
const unsigned int sc0135[] = { 0xD201D002, 0x422B0009, 0x80010070, 0x0135 };
unsigned int key; //pause until key press
int kcode1, kcode2; //row & col keycode for Bkey_GetKeyWait()
char unused; //unused (cause CASIO dumb dumb)
unsigned short dispX, dispY; //cords on display when drawing mandelbrot
void ML_clear_vram() {
int i, end, * pointer_long, vram;
char* pointer_byte;
vram = (int)ML_vram_adress();
end = 4 - vram & 3;
pointer_byte = (char*)vram;
for (i = 0; i < end; i++) pointer_byte[i] = 0;
pointer_long = (int*)(vram + end);
for (i = 0; i < 255; i++) pointer_long[i] = 0;
pointer_byte += 1020 + end;
end = vram & 3;
for (i = 0; i < end; i++) pointer_byte[i] = 0;
}
void ML_display_vram() {
char* LCD_register_selector = (char*)0xB4000000, * LCD_data_register = (char*)0xB4010000, * vram;
int i, j;
vram = ML_vram_adress();
for (i = 0; i < 64; i++) {
*LCD_register_selector = 4;
*LCD_data_register = i | 192;
*LCD_register_selector = 4;
*LCD_data_register = 0;
*LCD_register_selector = 7;
for (j = 0; j < 16; j++)
*LCD_data_register = *vram++;
}
}
void ML_display_vram_row(int row) { //faster than ML_display_vram() which displays the entire screen instead of a single row
unsigned char i;
char* LCD_register_selector = (char*)0xB4000000, *LCD_data_register = (char*)0xB4010000, *vram;
vram = (row << 4) + ML_vram_adress();
*LCD_register_selector = 4;
*LCD_data_register = row | 192;
*LCD_register_selector = 4;
*LCD_data_register = 0;
*LCD_register_selector = 7;
for (i = 0; i < 16; i++)
* LCD_data_register = *vram++;
}
void ML_horizontal_line(int y, int x1, int x2, ML_Color color) {
int i;
char checker;
char* vram = ML_vram_adress();
if (y & ~63 || (x1 < 0 && x2 < 0) || (x1 > 127 && x2 > 127))
return;
if (x1 > x2) {
i = x1;
x1 = x2;
x2 = i;
}
if (x1 < 0)
x1 = 0;
if (x2 > 127)
x2 = 127;
switch (color) {
case ML_BLACK:
if (x1 >> 3 != x2 >> 3) {
vram[(y << 4) + (x1 >> 3)] |= 255 >> (x1 & 7);
vram[(y << 4) + (x2 >> 3)] |= 255 << 7 - (x2 & 7);
for (i = (x1 >> 3) + 1; i < x2 >> 3; i++)
vram[(y << 4) + i] = 255;
} else
vram[(y << 4) + (x1 >> 3)] |= (255 >> (x1 % 8 + 7 - x2 % 8)) << (7 - (x2 & 7));
break;
case ML_WHITE:
if (x1 >> 3 != x2 >> 3) {
vram[(y << 4) + (x1 >> 3)] &= 255 << 8 - (x1 & 7);
vram[(y << 4) + (x2 >> 3)] &= 255 >> 1 + (x2 & 7);
for (i = (x1 >> 3) + 1; i < x2 >> 3; i++)
vram[(y << 4) + i] = 0;
} else
vram[(y << 4) + (x1 >> 3)] &= (255 << 8 - (x1 & 7)) | (255 >> 1 + (x2 & 7));
break;
case ML_XOR:
if (x1 >> 3 != x2 >> 3) {
vram[(y << 4) + (x1 >> 3)] ^= 255 >> (x1 & 7);
vram[(y << 4) + (x2 >> 3)] ^= 255 << 7 - (x2 & 7);
for (i = (x1 >> 3) + 1; i < (x2 >> 3); i++)
vram[(y << 4) + i] ^= 255;
} else
vram[(y << 4) + (x1 >> 3)] ^= (255 >> ((x1 & 7) + 7 - (x2 & 7))) << (7 - (x2 & 7));
break;
case ML_CHECKER:
checker = (y & 1 ? 85 : 170);
if (x1 >> 3 != x2 >> 3) {
vram[(y << 4) + (x1 >> 3)] &= 255 << 8 - (x1 & 7);
vram[(y << 4) + (x2 >> 3)] &= 255 >> 1 + (x2 & 7);
vram[(y << 4) + (x1 >> 3)] |= checker & 255 >> (x1 & 7);
vram[(y << 4) + (x2 >> 3)] |= checker & 255 << 7 - (x2 & 7);
for (i = (x1 >> 3) + 1; i < x2 >> 3; i++)
vram[(y << 4) + i] = checker;
} else {
vram[(y << 4) + (x1 >> 3)] &= (255 << 8 - (x1 & 7)) | (255 >> 1 + (x2 & 7));
vram[(y << 4) + (x1 >> 3)] |= checker & (255 >> (x1 % 8 + 7 - x2 % 8)) << (7 - (x2 & 7));
}
break;
}
}
void ML_vertical_line(int x, int y1, int y2, ML_Color color) {
int i, j;
char checker, byte, * vram = ML_vram_adress();
if (x & ~127 || (y1 < 0 && y2 < 0) || (y1 > 63 && y2 > 63)) return;
if (y1 > y2) {
int tmp = y1;
y1 = y2;
y2 = tmp;
}
if (y1 < 0) y1 = 0;
if (y2 > 63) y2 = 63;
i = (y1 << 4) + (x >> 3);
j = (y2 << 4) + (x >> 3);
switch (color) {
case ML_BLACK:
byte = 128 >> (x & 7);
for (; i <= j; i += 16)
vram[i] |= byte;
break;
case ML_WHITE:
byte = ~(128 >> (x & 7));
for (; i <= j; i += 16)
vram[i] &= byte;
break;
case ML_XOR:
byte = 128 >> (x & 7);
for (; i <= j; i += 16)
vram[i] ^= byte;
break;
case ML_CHECKER:
byte = 128 >> (x & 7);
checker = y1 & 1 ^ x & 1;
for (; i <= j; i += 16) {
if (checker) vram[i] &= ~byte;
else vram[i] |= byte;
checker = !checker;
}
break;
}
}
void ML_pixel(int x, int y, ML_Color color) {
char* vram = ML_vram_adress();
if (x & ~127 || y & ~63) return;
switch (color) {
case ML_BLACK:
vram[(y << 4) + (x >> 3)] |= 128 >> (x & 7);
break;
case ML_WHITE:
vram[(y << 4) + (x >> 3)] &= ~(128 >> (x & 7));
break;
case ML_XOR:
vram[(y << 4) + (x >> 3)] ^= 128 >> (x & 7);
break;
case ML_CHECKER:
if (y & 1 ^ x & 1) vram[(y << 4) + (x >> 3)] &= ~(128 >> (x & 7));
else vram[(y << 4) + (x >> 3)] |= 128 >> (x & 7);
break;
}
}
double divByPow(double n, double x, int p) { //Divide OR Times n by x, p times (n / x^p): used for numbers bigger than 2^32 (int limit)
if (p < 0)
for (; p < 0; p++)
n *= x;
else
for (; p > 0; p--)
n /= x;
return n;
}
void stop(void) { //stops drawing set if user presses [EXIT] or [MENU]
if (Bkey_GetKeyWait(&kcode1, &kcode2, 1, 0, 1, &unused))
if (kcode1 == 4 && (kcode2 == 8 || kcode2 == 9)) {
dispX = 128; //Very hacky stop function
dispY = 64;
}
}
int AddIn_main(int isAppli, unsigned short OptionNum) { //Main function
unsigned int graphZoom = 1; //zoom level for graph
char screenZoom; //zoom level on screen (rectangle)
int screenX1, screenX2; //corner X cords for drawing rectangle to screen
int screenY1, screenY2; //corner Y cords for drawing rectangle to screen
unsigned char string[1]; //Used in converting int/double to char
char HUD = TRUE; //Heads Up Display: Cords, Zoom level & Max iteration: toggle with [F1]
char colour = ML_XOR; //Colour of rectangle: Black, White or Inverted
int screenX, screenY; //offset cords on screen from 0,0 for rectangle
double graphX = 0, graphY = 0; //cords on graph - where to center mandelbrot
double graphMove; //amount graphX & Y changes by when moving rectangle around
int screenMove; //amount screenX & Y changes by when moving rectangle around with arrow keys
short tempPixel = 0; //Write pixels to temp variable then write the entire 2bytes to VRAM all at once
register double zr, zi; //zr is real, zi imaginary
register double zr2, zi2; //zr2 = zr^2, zi2 = zi^2
register double x1 = -2.0; //bounding box cords on graph
register double x2 = 2.0; //bounding box cords on graph
register double y1 = -1.0; //bounding box cords on graph
register double y2 = 1.0; //bounding box cords on graph
register double x, y; //pixel cords on graph tested if in set
register double xIsz, yIsz; //amount x/y increases by when ploting graph
register unsigned short iMax = 32; //max iterations
register unsigned short i; //iterations
while (TRUE) {
register char* vram = ML_vram_adress();
SetTimer(1, 200, stop);
ML_clear_vram();
ML_display_vram();
xIsz = (x2 - x1) / 128;
yIsz = (y2 - y1) / 64;
y = y1;
for (dispY = 0; dispY < 64; dispY++) {
x = x1;
y += yIsz;
for (dispX = 0; dispX < 128; dispX++) {
zr = x;
zi = y;
for (i = 0; i < iMax; i++) {
zr2 = zr * zr;
zi2 = zi * zi;
if (zr2 + zi2 > 4)
break;
zi = zr * zi;
zi += zi + y;
zr = zr2 - zi2 + x;
}
tempPixel = (tempPixel << 1) | (i == iMax);
if ((dispX & 7) == 7)
*vram++ = tempPixel;
x += xIsz;
}
ML_display_vram_row(dispY);
}
SaveDisp(1);
KillTimer(1);
screenX = 0;
screenY = 0;
screenZoom = 1;
Bkey_GetKeyWait(&kcode1, &kcode2, 2, 1, 1, &unused);
do {
GetKey(&key);
screenMove = screenZoom > 4 ? 1 : divByPow(16, 2, screenZoom);
graphMove = screenZoom > 4 ? divByPow(1, 2, graphZoom - (double)screenZoom) : divByPow(16, 2, graphZoom);
switch (key) {
case KEY_CHAR_PLUS:
if (graphZoom < 51) {
graphZoom++;
screenZoom++;
}
break;
case KEY_CHAR_MINUS:
if (graphZoom) {
graphZoom--;
screenZoom--;
}
break;
case KEY_CTRL_UP:
screenY -= screenMove;
graphY -= graphMove;
break;
case KEY_CTRL_DOWN:
screenY += screenMove;
graphY += graphMove;
break;
case KEY_CTRL_LEFT:
screenX -= screenMove;
graphX -= graphMove;
break;
case KEY_CTRL_RIGHT:
screenX += screenMove;
graphX += graphMove;
break;
case KEY_CTRL_F1:
HUD = !HUD;
break;
case KEY_CTRL_F2:
if (colour)
colour--;
else
colour = ML_XOR;
break;
case KEY_CTRL_F3:
//Gray scale, by refreshing screen multiple times per sec at different max iterations (iMax)
break;
case KEY_CTRL_AC:
graphZoom = 1;
graphX = 0;
graphY = 0;
screenZoom = 1;
screenX = 0;
screenY = 0;
key = KEY_CTRL_EXE;
break;
}
RestoreDisp(1);
iMax = 8 * (graphZoom + 3);
if (screenZoom < 8) {
screenX1 = 65 - divByPow(128, 2, screenZoom) + screenX;
screenX2 = 62 + divByPow(128, 2, screenZoom) + screenX;
screenY1 = 32 - (screenZoom > 6 ? 1 : divByPow(64, 2, screenZoom)) + screenY;
screenY2 = 31 + (screenZoom > 6 ? 0 : divByPow(64, 2, screenZoom)) + screenY;
ML_horizontal_line(screenY1, screenX1, screenX2, colour);
ML_horizontal_line(screenY2, screenX1, screenX2, colour);
ML_vertical_line(screenX1 - 1, screenY1, screenY2, colour);
ML_vertical_line(screenX2 + 1, screenY1, screenY2, colour);
} else
ML_pixel(screenX + 64, screenY + 31, colour);
x1 = divByPow(-4, 2, graphZoom) + (0.03125 * graphX);
x2 = divByPow(4, 2, graphZoom) + (0.03125 * graphX);
y1 = divByPow(-2, 2, graphZoom) + (0.03125 * graphY);
y2 = divByPow(2, 2, graphZoom) + (0.03125 * graphY);
if (HUD == TRUE) {
sprintf(&string, "X1:%f", x1);
PrintMini(0, 0, string, 0);
sprintf(&string, "Y1:%f", y1);
PrintMini(0, 6, string, 0);
sprintf(&string, "X2:%f", x2);
PrintMini(81, 53, string, 0);
sprintf(&string, "Y2:%f", y2);
PrintMini(81, 59, string, 0);
sprintf(&string, "MaxI:%u", iMax);
PrintMini(0, 53, string, 0);
if (graphZoom > 32)
sprintf(&string, "Zoom:2^%ux", graphZoom - 1);
else
sprintf(&string, "Zoom:%ux", (int)divByPow(1, 2, -graphZoom + 1));
PrintMini(0, 59, string, 0);
}
ML_display_vram();
} while (key != KEY_CTRL_EXE);
}
return 0;
}
#pragma section _BR_Size
unsigned long BR_Size;
#pragma section
#pragma section _TOP
int InitializeSystem(int isAppli, unsigned short OptionNum) {
return INIT_ADDIN_APPLICATION(isAppli, OptionNum);
}
#pragma section
Fichier joint
Citer : Posté le 08/11/2019 23:09 | #
Of course, this is the normal pass-by-value mechanism.
You can either return a value in r0:
add #50, r4
rts
mov r4, r0
Or you can take a pointer as argument:
mov.l @r4, r0
add #50, r0
rts
mov.l r0, @r4
In both of these exemples the mov that follows the rts is executed before the function returns due do the delay slot mechanism.
Citer : Posté le 08/11/2019 23:20 | #
I tried both ways and it doesn't seem to do anything
mandel.lst
Mandel.c 61 int testVar = 27;
0000010C E31B MOV #27,R3
Mandel.c 67 while (1) {
00000118 L341:
Mandel.c 70 foo(&testVar);
0000011E 64F3 MOV R15,R4
00000120 D336 MOV.L L351+6,R3 ; _foo
00000122 1F03 MOV.L R0,@(12,R15)
00000124 430B JSR @R3
00000126 7408 ADD #8,R4
//some other code is here, printing testVar to screen
000001F2 AF91 BRA L341
mandelsrc.src
.ALIGN 4
_foo:
add #5, r4
rts
mov r4, r0
.ALIGN 4
.END
Ajouté le 08/11/2019 à 23:27 :
oops was modifying the wrong file
Im using Microsoft visual studio to edit the file (has syntax highlighting etc)
and when I renamed the .src file, I forgot to update it in visual studio
working now
Citer : Posté le 09/11/2019 08:38 | #
Glad to see you've got it sorted out.
Citer : Posté le 09/11/2019 10:08 | #
Does the SDK list the code in the .src file in the .lst file?
because I dont see any of my code in it
theres no mul codes in the .lst file, while my .src file has heaps of them
I wanna compare the amount of codes the SDK has vs how many I have, but it doesn't have the final converted codes
mandel.lst
0000015C D22A MOV.L L347+24,R2 ; _innerLoop
0000015E 67F3 MOV R15,R7
00000160 770C ADD #12,R7
00000162 66F3 MOV R15,R6
00000164 7608 ADD #8,R6
00000166 65F3 MOV R15,R5
00000168 64F3 MOV R15,R4
0000016A 420B JSR @R2
0000016C 7404 ADD #4,R4
mandelsrc.src
mov.l #-13, r3
mov.l @r4, r0
shad r3, r0
mov.l r0, @r4
mov.l @r5, r1
shad r3, r1
mov.l r1, @r5
muls.l r0, r0
sts macl, r0
mov.l r0, @r6
muls.l r1, r1
sts macl, r1
mov.l r1, @r7
rts
nop
Citer : Posté le 09/11/2019 10:21 | #
This is because you function cannot be inlined. As you can see, the first line of the listing loads the address of innerLoop into r2 and the second-to-last line jumps to r2 as a subroutine call with jsr. So you assembler function is being called as a normal function.
It is not easy for a compiler to inline assembler functions and it is very probably beyond the capabilities of the SDK's compiler. If you want the code that calls the inner loop to be in the same function as the code that executes it, you need to write both in assembler. Eventually you should reach a level where paying a function call has little to no performance impact and you can stop moving things to assembler code.
Citer : Posté le 09/11/2019 10:29 | #
does @(8,R15) mean postion 8 in register 15? (AKA the stack)
(MOV.L R2,@(8,R15))
is 'mov' the only code that can handle #13, #H'13 and @r4 values vs just r0 - r14
If you want the code that calls the inner loop to be in the same function as the code that executes it, you need to write both in assembler. Eventually you should reach a level where paying a function call has little to no performance impact and you can stop moving things to assembler code
So I write the entire critical loop in assembly?
and the single function call to start it (all 3 for loops) shouldn't cause any performace impact
Citer : Posté le 09/11/2019 10:57 | #
It means the data in memory at address (r15+8). Note that 8 is a number of bytes, so if you accidentally try fi. mov.l @(2,r15),r0, this will fail because r15+2 is 2-aligned.
Not many instructions support directly accessing pointer memory (@ mode). Many instruction support immediate operands (# mode), but often you have to use them with r0. r0 is a bit special because it is used as a "special-purpose register" when instructions do not support all registers.
For instance, you can write or #2, r0 but not or #2, r1.
and the single function call to start it (all 3 for loops) shouldn't cause any performace impact
Yes, you can proceed this way.
Citer : Posté le 10/11/2019 01:39 | #
How to safely read and write to r15 without overwriting anything?
and how to access the 5th 6th etc variables from r15?
Whats @-R15?
how freely can I use r5-r7 and r8-r14?
is using a short instead of a int slower?
because I noticed this bit of code only shows when its a short
it was working fine for ages
then all of a sudden Im getting this error
even if the mandel.asm file is empty
"D:\Documents\CASIO\fx-9860G SDK\_fx-9860G SDK\OS\SH\bin\Optlnk.exe" -subcommand=C:\Users\RedCMD\AppData\Local\Temp\hmk35AE.tmp
** L3300 (F) Cannot open file : "D:\Documents\CASIO\fx-9860G SDK\Mandel\Debug\Mandelassmbly.obj"
Citer : Posté le 10/11/2019 10:48 | #
Since r15 is the stack pointer, its value should never really change except by small amounts or making space for local variables. It should always be 4-aligned because interrupt handlers will assume this.
The first longword at the top of the stack is @r15, the second one is @(4,r15), the third is @(8,r15) and so on.
mov.l r0,@-r15 decrements r15 by 4 bytes then writes r0 at the resulting address. This is, in essence, a stack push. Be careful as this changes the offsets (the variable at @(4,r15) is now at @(8,r15), which is easy to forget).
You can freely use r0 to r7 provided that you don't need their values (ie. if you trash the arguments then don't complain). For r8 to r14, you need to save them and restore them at the end of the function, typically like this:
mov.l r8, @-r15
mov.l r9, @-r15
# Do things...
mov r4, r8
mov #0, r9
# Restore and leave
mov.l @r15+, r9
rts
mov.l @r15+, r8
Using shorts is not slower by essence, but when you load a short from memory only the low 16 bits of the target register are updated. For instance:
mov #4, r1
mov.w r1, @r5
mov.w @r5, r0 # r0 = 0xffff0004
The extu.w instructions means ExTend Unsigned from Word and fills the high 16 bits with zeros. This is necessary unless you can prove that the high 16 bits are already zero. This is a trivial instruction that executes fast.
On SH4, when you are using an array, it is better to use an array of shorts and use this instruction because the array will be much smaller so you will have less cache misses. Generally memory is the bottleneck, I'd say you need 4/5 CPU cycles to equate a memory cycle, although don't take my word on that.
Maybe try to Rebuild All (Project » Rebuild All) or delete the Debug folder and rebuild.
Citer : Posté le 10/11/2019 10:56 | #
Thanks for all your help
even tho I havn't done anything in return
still getting the errorIf I rename to mandel.src it works, but then theres no syntax highlighting in visual studio
and now its working again :|
when I reinstall the SDK, it still remembers what my recent projects were
is there some cache it hides somewhere on my computer?
Citer : Posté le 10/11/2019 11:01 | #
Don't mention it, that's what this place is for
If I rename to mandel.src it works, but then theres no syntax highlighting in visual studio
Looks like the file names in your project are a bit messed up. According to the log the SDK seems to expect Mandelassmbly.src.
Regarding the extension, you don't have much of a choice because AFAIK if your file is not named .src the SDK will not try to compile it as assembler. I would have suggested to use a symbolic link if such a thing existed...
Recent project is definitely in a sort of cache, but I don't know where. It might be in the Windows registry. If not, then check the SDK files (typically C:\Program Files\CASIO\fx-9860G SDK) just in case something happens to look like it.
Citer : Posté le 10/11/2019 11:06 | #
well its working now
the symbolic link works fine
after unistalling the SDK (a few days ago, not right now)
I went and search my entire disk for all casio related files (just by doing a CTRL+F "casio") and also looking at the most common places where it could be
So it must be in registry then
Citer : Posté le 10/11/2019 11:08 | #
Well, as long as it works, I guess it's fine...
For the code you can use the backtick ` (U+0060) or the [inlinecode] tag (useful if you are trying to write literal backticks).
Citer : Posté le 11/11/2019 04:31 | #
SDK doesn't update the mandelasm.src file when I edit it from visual basic. - maybe cause Im using a symlink? (mandelsrc.asm)
reload and rebuild all doesn't work, have to restart the SDK
it restarts in less than a sec, so isn't too bad
even removing and re-adding the file doesn't work
EDIT
rebuild all gets the correct updated file, while the SDK still thinks the file hasn't been changed - so I don't have to restart it
Is there way to map build to rebuild all?
I got the entire critical loop code converted into assembly
its also about 14% faster too
Gonna work on making it 64bits now
mandelasm.src
.ALIGN 4
_mainLoop: ;mainLoop(xStart, yStart, Isz, *vram, iMax) //row, col, x, y, zr2, zi2
; r4, r5, r6, r7, r8 // r9, r10,11,12, 13, 14
;temp, zr, zi, const, tempPixel
; r0, r1, r2, r3, r5
mov.l r8, @-r15
mov.l r9, @-r15
mov.l r10, @-r15
mov.l r11, @-r15
mov.l r12, @-r15
mov.l r13, @-r15
mov.l r14, @-r15
mov.l #64, r9 ;row = 64
mov.l r5, r12 ;y = yStart
mov.l #0, r5 ;tempPixel = 0 //r5 not yStart anymore
_row: ;for (row = 64; row > 0; row--) {
mov.l #128, r10 ;col = 128
mov.l r4, r11 ;x = xStart
add r6, r12 ;y += Isz
_col: ;for (col = 128; col > 0; col--) {
mov.l r11, r1 ;zr = x
mov.l r12, r2 ;zi = y
mov.l @(28,r15), r8 ;i = iMax
_innerLoop: ;for (i = iMax; i > 0; i--) {
mov.l #-13, r3
shad r3, r1 ;zr >>= 13
shad r3, r2 ;zi >>= 13
mul.l r1, r1 ;zr * zr
sts macl, r13 ;zr2 = zr * zr
mul.l r2, r2 ;zi * zi
sts macl, r14 ;zi2 = zi * zi
mov.l #268435456, r3 ;0x10000000
mov.l r13, r0
add r14, r0
cmp/gt r3, r0 ;if (zr2 + zi2 > 0x10000000)
bt _exitInnerLoop ;break
nop
mul.l r1, r2
sts macl, r2 ;zi *= zr
shll r2
add r12, r2 ;zi += zi + y
sub r14, r13 ;zr2 - zi2
add r11, r13 ; + x
mov.l r13, r1 ;zr = zr2 - zi2 + x
dt r8 ;if(i = 0)
bf _innerLoop ;then break; else jmp _innerLoop
nop
_exitInnerLoop:
shll r5 ;tempPixel << 1
tst r8, r8 ;i == 0
movt r0
or r0, r5 ;tempPixel = (tempPixel << 1) | (i == 0)
mov.l r10, r0 ;col
and #7, r0 ;col & 7
cmp/eq #1, r0 ;if((col & 7) == 1)
bf _bypassVRAM
nop
mov.b r5, @r7 ;*vram = tempPixel
add #1, r7 ;vram++
_bypassVRAM:
add r6, r11 ;x += Isz
dt r10 ;if(col = 0)
bf _col ;then break; else jmp _col
nop
dt r9 ;if(row = 0)
bf _row ;then break; else jmp _row
nop
;_end:
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r12
mov.l @r15+, r11
mov.l @r15+, r10
mov.l @r15+, r9
mov.l @r15+, r8
rts
nop
.ALIGN 4
.END
Is it safe to have 0x88004991 in mainLoop(xStart, yStart, Isz, 0x88004991, iMax)? (mandel.c)
the number comes from *vram, tho I couldn't figure out how to place it in there
got super confused with double pointers, arrays and memory swapping
mandel.c
typedef char* (*sc_cpv)(void);
const unsigned int sc0135[] = { 0xD201D002, 0x422B0009, 0x80010070, 0x0135 };
char* vram = ML_vram_adress();
What sort of opitmizations could I do?
I can move mov.l @r15+, r8 down to replace the nop, tho I couldn't see any others I could do that to
I think its possible to drop a few of the registers
Citer : Posté le 11/11/2019 09:53 | #
SDK doesn't update the mandelasm.src file when I edit it from visual basic. - maybe cause Im using a symlink? (mandelsrc.asm)
As I understand, you are editing a .asm file with an external program. What you must do is update the .src file in some way. Either you can edit it directly. Or you can copy it every time you save it. Or maybe there is a sort of symbolinc link mechansim in Windows that can help you have two different names for the same file.
My advice is to just have one .src file and instruct your text editor to highlight it as assembler. If your editor is sufficiently powerful this should be easy to do. My reasoning is that it is easier to bend a powerful text editor than to bend the limited SDK.
I don't think so, but back when I used the SDK I used to run Rebuild All by keyboard shortcut, something like Alt+P, U IIRC. This might be an acceptable compromise for you.
its also about 14% faster too
Good job!
This contributes to showing that the SDK's compiler does not perform a lot of optimizations.
Here are a few comments I hope can help you:
bf _row ;then break; else jmp _row
nop
The bf and bt insructions are not delay slots. The delay slots equivalents are bf.s and bt.s. This means the nop is currently useless. If possible, you will gain 1 cycle by using bf.s and inserting a useful instruction in the delay slot. The same applies here:
nop
Except that if the condition is not taken, you will lose one cycle in the inner loop executing a nop, which is sad.
mov.l r4, r11 ;x = xStart
I don't know about the SDK, but this syntax is not correct in GCC. The .l suffix is only used to indicate the size of memory accesses. Here you don't have any memory access, and you don't have the choice of the size. The first instruction sign-extends the constant from 8 bits to 32 bits and assigns the entirety of r10. The second instructions copies the entirety of r4 to r11. Both are normally written mov instead of mov.l.
sts macl, r13 ;zr2 = zr * zr
mul.l r2, r2 ;zi * zi
sts macl, r14 ;zi2 = zi * zi
The multiplier needs 3 to 5 cycles to compute its result. If you access the macl or mach register just after multiplying, you will wait 5 cycles. If you can find two instructions to place in-between, you will wait only 3 cycles. In other words there is a pipeline bubble here only waiting to be filled.
Again, I don't know what the SDK allows, but you cannot directly to this in the instruction set. The SuperH has 16-bit instructions, so it should be obvious that a 32-bit operand cannot fit in. How this works normally is, the constant is placed in memory below the program and the special addressing mode @(disp,pc) is used. This instructs the processor to read the operand a certain distance away from the instruction. In assembly code, a label is normally used. The assembler will turn the label into the appropriate @(disp,pc) command. You should be aware that an additional memory access occurs on this line.
tst r8, r8 ;i == 0
movt r0
or r0, r5 ;tempPixel = (tempPixel << 1) | (i == 0)
There is a cool optimization here.
Instead of shll you can use the rotcl instruction which rotates left. The cool thing about this instruction is, it rotates with T: the highest bit goes to T and T becomes the lowest bit. So you can compute T with the tst instruction then shift and insert it in a single line with:
rotcl r5
You mastered the assembler principles really fast. I am impressed
the number comes from *vram, tho I couldn't figure out how to place it in there
got super confused with double pointers, arrays and memory swapping
You should not hardcode the VRAM address. Please use the syscall, either as you did in C:
typedef char* (*sc_cpv)(void);
const unsigned int sc0135[] = { 0xD201D002, 0x422B0009, 0x80010070, 0x0135 };
char* vram = ML_vram_adress();
Or properly in assembler (the syntax might need to be changed a bit for the SDK):
mov.l syscall, r1
mov.l vram_addr, r0
jmp @r1
nop
syscall:
.long #h'80010070
vram_addr:
.long #h'00000135
I think its possible to drop a few of the registers
You can do this, but remember that the critical part is the inner loop. Pushing and popping a few registers just once is nothing. I think a decent approach to optimizing the inner loop is reordering instructions to use the pipeline more. You have delays between multiplications and accesses to macl, unused delay slots in jumps, and when the code is really good and won't change anymore (after you finish with 64-bit that is), you can start putting memory accessed on 2-aligned addresses to improve the pipeline flow by reducing memory contention.
This is becoming really interesting. Probably the most optimized Mandelbrot viewer out there. Keep it up
Citer : Posté le 11/11/2019 20:01 | #
mul.l r1, r1
sts macl, r0
sts macl, r1
Citer : Posté le 11/11/2019 20:11 | #
Correct. and I'm using a symlink that points to mandelsrc.asm that is called mandelasm.src which is used by the SDK
As long as it works I don't see a problem. I didn't know that symlinks existed on Windows.
Well looking at the inner loop there is certainly stuff that can be moved:
add r11, r13 ; + x
mov.l r13, r1 ;zr = zr2 - zi2 + x
dt r8 ;if(i = 0)
bf _innerLoop ;then break; else jmp _innerLoop
nop
Why not just this?
add r11, r13 ; + x
dt r8 ;if(i = 0)
bf.s _innerLoop ;then break; else jmp _innerLoop
mov.l r13, r1 ;zr = zr2 - zi2 + x
Oh no you can't. You should see from the code that r0 and r1 will be equal. The first product result will be lost.
However you do have inpedendent stuff going on:
shad r3, r1 ;zr >>= 13
shad r3, r2 ;zi >>= 13
mul.l r1, r1 ;zr * zr
sts macl, r13 ;zr2 = zr * zr
Since the value of r2 is not needed for the first product, you can write for instance:
shad r3, r1 ;zr >>= 13
mul.l r1, r1 ;zr * zr
shad r3, r2 ;zi >>= 13
sts macl, r13 ;zr2 = zr * zr
Yeah this is pretty elaborate already. You should know that you can execute at most one memory access per CPU cycle (and if it goes to RAM it takes even longer). Memory accesses are already needed to fetch the instructions. However, instructions are 16-bit and the memory bus is 32-bit wide so the processor can read instructions by pairs. This happens when the program counter is 4-aligned.
This means that exactly half of the cycles already have memory accesses before your code can even do anything. If the memory access of one of your mov.l (or similar) instructions falls on the same pipeline slot as an automatic instruction fetch, then contention occurs and one of the accesses will be delayed. Basically you lose a cycle or more.
The trick to avoid this is to arrange your instructions so that their memory access stages (MA) do not happen during instruction fetches (IF) but between them. Since MA occurs 3 cycles later than IF, the instruction that accesses memory must be placed on a 4-aligned boundary (I said 2-aligned earlier but it was wrong, sorry.)
You should only do that when the code is final because any instruction insertion will mess up your work on contention. But that's something you should know
Citer : Posté le 11/11/2019 20:17 | #
Ajouté le 12/11/2019 à 08:46 :
Well I got an unoptimized N*M 64bit multiplier working
Even supports signed numbers!
Will have to work on a squarer afterwards N^2
The decimal point is 6bits right from the left 6:58 Int:Frac
so 0x04000000 = 1
Let me try to optimize it first please? :P
mandel.c
// nH, nL, mH, mL, Low, Mid, High, BelowLow
mul64(-0x04000000, -0x04000000, -0x04000000, -0x04000000, &out1, &out2, &out3, &out4);
mandelasm.src
; r4, r5, r6, r7, r8, r9, 10, 11
; 12 * 34 = 10*30 + 2*30 + 10*4 + 2*4 = 300 + 60 + 40 + 8 = 408
;nHnL * mHmL = nH*mH + nL*mH + nH*mL + nL*mL =
;Decimal point is 6bits in from the left 6:58 Int:Frac
mov.l r8, @-r15
mov.l r9, @-r15
mov.l r10, @-r15
mov.l r11, @-r15
mov #0, r1
mov #0, r2
mov #0, r3
mov #0, r8
dmuls.l r5, r7 ;nL * mL
sts macl, r0
add r0, r8
sts mach, r0
add r0, r1
clrt
dmuls.l r5, r6 ;nL * mH
sts macl, r0
addc r0, r1
sts mach, r0
addc r0, r2
movt r0
add r0, r3
clrt
dmuls.l r4, r7 ;nH * mL
sts macl, r0
addc r0, r1
sts mach, r0
addc r0, r2
movt r0
add r0, r3
clrt
dmuls.l r4, r6 ;nH * mH
sts macl, r0
addc r0, r2
sts mach, r0
addc r0, r3
;XXXXXXXX YYYYYYYY ZZZZZZZZ WWWWWWWW
; XXXXXX XXYYYYYY YYZZZZZZ ZZWWWWWW WW
;Not to scale. Example is shifted 8bits, while actual is 6bits
mov.l #6, r0
mov.l #-26, r9 ;-(32 - 6)
mov.l r3, r6
shad r0, r3 ;XXXXXX
shad r9, r6 ; XX
mov.l r2, r5
shad r0, r2 ;YYYYYY
shad r9, r5 ; YY
mov.l r1, r4
shad r0, r1 ;ZZZZZZ
shad r9, r4 ; ZZ
mov.l r8, r7
shad r0, r8 ;WWWWWW
shad r9, r7 ; WW
;add #0, r3
add r6, r2
add r5, r1
add r4, r8
;add #0, r7
mov.l @(16,r15), r4
mov.l @(20,r15), r5
mov.l @(24,r15), r6
mov.l @(28,r15), r7
mov.l r3, @r6 ;High
mov.l r2, @r5 ;Mid
mov.l r1, @r4 ;Low
mov.l r8, @r7 ;BelowLow
mov.l @r15+, r11
mov.l @r15+, r10
mov.l @r15+, r9
mov.l @r15+, r8
rts
nop ;Nothing is optimized (yet)
mandelasm.src
mov.l r8, @-r15
mov.l r9, @-r15
mov.l r10, @-r15
clrt
addc r5, r7 ;nL + mL
addc r4, r6 ;nH + mH + Carry
mov.l @(12,r15), r0 ;out1
mov.l @(16,r15), r1 ;out2
mov.l @(20,r15), r2 ;out3
mov.l r7, @r0 ;Low
mov.l r6, @r1 ;High
movt r5
mov.l r5, @r2 ;Carry/Overflow
mov.l @r15+, r10
mov.l @r15+, r9
mov.l @r15+, r8
rts
nop
Citer : Posté le 12/11/2019 09:10 | #
Alright, here's a tip first. Do you know Karatsuba's formula? It's a multiplication trick that lets you compute the product with 3 multiplications instead of 4. I think this would be the first, algorithmic optimization here
Also, are you sure you need the bottom 32 bits (out4)? In a 6:58 × 6:58 product, you get a 12:116 number then you drop 6 bits at the top and 58 bits at the bottom. So out4 is off-limits of the final result.
Hmm... I don't think this is true. A 64-bit integer might be negative but still have a zero at bit 31. If you want to represent -1 in you format this would be -0x04000000, -1, right?
As usual, you can spare time by doing stuff while the multiplier works. Also you don't have to set r1 to 0 then add r0; just copy.
mov #0, r2
mov #0, r3
mov #0, r8
dmuls.l r5, r7 ;nL * mL
sts macl, r0
add r0, r8
sts mach, r0
add r0, r1
This can easily become
mov #0, r2
mov #0, r3
sts macl, r8
sts mach, r1
The same tricks apply everywhere else. Once you have the result of a multiplication, don't addc just yet: immediately start the next multiplication then addc in the meantime!
Things might not be clear in my head right now but I'm pretty sure you should shift differently, namely 6 bits to the left, then return the top two longwords.
The sum looks good to me. The take-home message is that you can add 64-bit integers with just 2/3 instructions. I think the multiplication can be made shorter, although you do have overhead for the shift
Citer : Posté le 13/11/2019 05:56 | #
But it does need a 33bit multiplier
or extra logic for the last bit
The first dmuls.l r5, r7 wont even get used when I drop the Low and BelowLow (out1 and out4)
all outputs get discarded
I only had it there for debugging
Ajouté le 13/11/2019 à 05:59 :
Wait...
Apart from the carry
there could be a carry from the lowest mul that could come up
eh. doesn't matter, the number is soo small anyways
The speed up would be much better
Citer : Posté le 13/11/2019 06:00 | #
That's correct. You need the lowest mul, you just don't need the macl of that mul