Les membres ayant 30 points peuvent parler sur les canaux annonces, projets et hs du chat.
La shoutbox n'est pas chargée par défaut pour des raisons de performances. Cliquez pour charger.

Forum Casio - Projets de programmation


Index du Forum » Projets de programmation » Mandelbrot Generator
Redcmd Hors ligne Membre Points: 380 Défis: 7 Message

Mandelbrot Generator

Posté le 15/06/2019 10:38

Hello allô

Default 1x zoom takes 7sec
Max zoom takes around 5-10min
It has a max zoom of 2^50: over one Quadrillion!
Going over 2^48 can be rather buggy
This is because numbers are limited to the 8 byte double variables

Attached file is both SH4 and SH3 compatible: MANDEL.G1A

This does need the 'MonochromeLib' libs the code comes with it now

Controls
[-] Zoom out
[+] Zoom in
[F1] Hide/show HUD which contains Cords, Zoom level and Max Iterations. (Heads Up Display)
[F2] Changes colours of camera rectangle: Black, White & Inverted
[AC] Resets screen back to default state
[EXE] Draw set
[EXIT] Stop drawing the Mandelbrot (If it's taking too long)
[MENU] Return to the menu screen
[REPLAY] Move camera rectangle around (Arrow Keys: [LEFT], [RIGHT], [UP], [DOWN])

How can I optimize this code to run faster or zoom in further?

#include "fxlib.h"
#include "stdio.h"

#define TRUE 1
#define FALSE 0

#define ML_vram_adress (*(sc_cpv)sc0135)

typedef enum { ML_TRANSPARENT = -1, ML_WHITE, ML_BLACK, ML_XOR, ML_CHECKER } ML_Color;
typedef char* (*sc_cpv)(void);
const unsigned int sc0135[] = { 0xD201D002, 0x422B0009, 0x80010070, 0x0135 };

unsigned int key;                //pause until key press
int kcode1, kcode2;                //row & col keycode for Bkey_GetKeyWait()
char unused;                    //unused (cause CASIO dumb dumb)
unsigned short dispX, dispY;    //cords on display when drawing mandelbrot

void ML_clear_vram() {
    int i, end, * pointer_long, vram;
    char* pointer_byte;
    vram = (int)ML_vram_adress();
    end = 4 - vram & 3;
    pointer_byte = (char*)vram;
    for (i = 0; i < end; i++) pointer_byte[i] = 0;
    pointer_long = (int*)(vram + end);
    for (i = 0; i < 255; i++) pointer_long[i] = 0;
    pointer_byte += 1020 + end;
    end = vram & 3;
    for (i = 0; i < end; i++) pointer_byte[i] = 0;
}

void ML_display_vram() {
    char* LCD_register_selector = (char*)0xB4000000, * LCD_data_register = (char*)0xB4010000, * vram;
    int i, j;
    vram = ML_vram_adress();
    for (i = 0; i < 64; i++) {
        *LCD_register_selector = 4;
        *LCD_data_register = i | 192;
        *LCD_register_selector = 4;
        *LCD_data_register = 0;
        *LCD_register_selector = 7;
        for (j = 0; j < 16; j++)
            *LCD_data_register = *vram++;
    }
}

void ML_display_vram_row(int row) {            //faster than ML_display_vram() which displays the entire screen instead of a single row
    unsigned char i;
    char* LCD_register_selector = (char*)0xB4000000, *LCD_data_register = (char*)0xB4010000, *vram;
    vram = (row << 4) + ML_vram_adress();
    *LCD_register_selector = 4;
    *LCD_data_register = row | 192;
    *LCD_register_selector = 4;
    *LCD_data_register = 0;
    *LCD_register_selector = 7;
    for (i = 0; i < 16; i++)
        * LCD_data_register = *vram++;
}

void ML_horizontal_line(int y, int x1, int x2, ML_Color color) {
    int i;
    char checker;
    char* vram = ML_vram_adress();
    if (y & ~63 || (x1 < 0 && x2 < 0) || (x1 > 127 && x2 > 127))
        return;
    if (x1 > x2) {
        i = x1;
        x1 = x2;
        x2 = i;
    }
    if (x1 < 0)
        x1 = 0;
    if (x2 > 127)
        x2 = 127;
    switch (color) {
        case ML_BLACK:
            if (x1 >> 3 != x2 >> 3) {
                vram[(y << 4) + (x1 >> 3)] |= 255 >> (x1 & 7);
                vram[(y << 4) + (x2 >> 3)] |= 255 << 7 - (x2 & 7);
                for (i = (x1 >> 3) + 1; i < x2 >> 3; i++)
                    vram[(y << 4) + i] = 255;
            } else
                vram[(y << 4) + (x1 >> 3)] |= (255 >> (x1 % 8 + 7 - x2 % 8)) << (7 - (x2 & 7));
            break;
        case ML_WHITE:
            if (x1 >> 3 != x2 >> 3) {
                vram[(y << 4) + (x1 >> 3)] &= 255 << 8 - (x1 & 7);
                vram[(y << 4) + (x2 >> 3)] &= 255 >> 1 + (x2 & 7);
                for (i = (x1 >> 3) + 1; i < x2 >> 3; i++)
                    vram[(y << 4) + i] = 0;
            } else
                vram[(y << 4) + (x1 >> 3)] &= (255 << 8 - (x1 & 7)) | (255 >> 1 + (x2 & 7));
            break;
        case ML_XOR:
            if (x1 >> 3 != x2 >> 3) {
                vram[(y << 4) + (x1 >> 3)] ^= 255 >> (x1 & 7);
                vram[(y << 4) + (x2 >> 3)] ^= 255 << 7 - (x2 & 7);
                for (i = (x1 >> 3) + 1; i < (x2 >> 3); i++)
                    vram[(y << 4) + i] ^= 255;
            } else
                vram[(y << 4) + (x1 >> 3)] ^= (255 >> ((x1 & 7) + 7 - (x2 & 7))) << (7 - (x2 & 7));
            break;
        case ML_CHECKER:
            checker = (y & 1 ? 85 : 170);
            if (x1 >> 3 != x2 >> 3) {
                vram[(y << 4) + (x1 >> 3)] &= 255 << 8 - (x1 & 7);
                vram[(y << 4) + (x2 >> 3)] &= 255 >> 1 + (x2 & 7);
                vram[(y << 4) + (x1 >> 3)] |= checker & 255 >> (x1 & 7);
                vram[(y << 4) + (x2 >> 3)] |= checker & 255 << 7 - (x2 & 7);
                for (i = (x1 >> 3) + 1; i < x2 >> 3; i++)
                    vram[(y << 4) + i] = checker;
            } else {
                vram[(y << 4) + (x1 >> 3)] &= (255 << 8 - (x1 & 7)) | (255 >> 1 + (x2 & 7));
                vram[(y << 4) + (x1 >> 3)] |= checker & (255 >> (x1 % 8 + 7 - x2 % 8)) << (7 - (x2 & 7));
            }
            break;
    }
}

void ML_vertical_line(int x, int y1, int y2, ML_Color color) {
    int i, j;
    char checker, byte, * vram = ML_vram_adress();
    if (x & ~127 || (y1 < 0 && y2 < 0) || (y1 > 63 && y2 > 63)) return;
    if (y1 > y2) {
        int tmp = y1;
        y1 = y2;
        y2 = tmp;
    }
    if (y1 < 0) y1 = 0;
    if (y2 > 63) y2 = 63;

    i = (y1 << 4) + (x >> 3);
    j = (y2 << 4) + (x >> 3);
    switch (color) {
        case ML_BLACK:
            byte = 128 >> (x & 7);
            for (; i <= j; i += 16)
                vram[i] |= byte;
            break;
        case ML_WHITE:
            byte = ~(128 >> (x & 7));
            for (; i <= j; i += 16)
                vram[i] &= byte;
            break;
        case ML_XOR:
            byte = 128 >> (x & 7);
            for (; i <= j; i += 16)
                vram[i] ^= byte;
            break;
        case ML_CHECKER:
            byte = 128 >> (x & 7);
            checker = y1 & 1 ^ x & 1;
            for (; i <= j; i += 16) {
                if (checker) vram[i] &= ~byte;
                else vram[i] |= byte;
                checker = !checker;
            }
            break;
    }
}

void ML_pixel(int x, int y, ML_Color color) {
    char* vram = ML_vram_adress();
    if (x & ~127 || y & ~63) return;
    switch (color) {
        case ML_BLACK:
            vram[(y << 4) + (x >> 3)] |= 128 >> (x & 7);
            break;
        case ML_WHITE:
            vram[(y << 4) + (x >> 3)] &= ~(128 >> (x & 7));
            break;
        case ML_XOR:
            vram[(y << 4) + (x >> 3)] ^= 128 >> (x & 7);
            break;
        case ML_CHECKER:
            if (y & 1 ^ x & 1) vram[(y << 4) + (x >> 3)] &= ~(128 >> (x & 7));
            else vram[(y << 4) + (x >> 3)] |= 128 >> (x & 7);
            break;
    }
}

double divByPow(double n, double x, int p) {        //Divide OR Times n by x, p times (n / x^p): used for numbers bigger than 2^32 (int limit)
    if (p < 0)
        for (; p < 0; p++)
            n *= x;
    else
        for (; p > 0; p--)
            n /= x;
    return n;
}

void stop(void) {            //stops drawing set if user presses [EXIT] or [MENU]
    if (Bkey_GetKeyWait(&kcode1, &kcode2, 1, 0, 1, &unused))
        if (kcode1 == 4 && (kcode2 == 8 || kcode2 == 9)) {
            dispX = 128;    //Very hacky stop function
            dispY = 64;
        }
}

int AddIn_main(int isAppli, unsigned short OptionNum) {        //Main function
    unsigned int graphZoom = 1;                //zoom level for graph
    char screenZoom;                        //zoom level on screen (rectangle)
    int screenX1, screenX2;                    //corner X cords for drawing rectangle to screen
    int screenY1, screenY2;                    //corner Y cords for drawing rectangle to screen
    unsigned char string[1];                //Used in converting int/double to char
    char HUD = TRUE;                        //Heads Up Display: Cords, Zoom level & Max iteration: toggle with [F1]
    char colour = ML_XOR;                    //Colour of rectangle: Black, White or Inverted
    int screenX, screenY;                    //offset cords on screen from 0,0 for rectangle
    double graphX = 0, graphY = 0;            //cords on graph - where to center mandelbrot
    double graphMove;                        //amount graphX & Y changes by when moving rectangle around
    int screenMove;                            //amount screenX & Y changes by when moving rectangle around with arrow keys
    short tempPixel = 0;                    //Write pixels to temp variable then write the entire 2bytes to VRAM all at once

    register double zr, zi;                    //zr is real, zi imaginary
    register double zr2, zi2;                //zr2 = zr^2, zi2 = zi^2
    register double x1 = -2.0;                //bounding box cords on graph
    register double x2 = 2.0;                //bounding box cords on graph
    register double y1 = -1.0;                //bounding box cords on graph
    register double y2 = 1.0;                //bounding box cords on graph
    register double x, y;                    //pixel cords on graph tested if in set
    register double xIsz, yIsz;                //amount x/y increases by when ploting graph
    register unsigned short iMax = 32;        //max iterations
    register unsigned short i;                //iterations

    while (TRUE) {
        register char* vram = ML_vram_adress();

        SetTimer(1, 200, stop);
        ML_clear_vram();
        ML_display_vram();

        xIsz = (x2 - x1) / 128;
        yIsz = (y2 - y1) / 64;

        y = y1;
        for (dispY = 0; dispY < 64; dispY++) {
            x = x1;
            y += yIsz;
            for (dispX = 0; dispX < 128; dispX++) {
                zr = x;
                zi = y;
                for (i = 0; i < iMax; i++) {
                    zr2 = zr * zr;
                    zi2 = zi * zi;
                    if (zr2 + zi2 > 4)
                        break;
                    zi = zr * zi;
                    zi += zi + y;
                    zr = zr2 - zi2 + x;
                }
                tempPixel = (tempPixel << 1) | (i == iMax);
                if ((dispX & 7) == 7)
                    *vram++ = tempPixel;
                x += xIsz;
            }
            ML_display_vram_row(dispY);
        }
        SaveDisp(1);
        KillTimer(1);
        screenX = 0;
        screenY = 0;
        screenZoom = 1;
        Bkey_GetKeyWait(&kcode1, &kcode2, 2, 1, 1, &unused);
        do {
            GetKey(&key);
            screenMove = screenZoom > 4 ? 1 : divByPow(16, 2, screenZoom);
            graphMove = screenZoom > 4 ? divByPow(1, 2, graphZoom - (double)screenZoom) : divByPow(16, 2, graphZoom);
            switch (key) {
                case KEY_CHAR_PLUS:
                    if (graphZoom < 51) {
                        graphZoom++;
                        screenZoom++;
                    }
                    break;
                case KEY_CHAR_MINUS:
                    if (graphZoom) {
                        graphZoom--;
                        screenZoom--;
                    }
                    break;
                case KEY_CTRL_UP:
                    screenY -= screenMove;
                    graphY -= graphMove;
                    break;
                case KEY_CTRL_DOWN:
                    screenY += screenMove;
                    graphY += graphMove;
                    break;
                case KEY_CTRL_LEFT:
                    screenX -= screenMove;
                    graphX -= graphMove;
                    break;
                case KEY_CTRL_RIGHT:
                    screenX += screenMove;
                    graphX += graphMove;
                    break;
                case KEY_CTRL_F1:
                    HUD = !HUD;
                    break;
                case KEY_CTRL_F2:
                    if (colour)
                        colour--;
                    else
                        colour = ML_XOR;
                    break;
                case KEY_CTRL_F3:
                    //Gray scale, by refreshing screen multiple times per sec at different max iterations (iMax)
                    break;
                case KEY_CTRL_AC:
                    graphZoom = 1;
                    graphX = 0;
                    graphY = 0;
                    screenZoom = 1;
                    screenX = 0;
                    screenY = 0;
                    key = KEY_CTRL_EXE;
                    break;
            }
            RestoreDisp(1);
            iMax = 8 * (graphZoom + 3);

            if (screenZoom < 8) {
                screenX1 = 65 - divByPow(128, 2, screenZoom) + screenX;
                screenX2 = 62 + divByPow(128, 2, screenZoom) + screenX;
                screenY1 = 32 - (screenZoom > 6 ? 1 : divByPow(64, 2, screenZoom)) + screenY;
                screenY2 = 31 + (screenZoom > 6 ? 0 : divByPow(64, 2, screenZoom)) + screenY;
                ML_horizontal_line(screenY1, screenX1, screenX2, colour);
                ML_horizontal_line(screenY2, screenX1, screenX2, colour);
                ML_vertical_line(screenX1 - 1, screenY1, screenY2, colour);
                ML_vertical_line(screenX2 + 1, screenY1, screenY2, colour);
            } else
                ML_pixel(screenX + 64, screenY + 31, colour);

            x1 = divByPow(-4, 2, graphZoom) + (0.03125 * graphX);
            x2 = divByPow(4, 2, graphZoom) + (0.03125 * graphX);
            y1 = divByPow(-2, 2, graphZoom) + (0.03125 * graphY);
            y2 = divByPow(2, 2, graphZoom) + (0.03125 * graphY);

            if (HUD == TRUE) {
                sprintf(&string, "X1:%f", x1);
                PrintMini(0, 0, string, 0);
                sprintf(&string, "Y1:%f", y1);
                PrintMini(0, 6, string, 0);
                sprintf(&string, "X2:%f", x2);
                PrintMini(81, 53, string, 0);
                sprintf(&string, "Y2:%f", y2);
                PrintMini(81, 59, string, 0);
                sprintf(&string, "MaxI:%u", iMax);
                PrintMini(0, 53, string, 0);
                if (graphZoom > 32)
                    sprintf(&string, "Zoom:2^%ux", graphZoom - 1);
                else
                    sprintf(&string, "Zoom:%ux", (int)divByPow(1, 2, -graphZoom + 1));
                PrintMini(0, 59, string, 0);
            }

            ML_display_vram();

        } while (key != KEY_CTRL_EXE);
    }
    return 0;
}


#pragma section _BR_Size
unsigned long BR_Size;
#pragma section
#pragma section _TOP
int InitializeSystem(int isAppli, unsigned short OptionNum) {
    return INIT_ADDIN_APPLICATION(isAppli, OptionNum);
}
#pragma section


Fichier joint


Précédente 1, 2, 3, 4, 5, 6, 7, 8 Suivante
Lephenixnoir Hors ligne Administrateur Points: 24668 Défis: 170 Message

Citer : Posté le 08/11/2019 23:09 | #


Of course, this is the normal pass-by-value mechanism.

You can either return a value in r0:

_add50: # int add50(int)
    add #50, r4
    rts
    mov r4, r0

Or you can take a pointer as argument:

_add50: # void add50(int *)
    mov.l @r4, r0
    add #50, r0
    rts
    mov.l r0, @r4

In both of these exemples the mov that follows the rts is executed before the function returns due do the delay slot mechanism.
Mon graphe (11 Avril): ((Rogue Life || HH2) ; PythonExtra ; serial gint ; Boson X ; passe gint 3 ; ...) || (shoutbox v5 ; v5)
Redcmd Hors ligne Membre Points: 380 Défis: 7 Message

Citer : Posté le 08/11/2019 23:20 | #


I tried both ways and it doesn't seem to do anything

mandel.lst

        Mandel.c    61            int testVar = 27;
    0000010C E31B                   MOV         #27,R3

        Mandel.c    67            while (1) {
    00000118              L341:                  

        Mandel.c    70                    foo(&testVar);
    0000011E 64F3                   MOV         R15,R4
    00000120 D336                   MOV.L       L351+6,R3  ; _foo
    00000122 1F03                   MOV.L       R0,@(12,R15)
    00000124 430B                   JSR         @R3
    00000126 7408                   ADD         #8,R4

//some other code is here, printing testVar to screen

    000001F2 AF91                   BRA         L341


mandelsrc.src
.EXPORT _foo

.ALIGN    4

_foo:
    add #5, r4
    rts
    mov r4, r0
    
.ALIGN    4

.END



Ajouté le 08/11/2019 à 23:27 :
oops was modifying the wrong file
Im using Microsoft visual studio to edit the file (has syntax highlighting etc)
and when I renamed the .src file, I forgot to update it in visual studio

working now
Lephenixnoir Hors ligne Administrateur Points: 24668 Défis: 170 Message

Citer : Posté le 09/11/2019 08:38 | #


Glad to see you've got it sorted out.
Mon graphe (11 Avril): ((Rogue Life || HH2) ; PythonExtra ; serial gint ; Boson X ; passe gint 3 ; ...) || (shoutbox v5 ; v5)
Redcmd Hors ligne Membre Points: 380 Défis: 7 Message

Citer : Posté le 09/11/2019 10:08 | #


Does the SDK list the code in the .src file in the .lst file?
because I dont see any of my code in it
theres no mul codes in the .lst file, while my .src file has heaps of them
I wanna compare the amount of codes the SDK has vs how many I have, but it doesn't have the final converted codes

mandel.lst
     Mandel.c    85                   innerLoop(&zr, &zi, &zr2, &zi2);
    0000015C D22A                   MOV.L       L347+24,R2 ; _innerLoop
    0000015E 67F3                   MOV         R15,R7
    00000160 770C                   ADD         #12,R7
    00000162 66F3                   MOV         R15,R6
    00000164 7608                   ADD         #8,R6
    00000166 65F3                   MOV         R15,R5
    00000168 64F3                   MOV         R15,R4
    0000016A 420B                   JSR         @R2
    0000016C 7404                   ADD         #4,R4

mandelsrc.src
_innerLoop:
    mov.l   #-13,   r3
    mov.l   @r4,    r0
    shad    r3,     r0
    mov.l   r0,     @r4
    mov.l   @r5,    r1
    shad    r3,     r1
    mov.l   r1,     @r5
    muls.l  r0,     r0
    sts     macl,   r0
    mov.l   r0,     @r6
    muls.l  r1,     r1
    sts     macl,   r1
    mov.l   r1,     @r7
    rts
    nop

Lephenixnoir Hors ligne Administrateur Points: 24668 Défis: 170 Message

Citer : Posté le 09/11/2019 10:21 | #


This is because you function cannot be inlined. As you can see, the first line of the listing loads the address of innerLoop into r2 and the second-to-last line jumps to r2 as a subroutine call with jsr. So you assembler function is being called as a normal function.

It is not easy for a compiler to inline assembler functions and it is very probably beyond the capabilities of the SDK's compiler. If you want the code that calls the inner loop to be in the same function as the code that executes it, you need to write both in assembler. Eventually you should reach a level where paying a function call has little to no performance impact and you can stop moving things to assembler code.
Mon graphe (11 Avril): ((Rogue Life || HH2) ; PythonExtra ; serial gint ; Boson X ; passe gint 3 ; ...) || (shoutbox v5 ; v5)
Redcmd Hors ligne Membre Points: 380 Défis: 7 Message

Citer : Posté le 09/11/2019 10:29 | #


does @(8,R15) mean postion 8 in register 15? (AKA the stack)
(MOV.L R2,@(8,R15))

is 'mov' the only code that can handle #13, #H'13 and @r4 values vs just r0 - r14

Lephenixnoir a écrit :
If you want the code that calls the inner loop to be in the same function as the code that executes it, you need to write both in assembler. Eventually you should reach a level where paying a function call has little to no performance impact and you can stop moving things to assembler code

So I write the entire critical loop in assembly?
and the single function call to start it (all 3 for loops) shouldn't cause any performace impact
Lephenixnoir Hors ligne Administrateur Points: 24668 Défis: 170 Message

Citer : Posté le 09/11/2019 10:57 | #


does @(8,R15) mean postion 8 in register 15? (AKA the stack)

It means the data in memory at address (r15+8). Note that 8 is a number of bytes, so if you accidentally try fi. mov.l @(2,r15),r0, this will fail because r15+2 is 2-aligned.

is 'mov' the only code that can handle #13, #H'13 and @r4 values vs just r0 - r14

Not many instructions support directly accessing pointer memory (@ mode). Many instruction support immediate operands (# mode), but often you have to use them with r0. r0 is a bit special because it is used as a "special-purpose register" when instructions do not support all registers.

For instance, you can write or #2, r0 but not or #2, r1.

So I write the entire critical loop in assembly?
and the single function call to start it (all 3 for loops) shouldn't cause any performace impact

Yes, you can proceed this way.
Mon graphe (11 Avril): ((Rogue Life || HH2) ; PythonExtra ; serial gint ; Boson X ; passe gint 3 ; ...) || (shoutbox v5 ; v5)
Redcmd Hors ligne Membre Points: 380 Défis: 7 Message

Citer : Posté le 10/11/2019 01:39 | #


How to safely read and write to r15 without overwriting anything?
and how to access the 5th 6th etc variables from r15?
Whats @-R15?
MOV.L       R0,@-R15


how freely can I use r5-r7 and r8-r14?

is using a short instead of a int slower?
because I noticed this bit of code only shows when its a short
EXTU.W      R6,R3


it was working fine for ages
then all of a sudden Im getting this error
even if the mandel.asm file is empty
    WARNING: The following dependant file(s) do not exist: "D:\Documents\CASIO\fx-9860G SDK\Mandel\Debug\Mandelassmbly.obj". Line: 102
"D:\Documents\CASIO\fx-9860G SDK\_fx-9860G SDK\OS\SH\bin\Optlnk.exe" -subcommand=C:\Users\RedCMD\AppData\Local\Temp\hmk35AE.tmp

** L3300 (F) Cannot open file : "D:\Documents\CASIO\fx-9860G SDK\Mandel\Debug\Mandelassmbly.obj"

Lephenixnoir Hors ligne Administrateur Points: 24668 Défis: 170 Message

Citer : Posté le 10/11/2019 10:48 | #


How to safely read and write to r15 without overwriting anything?

Since r15 is the stack pointer, its value should never really change except by small amounts or making space for local variables. It should always be 4-aligned because interrupt handlers will assume this.

and how to access the 5th 6th etc variables from r15?

The first longword at the top of the stack is @r15, the second one is @(4,r15), the third is @(8,r15) and so on.

mov.l r0,@-r15 decrements r15 by 4 bytes then writes r0 at the resulting address. This is, in essence, a stack push. Be careful as this changes the offsets (the variable at @(4,r15) is now at @(8,r15), which is easy to forget).

how freely can I used r5-r7 and r8-r14?

You can freely use r0 to r7 provided that you don't need their values (ie. if you trash the arguments then don't complain). For r8 to r14, you need to save them and restore them at the end of the function, typically like this:

_function:
  mov.l r8, @-r15
  mov.l r9, @-r15

  # Do things...
  mov r4, r8
  mov #0, r9

  # Restore and leave
  mov.l @r15+, r9
  rts
  mov.l @r15+, r8

is using a short instead of a int slower?

Using shorts is not slower by essence, but when you load a short from memory only the low 16 bits of the target register are updated. For instance:

mov #-1, r0 # r0 = 0xffffffff
mov #4, r1
mov.w r1, @r5
mov.w @r5, r0 # r0 = 0xffff0004

The extu.w instructions means ExTend Unsigned from Word and fills the high 16 bits with zeros. This is necessary unless you can prove that the high 16 bits are already zero. This is a trivial instruction that executes fast.

On SH4, when you are using an array, it is better to use an array of shorts and use this instruction because the array will be much smaller so you will have less cache misses. Generally memory is the bottleneck, I'd say you need 4/5 CPU cycles to equate a memory cycle, although don't take my word on that.

then all of a sudden Im getting this error

Maybe try to Rebuild All (Project » Rebuild All) or delete the Debug folder and rebuild.
Mon graphe (11 Avril): ((Rogue Life || HH2) ; PythonExtra ; serial gint ; Boson X ; passe gint 3 ; ...) || (shoutbox v5 ; v5)
Redcmd Hors ligne Membre Points: 380 Défis: 7 Message

Citer : Posté le 10/11/2019 10:56 | #


Thanks for all your help
even tho I havn't done anything in return

still getting the error
If I rename to mandel.src it works, but then theres no syntax highlighting in visual studio
and now its working again :|

when I reinstall the SDK, it still remembers what my recent projects were
is there some cache it hides somewhere on my computer?
Lephenixnoir Hors ligne Administrateur Points: 24668 Défis: 170 Message

Citer : Posté le 10/11/2019 11:01 | #


Don't mention it, that's what this place is for

still getting the error
If I rename to mandel.src it works, but then theres no syntax highlighting in visual studio

Looks like the file names in your project are a bit messed up. According to the log the SDK seems to expect Mandelassmbly.src.

Regarding the extension, you don't have much of a choice because AFAIK if your file is not named .src the SDK will not try to compile it as assembler. I would have suggested to use a symbolic link if such a thing existed...

Recent project is definitely in a sort of cache, but I don't know where. It might be in the Windows registry. If not, then check the SDK files (typically C:\Program Files\CASIO\fx-9860G SDK) just in case something happens to look like it.
Mon graphe (11 Avril): ((Rogue Life || HH2) ; PythonExtra ; serial gint ; Boson X ; passe gint 3 ; ...) || (shoutbox v5 ; v5)
Redcmd Hors ligne Membre Points: 380 Défis: 7 Message

Citer : Posté le 10/11/2019 11:06 | #


well its working now
the symbolic link works fine
after unistalling the SDK (a few days ago, not right now)
I went and search my entire disk for all casio related files (just by doing a CTRL+F "casio") and also looking at the most common places where it could be
So it must be in registry then

Mandelassmbly.src.
how did you do the mono-spaced font?
Lephenixnoir Hors ligne Administrateur Points: 24668 Défis: 170 Message

Citer : Posté le 10/11/2019 11:08 | #


Well, as long as it works, I guess it's fine...

For the code you can use the backtick ` (U+0060) or the [inlinecode] tag (useful if you are trying to write literal backticks).
Mon graphe (11 Avril): ((Rogue Life || HH2) ; PythonExtra ; serial gint ; Boson X ; passe gint 3 ; ...) || (shoutbox v5 ; v5)
Redcmd Hors ligne Membre Points: 380 Défis: 7 Message

Citer : Posté le 11/11/2019 04:31 | #


SDK doesn't update the mandelasm.src file when I edit it from visual basic. - maybe cause Im using a symlink? (mandelsrc.asm)
reload and rebuild all doesn't work, have to restart the SDK
it restarts in less than a sec, so isn't too bad
even removing and re-adding the file doesn't work
EDIT
rebuild all gets the correct updated file, while the SDK still thinks the file hasn't been changed - so I don't have to restart it
Is there way to map build to rebuild all?

I got the entire critical loop code converted into assembly
its also about 14% faster too
Gonna work on making it 64bits now

mandelasm.src
.EXPORT _mainLoop

.ALIGN    4


_mainLoop:                      ;mainLoop(xStart, yStart, Isz, *vram, iMax)  //row, col, x, y, zr2, zi2
                                ;             r4,     r5,  r6,    r7,   r8   // r9, r10,11,12,  13,  14
                         ;temp, zr, zi, const, tempPixel
                         ;  r0, r1, r2,    r3,        r5
    mov.l   r8,         @-r15
    mov.l   r9,         @-r15
    mov.l   r10,        @-r15
    mov.l   r11,        @-r15
    mov.l   r12,        @-r15
    mov.l   r13,        @-r15
    mov.l   r14,        @-r15
    mov.l   #64,        r9      ;row = 64
    mov.l   r5,         r12     ;y = yStart
    mov.l   #0,         r5      ;tempPixel = 0      //r5 not yStart anymore

_row:                           ;for (row = 64; row > 0; row--) {
    mov.l   #128,       r10     ;col = 128
    mov.l   r4,         r11     ;x = xStart
    add     r6,         r12     ;y += Isz

_col:                           ;for (col = 128; col > 0; col--) {
    mov.l   r11,        r1      ;zr = x
    mov.l   r12,        r2      ;zi = y
    mov.l   @(28,r15),  r8      ;i = iMax

_innerLoop:                     ;for (i = iMax; i > 0; i--) {
    mov.l   #-13,       r3
    shad    r3,         r1      ;zr >>= 13
    shad    r3,         r2      ;zi >>= 13

    mul.l   r1,         r1      ;zr * zr
    sts     macl,       r13     ;zr2 = zr * zr

    mul.l   r2,         r2      ;zi * zi
    sts     macl,       r14     ;zi2 = zi * zi
    
    mov.l   #268435456, r3      ;0x10000000
    mov.l   r13,        r0
    add     r14,        r0
    cmp/gt  r3,         r0      ;if (zr2 + zi2 > 0x10000000)
    bt      _exitInnerLoop      ;break
    nop

    mul.l   r1,         r2
    sts     macl,       r2      ;zi *= zr

    shll    r2
    add     r12,        r2      ;zi += zi + y

    sub     r14,        r13     ;zr2 - zi2
    add     r11,        r13     ; + x
    mov.l   r13,        r1      ;zr = zr2 - zi2 + x

    dt      r8                  ;if(i = 0)
    bf      _innerLoop          ;then break; else jmp _innerLoop
    nop

_exitInnerLoop:
    shll    r5                  ;tempPixel << 1
    tst     r8,         r8      ;i == 0
    movt    r0
    or      r0,         r5      ;tempPixel = (tempPixel << 1) | (i == 0)

    mov.l   r10,        r0      ;col
    and     #7,         r0      ;col & 7
    cmp/eq  #1,         r0      ;if((col & 7) == 1)
    bf      _bypassVRAM
    nop

    mov.b   r5,         @r7     ;*vram = tempPixel
    add     #1,         r7      ;vram++
    
_bypassVRAM:
    add     r6,         r11     ;x += Isz

    dt      r10                 ;if(col = 0)
    bf      _col                ;then break; else jmp _col
    nop

    dt      r9                  ;if(row = 0)
    bf      _row                ;then break; else jmp _row
    nop

;_end:
    mov.l   @r15+,      r14
    mov.l   @r15+,      r13
    mov.l   @r15+,      r12
    mov.l   @r15+,      r11
    mov.l   @r15+,      r10
    mov.l   @r15+,      r9
    mov.l   @r15+,      r8
    rts
    nop


.ALIGN    4

.END


Is it safe to have 0x88004991 in mainLoop(xStart, yStart, Isz, 0x88004991, iMax)? (mandel.c)
the number comes from *vram, tho I couldn't figure out how to place it in there
got super confused with double pointers, arrays and memory swapping
mandel.c
#define ML_vram_adress (*(sc_cpv)sc0135)
typedef char* (*sc_cpv)(void);
const unsigned int sc0135[] = { 0xD201D002, 0x422B0009, 0x80010070, 0x0135 };
char* vram = ML_vram_adress();


What sort of opitmizations could I do?
I can move mov.l @r15+, r8 down to replace the nop, tho I couldn't see any others I could do that to
I think its possible to drop a few of the registers
Lephenixnoir Hors ligne Administrateur Points: 24668 Défis: 170 Message

Citer : Posté le 11/11/2019 09:53 | #


Redcmd a écrit :
SDK doesn't update the mandelasm.src file when I edit it from visual basic. - maybe cause Im using a symlink? (mandelsrc.asm)

As I understand, you are editing a .asm file with an external program. What you must do is update the .src file in some way. Either you can edit it directly. Or you can copy it every time you save it. Or maybe there is a sort of symbolinc link mechansim in Windows that can help you have two different names for the same file.

My advice is to just have one .src file and instruct your text editor to highlight it as assembler. If your editor is sufficiently powerful this should be easy to do. My reasoning is that it is easier to bend a powerful text editor than to bend the limited SDK.

Is there way to map build to rebuild all?

I don't think so, but back when I used the SDK I used to run Rebuild All by keyboard shortcut, something like Alt+P, U IIRC. This might be an acceptable compromise for you.

I got the entire critical loop code converted into assembly
its also about 14% faster too

Good job!

This contributes to showing that the SDK's compiler does not perform a lot of optimizations.

Here are a few comments I hope can help you:

    dt      r9                  ;if(row = 0)
    bf      _row                ;then break; else jmp _row
    nop

The bf and bt insructions are not delay slots. The delay slots equivalents are bf.s and bt.s. This means the nop is currently useless. If possible, you will gain 1 cycle by using bf.s and inserting a useful instruction in the delay slot. The same applies here:

    bt      _exitInnerLoop      ;break
    nop

Except that if the condition is not taken, you will lose one cycle in the inner loop executing a nop, which is sad.

    mov.l   #128,       r10     ;col = 128
    mov.l   r4,         r11     ;x = xStart

I don't know about the SDK, but this syntax is not correct in GCC. The .l suffix is only used to indicate the size of memory accesses. Here you don't have any memory access, and you don't have the choice of the size. The first instruction sign-extends the constant from 8 bits to 32 bits and assigns the entirety of r10. The second instructions copies the entirety of r4 to r11. Both are normally written mov instead of mov.l.

    mul.l   r1,         r1      ;zr * zr
    sts     macl,       r13     ;zr2 = zr * zr

    mul.l   r2,         r2      ;zi * zi
    sts     macl,       r14     ;zi2 = zi * zi

The multiplier needs 3 to 5 cycles to compute its result. If you access the macl or mach register just after multiplying, you will wait 5 cycles. If you can find two instructions to place in-between, you will wait only 3 cycles. In other words there is a pipeline bubble here only waiting to be filled.

    mov.l   #268435456, r3      ;0x10000000

Again, I don't know what the SDK allows, but you cannot directly to this in the instruction set. The SuperH has 16-bit instructions, so it should be obvious that a 32-bit operand cannot fit in. How this works normally is, the constant is placed in memory below the program and the special addressing mode @(disp,pc) is used. This instructs the processor to read the operand a certain distance away from the instruction. In assembly code, a label is normally used. The assembler will turn the label into the appropriate @(disp,pc) command. You should be aware that an additional memory access occurs on this line.

    shll    r5                  ;tempPixel << 1
    tst     r8,         r8      ;i == 0
    movt    r0
    or      r0,         r5      ;tempPixel = (tempPixel << 1) | (i == 0)

There is a cool optimization here.

Instead of shll you can use the rotcl instruction which rotates left. The cool thing about this instruction is, it rotates with T: the highest bit goes to T and T becomes the lowest bit. So you can compute T with the tst instruction then shift and insert it in a single line with:

tst r8, r8
rotcl r5

You mastered the assembler principles really fast. I am impressed

Is it safe to have 0x88004991 in mainLoop(xStart, yStart, Isz, 0x88004991, iMax)? (mandel.c)
the number comes from *vram, tho I couldn't figure out how to place it in there
got super confused with double pointers, arrays and memory swapping

You should not hardcode the VRAM address. Please use the syscall, either as you did in C:

#define ML_vram_adress (*(sc_cpv)sc0135)
typedef char* (*sc_cpv)(void);
const unsigned int sc0135[] = { 0xD201D002, 0x422B0009, 0x80010070, 0x0135 };
char* vram = ML_vram_adress();

Or properly in assembler (the syntax might need to be changed a bit for the SDK):

_ML_vram_address:
  mov.l  syscall, r1
  mov.l  vram_addr, r0
  jmp    @r1
  nop

syscall:
  .long #h'80010070
vram_addr:
  .long #h'00000135


I can move mov.l @r15+, r8 down to replace the nop, tho I couldn't see any others I could do that to
I think its possible to drop a few of the registers

You can do this, but remember that the critical part is the inner loop. Pushing and popping a few registers just once is nothing. I think a decent approach to optimizing the inner loop is reordering instructions to use the pipeline more. You have delays between multiplications and accesses to macl, unused delay slots in jumps, and when the code is really good and won't change anymore (after you finish with 64-bit that is), you can start putting memory accessed on 2-aligned addresses to improve the pipeline flow by reducing memory contention.

This is becoming really interesting. Probably the most optimized Mandelbrot viewer out there. Keep it up
Mon graphe (11 Avril): ((Rogue Life || HH2) ; PythonExtra ; serial gint ; Boson X ; passe gint 3 ; ...) || (shoutbox v5 ; v5)
Redcmd Hors ligne Membre Points: 380 Défis: 7 Message

Citer : Posté le 11/11/2019 20:01 | #


As I understand, you are editing a .asm file with an external program.
Correct. and I'm using a symlink that points to mandelsrc.asm that is called mandelasm.src which is used by the SDK

The bf and bt insructions are not delay slots. The delay slots equivalents are bf.s and bt.s. This means the nop is currently useless.
Removing the nop speeds it up. But if a put a bf.s in instead, I cant see any instruction that could fit in there

The multiplier needs 3 to 5 cycles to compute its result. If you access the macl or mach register just after multiplying, you will wait 5 cycles. If you can find two instructions to place in-between, you will wait only 3 cycles. In other words there is a pipeline bubble here only waiting to be filled.
Am I able to interweave two mul.l codes together?
mul.l  r0,  r0
mul.l  r1,  r1
sts  macl,  r0
sts  macl,  r1


Instead of shll you can use the rotcl instruction which rotates left. The cool thing about this instruction is, it rotates with T: the highest bit goes to T and T becomes the lowest bit. So you can compute T with the tst instruction then shift and insert it in a single line with:
Nice! Thats 50% faster

You mastered the assembler principles really fast. I am impressed
Thank You!

you can start putting memory accessed on 2-aligned addresses to improve the pipeline flow by reducing memory contention.
O.o
Lephenixnoir Hors ligne Administrateur Points: 24668 Défis: 170 Message

Citer : Posté le 11/11/2019 20:11 | #


Redcmd a écrit :
Correct. and I'm using a symlink that points to mandelsrc.asm that is called mandelasm.src which is used by the SDK

As long as it works I don't see a problem. I didn't know that symlinks existed on Windows.

Removing the nop speeds it up. But if a put a bf.s in instead, I cant see any instruction that could fit in there

Well looking at the inner loop there is certainly stuff that can be moved:

    sub     r14,        r13     ;zr2 - zi2
    add     r11,        r13     ; + x
    mov.l   r13,        r1      ;zr = zr2 - zi2 + x

    dt      r8                  ;if(i = 0)
    bf      _innerLoop          ;then break; else jmp _innerLoop
    nop

Why not just this?

    sub     r14,        r13     ;zr2 - zi2
    add     r11,        r13     ; + x

    dt      r8                  ;if(i = 0)
    bf.s    _innerLoop          ;then break; else jmp _innerLoop
    mov.l   r13,        r1      ;zr = zr2 - zi2 + x

Am I able to interweave two mul.l codes together?

Oh no you can't. You should see from the code that r0 and r1 will be equal. The first product result will be lost.

However you do have inpedendent stuff going on:

    mov.l   #-13,       r3
    shad    r3,         r1      ;zr >>= 13
    shad    r3,         r2      ;zi >>= 13

    mul.l   r1,         r1      ;zr * zr
    sts     macl,       r13     ;zr2 = zr * zr

Since the value of r2 is not needed for the first product, you can write for instance:

    mov.l   #-13,       r3
    shad    r3,         r1      ;zr >>= 13

    mul.l   r1,         r1      ;zr * zr
    shad    r3,         r2      ;zi >>= 13
    sts     macl,       r13     ;zr2 = zr * zr

you can start putting memory accessed on 2-aligned addresses to improve the pipeline flow by reducing memory contention.
O.o

Yeah this is pretty elaborate already. You should know that you can execute at most one memory access per CPU cycle (and if it goes to RAM it takes even longer). Memory accesses are already needed to fetch the instructions. However, instructions are 16-bit and the memory bus is 32-bit wide so the processor can read instructions by pairs. This happens when the program counter is 4-aligned.

This means that exactly half of the cycles already have memory accesses before your code can even do anything. If the memory access of one of your mov.l (or similar) instructions falls on the same pipeline slot as an automatic instruction fetch, then contention occurs and one of the accesses will be delayed. Basically you lose a cycle or more.

The trick to avoid this is to arrange your instructions so that their memory access stages (MA) do not happen during instruction fetches (IF) but between them. Since MA occurs 3 cycles later than IF, the instruction that accesses memory must be placed on a 4-aligned boundary (I said 2-aligned earlier but it was wrong, sorry.)

You should only do that when the code is final because any instruction insertion will mess up your work on contention. But that's something you should know
Mon graphe (11 Avril): ((Rogue Life || HH2) ; PythonExtra ; serial gint ; Boson X ; passe gint 3 ; ...) || (shoutbox v5 ; v5)
Redcmd Hors ligne Membre Points: 380 Défis: 7 Message

Citer : Posté le 11/11/2019 20:17 | #


Why not just this?
oops, was thinking about the other branches at the time

something like Alt+P, U
ALT + P, U works

Oh no you can't.
thought so

Ajouté le 12/11/2019 à 08:46 :
Well I got an unoptimized N*M 64bit multiplier working
Even supports signed numbers!
Will have to work on a squarer afterwards N^2
The decimal point is 6bits right from the left 6:58 Int:Frac
so 0x04000000 = 1

Let me try to optimize it first please? :P

mandel.c
        //If n neg, both nH and nL have to be neg (same for m)
        //               nH,            nL,             mH,          mL,   Low,   Mid,     High, BelowLow
        mul64(-0x04000000, -0x04000000, -0x04000000, -0x04000000, &out1, &out2, &out3, &out4);


mandelasm.src
_mul64:                         ;mul64(nH, nL, mH, mL, &out1, &out2, &out3, &out4)
                                ;      r4, r5, r6, r7,    r8,    r9,    10,    11
;  12 * 34    =  10*30 +  2*30 + 10*4  +  2*4   =  300 + 60 + 40 + 8  =  408
;nHnL * mHmL  =  nH*mH + nL*mH + nH*mL + nL*mL  =  
;Decimal point is 6bits in from the left    6:58    Int:Frac
    mov.l   r8,         @-r15
    mov.l   r9,         @-r15
    mov.l   r10,        @-r15
    mov.l   r11,        @-r15
    mov     #0,         r1
    mov     #0,         r2
    mov     #0,         r3
    mov     #0,         r8

    dmuls.l r5,         r7      ;nL * mL
    sts     macl,       r0
    add     r0,         r8
    sts     mach,       r0
    add     r0,         r1

    clrt
    dmuls.l r5,         r6      ;nL * mH
    sts     macl,       r0
    addc    r0,         r1
    sts     mach,       r0
    addc    r0,         r2
    movt    r0
    add     r0,         r3

    clrt
    dmuls.l r4,         r7      ;nH * mL
    sts     macl,       r0
    addc    r0,         r1
    sts     mach,       r0
    addc    r0,         r2
    movt    r0
    add     r0,         r3

    clrt
    dmuls.l r4,         r6      ;nH * mH
    sts     macl,       r0
    addc    r0,         r2
    sts     mach,       r0
    addc    r0,         r3
                                ;XXXXXXXX YYYYYYYY ZZZZZZZZ WWWWWWWW
                                ;  XXXXXX XXYYYYYY YYZZZZZZ ZZWWWWWW WW
                                ;Not to scale. Example is shifted 8bits, while actual is 6bits
    mov.l   #6,         r0
    mov.l   #-26,       r9      ;-(32 - 6)

    mov.l   r3,         r6
    shad    r0,         r3      ;XXXXXX
    shad    r9,         r6      ;      XX
    
    mov.l   r2,         r5
    shad    r0,         r2      ;YYYYYY
    shad    r9,         r5      ;      YY
    
    mov.l   r1,         r4
    shad    r0,         r1      ;ZZZZZZ
    shad    r9,         r4      ;      ZZ
    
    mov.l   r8,         r7
    shad    r0,         r8      ;WWWWWW
    shad    r9,         r7      ;      WW

    ;add     #0,         r3
    add     r6,         r2
    add     r5,         r1
    add     r4,         r8
    ;add     #0,         r7


    mov.l   @(16,r15),  r4
    mov.l   @(20,r15),  r5
    mov.l   @(24,r15),  r6
    mov.l   @(28,r15),  r7
    mov.l   r3,         @r6     ;High
    mov.l   r2,         @r5     ;Mid
    mov.l   r1,         @r4     ;Low
    mov.l   r8,         @r7     ;BelowLow
    mov.l   @r15+,      r11
    mov.l   @r15+,      r10
    mov.l   @r15+,      r9
    mov.l   @r15+,      r8
    rts
    nop                         ;Nothing is optimized (yet)


mandelasm.src
_sum64:                         ;sum64(nH, nL, mH, mL, &out1, &out2, &out3)
    mov.l   r8,         @-r15
    mov.l   r9,         @-r15
    mov.l   r10,        @-r15

    clrt
    addc    r5,         r7      ;nL + mL
    addc    r4,         r6      ;nH + mH + Carry

    mov.l   @(12,r15),  r0      ;out1
    mov.l   @(16,r15),  r1      ;out2
    mov.l   @(20,r15),  r2      ;out3

    mov.l   r7,         @r0     ;Low
    mov.l   r6,         @r1     ;High
    movt    r5
    mov.l   r5,         @r2     ;Carry/Overflow

    mov.l   @r15+,      r10
    mov.l   @r15+,      r9
    mov.l   @r15+,      r8
    rts
    nop
only the addc part will be used in mandel (too many mov.l's )
Lephenixnoir Hors ligne Administrateur Points: 24668 Défis: 170 Message

Citer : Posté le 12/11/2019 09:10 | #


Alright, here's a tip first. Do you know Karatsuba's formula? It's a multiplication trick that lets you compute the product with 3 multiplications instead of 4. I think this would be the first, algorithmic optimization here

Also, are you sure you need the bottom 32 bits (out4)? In a 6:58 × 6:58 product, you get a 12:116 number then you drop 6 bits at the top and 58 bits at the bottom. So out4 is off-limits of the final result.

        //If n neg, both nH and nL have to be neg (same for m)

Hmm... I don't think this is true. A 64-bit integer might be negative but still have a zero at bit 31. If you want to represent -1 in you format this would be -0x04000000, -1, right?

As usual, you can spare time by doing stuff while the multiplier works. Also you don't have to set r1 to 0 then add r0; just copy.

    mov     #0,         r1
    mov     #0,         r2
    mov     #0,         r3
    mov     #0,         r8

    dmuls.l r5,         r7      ;nL * mL
    sts     macl,       r0
    add     r0,         r8
    sts     mach,       r0
    add     r0,         r1

This can easily become

    dmuls.l r5,         r7      ;nL * mL
    mov     #0,         r2
    mov     #0,         r3
    sts     macl,       r8
    sts     mach,       r1

The same tricks apply everywhere else. Once you have the result of a multiplication, don't addc just yet: immediately start the next multiplication then addc in the meantime!

Things might not be clear in my head right now but I'm pretty sure you should shift differently, namely 6 bits to the left, then return the top two longwords.

The sum looks good to me. The take-home message is that you can add 64-bit integers with just 2/3 instructions. I think the multiplication can be made shorter, although you do have overhead for the shift
Mon graphe (11 Avril): ((Rogue Life || HH2) ; PythonExtra ; serial gint ; Boson X ; passe gint 3 ; ...) || (shoutbox v5 ; v5)
Redcmd Hors ligne Membre Points: 380 Défis: 7 Message

Citer : Posté le 13/11/2019 05:56 | #


Do you know Karatsuba's formula?
Funny thing is I read about that a few days ago, tho didnt think much of it at the time
But it does need a 33bit multiplier
or extra logic for the last bit

Also, are you sure you need the bottom 32 bits (out4)? In a 6:58 × 6:58 product, you get a 12:116 number then you drop 6 bits at the top and 58 bits at the bottom. So out4 is off-limits of the final result.
Yea, I hadn't opitmized anything yet. Was trying to get it all working first. Then I'll make it unreadable when optimizing it
Also you don't have to set r1 to 0 then add r0; just copy.
same thing

The first dmuls.l r5, r7 wont even get used when I drop the Low and BelowLow (out1 and out4)
all outputs get discarded
I only had it there for debugging

Ajouté le 13/11/2019 à 05:59 :
Wait...
Apart from the carry
there could be a carry from the lowest mul that could come up
eh. doesn't matter, the number is soo small anyways
The speed up would be much better
Lephenixnoir Hors ligne Administrateur Points: 24668 Défis: 170 Message

Citer : Posté le 13/11/2019 06:00 | #


there could be a carry from the lowest mul that could come up

That's correct. You need the lowest mul, you just don't need the macl of that mul
Mon graphe (11 Avril): ((Rogue Life || HH2) ; PythonExtra ; serial gint ; Boson X ; passe gint 3 ; ...) || (shoutbox v5 ; v5)
Précédente 1, 2, 3, 4, 5, 6, 7, 8 Suivante

LienAjouter une imageAjouter une vidéoAjouter un lien vers un profilAjouter du codeCiterAjouter un spoiler(texte affichable/masquable par un clic)Ajouter une barre de progressionItaliqueGrasSoulignéAfficher du texte barréCentréJustifiéPlus petitPlus grandPlus de smileys !
Cliquez pour épingler Cliquez pour détacher Cliquez pour fermer
Alignement de l'image: Redimensionnement de l'image (en pixel):
Afficher la liste des membres
:bow: :cool: :good: :love: ^^
:omg: :fusil: :aie: :argh: :mdr:
:boulet2: :thx: :champ: :whistle: :bounce:
valider
 :)  ;)  :D  :p
 :lol:  8)  :(  :@
 0_0  :oops:  :grr:  :E
 :O  :sry:  :mmm:  :waza:
 :'(  :here:  ^^  >:)

Σ π θ ± α β γ δ Δ σ λ
Veuillez donner la réponse en chiffre
Vous devez activer le Javascript dans votre navigateur pour pouvoir valider ce formulaire.

Si vous n'avez pas volontairement désactivé cette fonctionnalité de votre navigateur, il s'agit probablement d'un bug : contactez l'équipe de Planète Casio.

Planète Casio v4.3 © créé par Neuronix et Muelsaco 2004 - 2024 | Il y a 49 connectés | Nous contacter | Qui sommes-nous ? | Licences et remerciements

Planète Casio est un site communautaire non affilié à Casio. Toute reproduction de Planète Casio, même partielle, est interdite.
Les programmes et autres publications présentes sur Planète Casio restent la propriété de leurs auteurs et peuvent être soumis à des licences ou copyrights.
CASIO est une marque déposée par CASIO Computer Co., Ltd