;
; Jaguar Example Source Code
; Jaguar Workshop Series #12
; Copyright (c)1994 Atari Corp.
; ALL RIGHTS RESERVED
;
; Program: jr.cof       - Blitter Bitmap Rotation
;  Module: jr_grot.s    - Rotate a bitmap by setting the Blitter with the GPU
;
; Revision History:
;  7/27/94 - SDS: Brought over from Eric S's JAGROT code.
;  8/02/94 - SDS: Module first working that will take any src/dest bitmap.
;
;----------------------------------------------------------------------------
; Parameters are passed in hi ram, as follows:
;
; NUMPLANES     (G_PARM1)       Number of planes for source and destination.
; G_PARM2       (G_PARM2)       Angle of rotation (2048 = 360 degrees).
; G_PARM3       (G_PARM3)       Address of source bitmap.
; G_PARM4       (G_PARM4)       Width of source bitmap in pixels.
; G_PARM5       (G_PARM5)       Height of source bitmap in pixels.
; G_PARM6       (G_PARM6)       Width 'field' of src bitmap for BLiTTer flags
; G_PARM7       (G_PARM7)       Address of dest bitmap (must be large enough)
; G_PARM8       (G_PARM8)       X coordinate of *center* of destination
; G_PARM9       (G_PARM9)       Y coordinate of *center* of destination
; G_PARM10      (G_PARM10)      Width 'field' of destination bitmap
;
; Note: No clipping is performed on the destination image; it is assumed to
; fit on the destination raster area.

		.include        "jaguar.inc"
		.include        "jr.inc"

		.globl          LoadAndGoGPU

		.extern         JAGPIC
		.extern         BlitClear

		.68000
		.text

LoadAndGoGPU:
		movem.l	d0/a0-a1,-(sp)

		clr.l	rotval
		move.l  #gpu_bmprot_end,d0
		sub.l   #gpu_bmprot,d0
		lsr.l   #2,d0                   ; Bytes -> Words

		lea     gpu_bmprot,a0           ; Src
		lea     GPU_PROGSTART,a1        ; Dest
.loop:
		move.l  (a0)+,(a1)+             ; Copy code to GPU
		dbra    d0,.loop

.loop2:
		btst    #0,G_CTRL               ; Wait for GPU to be idle
		bne     .loop2

		move.l  #PIXEL16,NUMPLANES      ; Bit Depth
		move.l  rotval,ANGLEVAL         ; Rotation amount (0-2047)

		move.l  #JAGPIC,SRCADDR         ; Source Bitmap Address
		move.l  #64,SRCWIDTH            ; Width of bitmap
		move.l  #64,SRCHEIGHT           ; Height of bitmap
		move.l  #WID64,SRCWIDFLD        ; Width code for blitter

		move.l  #BMP_ADDR,DESTADDR      ; Destination Bitmap Address
		move.l  #BMP_WIDTH/2,DESTXCNTR  ; X Center of Destination
		move.l  #BMP_HEIGHT/2,DESTYCNTR ; Y Center of Destination
		move.l  #WID320,DESTWIDFLD      ; Width code for blitter

		move.l  #GPU_PROGSTART,G_PC     ; Set GPU PC
		move.l  #1,G_CTRL               ; Start GPU

		add.l   #1,rotval               ; Increment rotation
		andi.l  #$7FF,rotval            ; Force range of 0-2048

		bra     .loop2                  ; do it over and over

		movem.l (sp)+,d0/a0-a1
		rts     

		.bss

rotval:        .ds.l   1

		.text
	    
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

gpu_bmprot:
		.gpu
	
dest_x          .equr   r0      ; X coordinate of center of destination
dest_y          .equr   r1      ; Y coordinate of center of destination
xstep           .equr   r2      ; X increment for blitting
ystep           .equr   r3      ; Y increment for blitting
src_x           .equr   r4      ; X and Y coordinates of upper left hand corner of destination
src_y           .equr   r5
dest_w          .equr   r6      ; width of destination rectangle
dest_h          .equr   r7      ; height of destination rectangle
	
pi2             .equr   r10     ; pi/2 (a constant)
sintbl          .equr   r11     ; pointer to trig table
asin            .equr   r12     ; sin in 0.14 format
acos            .equr   r13     ; cos in 0.14 format
half            .equr   r14     ; 1/2 in 16.16 fixed point
index           .equr   r15     ; for index into sine table
mod             .equr   r16     ; to hold remainder of /16
mod16           .equr   r17     ; mask value for above
mask            .equr   r18     ; For AND #$1FC
masklow         .equr   r19     ; $FFFF masks off low WORD

xtemp           .equr   r20     ; various temporary registers
ytemp           .equr   r21
temp            .equr   r22
temp2           .equr   r23

flags           .equr   r24     ; used to build blitter flags

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

		.org    G_RAM

		moveq   #1,pi2
		shlq    #9,pi2          ; pi/2 = 512 in our measurement system

		movei   #$8000,half     ; 1/2 in 16.16 format
		moveq   #$F,mod16       ; for obtaining % 16
		movei   #$1FC,mask      ; to mask valid sine table indexes
		movei   #$FFFF,masklow

		movei   #ANGLEVAL,temp
		movei   #ROM_SINE,sintbl        ; ROM SINE Table in Jerry

		load    (temp),index    ; get angle of rotation
		move    index,mod

		shrq    #4,index        ; Make 0-2048 -> 0-128
		shlq    #2,index        ; Make 0-128.b 0-128.l

; The SINE table in Jerry is laid out a little funny. There are 128 32-bit
; entries but only the low 16 bits are signifigant and are simply sign-extended.
;
; The first entry is the first entry _above_ zero and the last entry is for
; zero. Our example rewinds one entry to compensate for this. Every time the
; index is modified it is mask'ed off to cause the pointer to wrap within
; the table.
;
; Since 128 entries isn't enough to do rotation with more than 2 or 3 degree
; accuracy we interpolate values through averaging to come closer.
;
		subq    #4,index        ; To account for sine table layout.
		and     mask,index

		and     mod16,mod       ; Get angle % 16
		
		load    (index+sintbl),asin

		addq    #4,index        ; Get next entry for interpolation
		and     mask,index

		load    (index+sintbl),temp2
		sub     asin,temp2
		sharq   #4,temp2        ; 1/16 of difference of both entries

		imult   mod,temp2       ; remainder * 1/16 = offset
		add     temp2,asin      ; offset + value1 = interpolated value
		sharq   #1,asin         ; make it a 0.14 value

;;; Now do cosine (cos x = sin x+90)    

		movei   #124,temp       ; (sintbl length)/4 - 4 
		add     temp,index
		and     mask,index      ; New index

		load    (index+sintbl),acos

		addq    #4,index
		and     mask,index

		load    (index+sintbl),temp2
		sub     acos,temp2
		sharq   #4,temp2

		imult   mod,temp2
		add     temp2,acos
		sharq   #1,acos

		movei   #DESTXCNTR,temp ; Center of destination rectangle
		load    (temp),dest_x
		movei   #DESTYCNTR,temp 
		load    (temp),dest_y

		movei   #SRCWIDTH,temp  ; Rectangle extents
		load    (temp),dest_w
		movei   #SRCHEIGHT,temp
		load    (temp),dest_h

; New Height = h*abs(cos) + w*abs(sin)

		move    acos,xtemp
		move    asin,ytemp
		abs     xtemp
		abs     ytemp

		imultn  dest_h,xtemp
		imacn   dest_w,ytemp
		resmac  temp2

; New Width - w*abs(cos) + h*abs(sin)

		imultn  dest_h,ytemp
		imacn   dest_w,xtemp
		resmac  temp

		move    temp2,dest_h
		move    temp,dest_w

		move    half,temp       ; Round off the width and height
		shrq    #2,temp         ; Convert down to 0.14 format
		add     temp,dest_h
		add     temp,dest_w
		sharq   #14,dest_h      ; Convert to integers
		sharq   #14,dest_w

		move    dest_w,xtemp
		move    dest_h,ytemp
		sharq   #1,xtemp
		sharq   #1,ytemp
		sub     xtemp,dest_x    ; upper left corner of the dest. rectangle
		sub     ytemp,dest_y

; Now transform the destination bounding rectangle back to the
; source bitmap. The inverse transformation is:
;       x' = x*cos + y*sin
;       y' = -x*sin + y*cos

		imultn  xtemp,acos
		imacn   ytemp,asin
		resmac  src_x
		shlq    #2,src_x

		neg     xtemp
		imultn  xtemp,asin
		imacn   ytemp,acos
		resmac  src_y
		shlq    #2,src_y
	
		neg     src_x
		neg     src_y

		movei   #SRCWIDTH,temp  ; Width of source bitmap
		load    (temp),xtemp
		shrq    #1,xtemp        ; Divide by 2
		shlq    #16,xtemp       ; Make 16.16 format

		movei   #SRCHEIGHT,temp
		load    (temp),ytemp
		shrq    #1,ytemp        ; Divide by 2
		shlq    #16,ytemp       ; Make 16.16 format

		add     xtemp,src_x     ; Translate to upper left of destination
		add     ytemp,src_y

; wait for the blitter to become free
		movei   #B_CMD,temp2
wloop:
		load    (temp2),temp
		btst    #0,temp
		jr      EQ,wloop
		nop

; now set up the blitter registers
; window A1 is the source
; window A2 is the destination

		movei   #SRCADDR,temp   
		load    (temp),temp
		movei   #A1_BASE,temp2
		store   temp,(temp2)
		
		movei   #PITCH1|XADDINC,flags

		movei   #NUMPLANES,temp         ; PIXEL1, PIXEL16, etc...
		load    (temp),temp
		or      temp,flags

		movei   #SRCWIDFLD,temp         ; WID64, WID192, etc...
		load    (temp),temp
		or      temp,flags

		movei   #A1_FLAGS,temp
		store   flags,(temp)

		move    src_y,temp              ; Integer part of starting pixel
		shrq    #16,temp
		shlq    #16,temp
		move    src_x,temp2
		shrq    #16,temp2
		or      temp2,temp
		movei   #A1_PIXEL,temp2
		store   temp,(temp2)

		move    src_y,temp              ; Fractional part of pixel
		shlq    #16,temp
		move    src_x,temp2
		and     masklow,temp2
		or      temp2,temp
		movei   #A1_FPIXEL,temp2
		store   temp,(temp2)

		move    asin,temp               ; Increment: 
		neg     temp                    ; X increment is cos, Y is -sin
		shrq    #14,temp                ; 0.14 to 16 bit integer
		and     masklow,temp
		shlq    #16,temp
		move    acos,temp2
		shrq    #14,temp2
		and     masklow,temp2
		or      temp2,temp
		movei   #A1_INC,temp2
		store   temp,(temp2)

		move    asin,temp               ; Fractional Increment
		neg     temp
		shlq    #18,temp
		move    acos,temp2
		shlq    #2,temp2                ; 0.14 to 0.16 fixed point
		and     masklow,temp2
		or      temp2,temp
		movei   #A1_FINC,temp2
		store   temp,(temp2)

;
; To step between lines, we must step back by "width" times the x and y increments, and
; then forward by sin (in the x direction) and cos (in the y direction).
;
		move    dest_w,xstep
		move    dest_w,ystep
		neg     xstep
		imult   acos,xstep              ;(1) Note: changing the order to 2,1,4,3,5,6 
		imult   asin,ystep              ;(2)       would avoid all together 2 
		add     asin,xstep              ;(3)       wait cycles here
		add     acos,ystep              ;(4)
		shlq    #2,xstep                ;(5) cos and sin are /4, remember?
		shlq    #2,ystep                ;(6)

		move    ystep,temp
		shrq    #16,temp                
		shlq    #16,temp                ;(1) Note: If 1 and 2 would be swapped
		move    xstep,temp2             ;(2)       this avoids 1 wait cycle
		shrq    #16,temp2
		or      temp2,temp
		movei   #A1_STEP,temp2
		store   temp,(temp2)

		move    ystep,temp
		shlq    #16,temp
		move    xstep,temp2
		shlq    #16,temp2
		shrq    #16,temp2
		or      temp2,temp
		movei   #A1_FSTEP,temp2
		store   temp,(temp2)

		movei   #SRCHEIGHT,temp
		load    (temp),temp
		shlq    #16,temp

		movei   #SRCWIDTH,temp2
		load    (temp2),temp2
		or      temp2,temp
		
		movei   #A1_CLIP,temp2          ; Clip Window is source window extent
		store   temp,(temp2)

; Set up destination pointers

		movei   #DESTADDR,temp          ; Address of destination bitmap
		load    (temp),temp
		movei   #A2_BASE,temp2
		store   temp,(temp2)

		movei   #PITCH1|XADDPIX,flags   ; Contiguous Data/Pixel Mode

		movei   #NUMPLANES,temp         ; OR with plane layout
		load    (temp),temp
		or      temp,flags

		movei   #DESTWIDFLD,temp        ; OR with Blitter width code
		load    (temp),temp
		or      temp,flags

		movei   #A2_FLAGS,temp          
		store   flags,(temp)

		moveq   #1,temp                 ; Step is X = 1, Y = -Width
		shlq    #16,temp
		move    dest_w,temp2
		neg     temp2
		shlq    #16,temp2
		shrq    #16,temp2
		or      temp2,temp
		movei   #A2_STEP,temp2
		store   temp,(temp2)

		move    dest_y,temp             ; Pixel offset into destination buffer
		shlq    #16,temp
		or      dest_x,temp
		movei   #A2_PIXEL,temp2
		store   temp,(temp2)

		move    dest_h,temp             ; Setup Inner/Outer Loop Counts
		shlq    #16,temp
		or      dest_w,temp
		movei   #B_COUNT,temp2
		store   temp,(temp2)

;;;;; Engage...

		movei   #CLIP_A1|SRCEN|UPDA1F|UPDA1|UPDA2|DSTA2|LFU_REPLACE,temp
		movei   #B_CMD,temp2
		store   temp,(temp2)
;
; now turn ourselves off
;
		movei   #G_CTRL,temp2
		moveq   #0,temp
		store   temp,(temp2)
		nop

		.68000
gpu_bmprot_end:
		.end
