From d1d5175548be0b812212eb9919d77c11b905bbf7 Mon Sep 17 00:00:00 2001 From: Nik Henson Date: Mon, 16 Jan 2012 23:21:14 +0000 Subject: [PATCH] New multi-threaded rendering changes that parallelise graphics rendering and PPC execution in order to increase performance on multi-core machines. New gpuMultiThreaded config option to enable/disable multi-threaded rendering (enabled by default, disabling it reverts to previous behaviour). Other rendering optimisations: - texture uploads now only affect appropriate region in the texture sheet, rather than uploading the whole sheet each time - performance of clearing the model caches has been improved New Alt+O key input added to toggle outputting of frame timings for debugging purposes. --- Src/Graphics/Models.cpp | 9 +- Src/Graphics/Render2D.cpp | 54 +----- Src/Graphics/Render2D.h | 6 +- Src/Inputs/Inputs.cpp | 3 +- Src/Inputs/Inputs.h | 3 +- Src/Model3/Model3.cpp | 396 +++++++++++++++++++++++++++++--------- Src/Model3/Model3.h | 51 ++++- Src/Model3/Real3D.cpp | 215 ++++++++++++++++++--- Src/Model3/Real3D.h | 98 ++++++++-- Src/Model3/TileGen.cpp | 207 ++++++++++++++++++-- Src/Model3/TileGen.h | 57 +++++- Src/OSD/SDL/Main.cpp | 26 ++- 12 files changed, 881 insertions(+), 244 deletions(-) diff --git a/Src/Graphics/Models.cpp b/Src/Graphics/Models.cpp index b4ec0f6..fffa5d2 100644 --- a/Src/Graphics/Models.cpp +++ b/Src/Graphics/Models.cpp @@ -834,13 +834,8 @@ void CRender3D::ClearModelCache(ModelCache *Cache) Cache->vboCurOffset = 0; for (int i = 0; i < 2; i++) Cache->curVertIdx[i] = 0; - if (!Cache->dynamic) - memset(Cache->lut, 0xFF, sizeof(INT16)*Cache->lutSize); // set all to -1 - else - { - for (int i = 0; i < Cache->numModels; i++) - Cache->lut[Cache->Models[i].lutIdx] = -1; - } + for (int i = 0; i < Cache->numModels; i++) + Cache->lut[Cache->Models[i].lutIdx] = -1; Cache->numModels = 0; ClearDisplayList(Cache); diff --git a/Src/Graphics/Render2D.cpp b/Src/Graphics/Render2D.cpp index cb1fbfd..c8b117b 100644 --- 
a/Src/Graphics/Render2D.cpp +++ b/Src/Graphics/Render2D.cpp @@ -737,24 +737,7 @@ void CRender2D::EndFrame(void) Emulation Callbacks ******************************************************************************/ -void CRender2D::WritePalette(unsigned color, UINT32 data) -{ - UINT8 r, g, b, a; - - a = 0xFF * ((data>>15)&1); // decode the RGBA (make alpha 0xFF or 0x00) - a = ~a; // invert it (set on Model 3 means clear pixel) - - if ((data&0x8000)) - r = g = b = 0; - else - { - b = (data>>7)&0xF8; - g = (data>>2)&0xF8; - r = (data<<3)&0xF8; - } - - pal[color] = (a<<24)|(b<<16)|(g<<8)|r; -} + void CRender2D::WriteVRAM(unsigned addr, UINT32 data) { @@ -763,31 +746,6 @@ void CRender2D::WriteVRAM(unsigned addr, UINT32 data) // For now, mark everything as dirty allDirty = true; - - // Palette - if (addr >= 0x100000) - { - unsigned color = (addr-0x100000)/4; // color index - WritePalette(color, data); - } -} - -/* - * InitPalette(): - * - * This must be called from AttachVRAM() to initialize the palette. The reason - * is that because WriteVRAM() always compares incoming data to what is already - * in the VRAM, there is no actual way to initialize the palette by calling - * WriteVRAM() and passing it the initial VRAM contents. It will always fail to - * update because nothing is being changed. - * - * This function fixes the transparent pixel bug that frequently occurred when - * loading save states in Supermodel 0.1a. 
- */ -void CRender2D::InitPalette(void) -{ - for (int i = 0; i < 0x20000/4; i++) - WritePalette(i, vram[0x100000/4 + i]); } @@ -801,14 +759,19 @@ void CRender2D::AttachRegisters(const UINT32 *regPtr) DebugLog("Render2D attached registers\n"); } +void CRender2D::AttachPalette(const UINT32 *palPtr) +{ + pal = palPtr; + DebugLog("Render2D attached palette\n"); +} + void CRender2D::AttachVRAM(const UINT8 *vramPtr) { vram = (UINT32 *) vramPtr; - InitPalette(); DebugLog("Render2D attached VRAM\n"); } -#define MEMORY_POOL_SIZE (512*512*4+0x20000) +#define MEMORY_POOL_SIZE (512*512*4) bool CRender2D::Init(unsigned xOffset, unsigned yOffset, unsigned xRes, unsigned yRes) { @@ -832,7 +795,6 @@ bool CRender2D::Init(unsigned xOffset, unsigned yOffset, unsigned xRes, unsigned // Set up pointers to memory regions surf = (UINT32 *) memoryPool; - pal = (UINT32 *) &memoryPool[512*512*4]; // Resolution xPixels = xRes; diff --git a/Src/Graphics/Render2D.h b/Src/Graphics/Render2D.h index 44056ed..1ff571b 100644 --- a/Src/Graphics/Render2D.h +++ b/Src/Graphics/Render2D.h @@ -91,6 +91,8 @@ public: */ void AttachRegisters(const UINT32 *regPtr); + void AttachPalette(const UINT32 *palPtr); + /* * AttachVRAM(vramPtr): * @@ -145,11 +147,10 @@ private: void DisplayLayer(int layerNum, GLfloat z); void Setup2D(void); void ColorOffset(GLfloat colorOffset[3], UINT32 reg); - void WritePalette(unsigned color, UINT32 data); - void InitPalette(void); // Data received from tile generator device object const UINT32 *vram; + const UINT32 *pal; const UINT32 *regs; // OpenGL data @@ -172,7 +173,6 @@ private: // Buffers UINT8 *memoryPool; // all memory is allocated here UINT32 *surf; // 512x512x32bpp pixel surface - UINT32 *pal; // 0x20000 byte (32K colors) palette }; diff --git a/Src/Inputs/Inputs.cpp b/Src/Inputs/Inputs.cpp index 88c810b..274af10 100644 --- a/Src/Inputs/Inputs.cpp +++ b/Src/Inputs/Inputs.cpp @@ -50,10 +50,11 @@ CInputs::CInputs(CInputSystem *system) : m_system(system) uiMusicVolDown = 
AddSwitchInput("UIMusicVolDown", "Decrease Music Volume", GAME_INPUT_UI, "KEY_F9"); uiSoundVolUp = AddSwitchInput("UISoundVolUp", "Increase Sound Volume", GAME_INPUT_UI, "KEY_F12"); uiSoundVolDown = AddSwitchInput("UISoundVolDown", "Decrease Sound Volume", GAME_INPUT_UI, "KEY_F11"); - uiDumpInpState = AddSwitchInput("UIDumpInputState", "Dump Input State", GAME_INPUT_UI, "NONE"); // disabled for release uiClearNVRAM = AddSwitchInput("UIClearNVRAM", "Clear NVRAM", GAME_INPUT_UI, "KEY_ALT+KEY_N"); uiSelectCrosshairs = AddSwitchInput("UISelectCrosshairs", "Select Crosshairs", GAME_INPUT_UI, "KEY_ALT+KEY_I"); uiToggleFrLimit = AddSwitchInput("UIToggleFrameLimit", "Toggle Frame Limiting", GAME_INPUT_UI, "KEY_ALT+KEY_T"); + uiDumpInpState = AddSwitchInput("UIDumpInputState", "Dump Input State", GAME_INPUT_UI, "KEY_ALT+KEY_U"); + uiDumpTimings = AddSwitchInput("UIDumpTimings", "Dump Frame Timings", GAME_INPUT_UI, "KEY_ALT+KEY_O"); #ifdef SUPERMODEL_DEBUGGER uiEnterDebugger = AddSwitchInput("UIEnterDebugger", "Enter Debugger", GAME_INPUT_UI, "KEY_ALT+KEY_B"); #endif diff --git a/Src/Inputs/Inputs.h b/Src/Inputs/Inputs.h index 7a1e754..c07f2eb 100644 --- a/Src/Inputs/Inputs.h +++ b/Src/Inputs/Inputs.h @@ -101,10 +101,11 @@ public: CSwitchInput *uiMusicVolDown; CSwitchInput *uiSoundVolUp; CSwitchInput *uiSoundVolDown; - CSwitchInput *uiDumpInpState; CSwitchInput *uiClearNVRAM; CSwitchInput *uiSelectCrosshairs; CSwitchInput *uiToggleFrLimit; + CSwitchInput *uiDumpInpState; + CSwitchInput *uiDumpTimings; #ifdef SUPERMODEL_DEBUGGER CSwitchInput *uiEnterDebugger; #endif diff --git a/Src/Model3/Model3.cpp b/Src/Model3/Model3.cpp index d9ad31c..d8059b2 100644 --- a/Src/Model3/Model3.cpp +++ b/Src/Model3/Model3.cpp @@ -1910,6 +1910,8 @@ void CModel3::ClearNVRAM(void) void CModel3::RunFrame(void) { + UINT32 start = CThread::GetTicks(); + // See if currently running multi-threaded if (g_Config.multiThreaded) { @@ -1917,39 +1919,60 @@ void CModel3::RunFrame(void) if (!StartThreads()) 
goto ThreadError; - // Wake threads for sound board (if sync'd) and drive board (if attached) so they can process a frame - if (syncSndBrdThread && !sndBrdThreadSync->Post() || DriveBoard.IsAttached() && !drvBrdThreadSync->Post()) + // Wake threads for PPC main board (if multi-threading GPU), sound board (if sync'd) and drive board (if attached) so they can process a frame + if (g_Config.gpuMultiThreaded && !ppcBrdThreadSync->Post() || + syncSndBrdThread && !sndBrdThreadSync->Post() || + DriveBoard.IsAttached() && !drvBrdThreadSync->Post()) goto ThreadError; - // At the same time, process a single frame for main board (PPC) in this thread - RunMainBoardFrame(); + // If not multi-threading GPU, then run PPC main board for a frame and sync GPUs now in this thread + if (!g_Config.gpuMultiThreaded) + { + RunMainBoardFrame(); + SyncGPUs(); + } + + // Render frame if ready to do so + if (gpusReady) + RenderFrame(); // Enter notify wait critical section if (!notifyLock->Lock()) goto ThreadError; - // Wait for sound board and drive board threads to finish their work (if they haven't done so already) - while (syncSndBrdThread && !sndBrdThreadDone || DriveBoard.IsAttached() && !drvBrdThreadDone) + // Wait for PPC main board, sound board and drive board threads to finish their work (if they are running and haven't finished already) + while (g_Config.gpuMultiThreaded && !ppcBrdThreadDone || + syncSndBrdThread && !sndBrdThreadDone || + DriveBoard.IsAttached() && !drvBrdThreadDone) { if (!notifySync->Wait(notifyLock)) goto ThreadError; } + ppcBrdThreadDone = false; sndBrdThreadDone = false; drvBrdThreadDone = false; // Leave notify wait critical section if (!notifyLock->Unlock()) goto ThreadError; + + // If multi-threading GPU, then sync GPUs last while PPC main board thread is waiting + if (g_Config.gpuMultiThreaded) + SyncGPUs(); } else { - // If not multi-threaded, then just process a single frame for main board, sound board and drive board in turn in this thread + // If not 
multi-threaded, then just process and render a single frame for PPC main board, sound board and drive board in turn in this thread RunMainBoardFrame(); - SoundBoard.RunFrame(); + SyncGPUs(); + RenderFrame(); + RunSoundBoardFrame(); if (DriveBoard.IsAttached()) - DriveBoard.RunFrame(); + RunDriveBoardFrame(); } - + + frameTicks = CThread::GetTicks() - start; + return; ThreadError: @@ -1957,13 +1980,129 @@ ThreadError: g_Config.multiThreaded = false; } +void CModel3::RunMainBoardFrame(void) +{ + UINT32 start = CThread::GetTicks(); + + // Compute display and VBlank timings + unsigned frameCycles = g_Config.GetPowerPCFrequency()*1000000/60; + unsigned vblCycles = (unsigned) ((float) frameCycles * 2.5f/100.0f); // 2.5% vblank (ridiculously short and wrong but bigger values cause flicker in Daytona) + unsigned dispCycles = frameCycles - vblCycles; + + // VBlank + if (gpusReady) + { + TileGen.BeginVBlank(); + GPU.BeginVBlank(); + IRQ.Assert(0x02); + ppc_execute(vblCycles); + //printf("PC=%08X LR=%08X\n", ppc_get_pc(), ppc_get_lr()); + + /* + * Sound: + * + * Bit 0x20 of the MIDI control port appears to enable periodic interrupts, + * which are used to send MIDI commands. Often games will write 0x27, send + * a series of commands, and write 0x06 to stop. Other games, like Star + * Wars Trilogy and Sega Rally 2, will enable interrupts at the beginning + * by writing 0x37 and will disable/enable interrupts to control command + * output. 
+ */ + //printf("\t-- BEGIN (Ctrl=%02X, IRQEn=%02X, IRQPend=%02X) --\n", midiCtrlPort, IRQ.ReadIRQEnable()&0x40, IRQ.ReadIRQState()); + int irqCount = 0; + while ((midiCtrlPort&0x20)) + //while (midiCtrlPort == 0x27) // 27 triggers IRQ sequence, 06 stops it + { + // Don't waste time firing MIDI interrupts if game has disabled them + if ((IRQ.ReadIRQEnable()&0x40) == 0) + break; + + // Process MIDI interrupt + IRQ.Assert(0x40); + ppc_execute(200); // give PowerPC time to acknowledge IRQ + IRQ.Deassert(0x40); + ppc_execute(200); // acknowledge that IRQ was deasserted (TODO: is this really needed?) + + ++irqCount; + if (irqCount > 128) + { + //printf("\tMIDI FIFO OVERFLOW! (IRQEn=%02X, IRQPend=%02X)\n", IRQ.ReadIRQEnable()&0x40, IRQ.ReadIRQState()); + break; + } + } + //printf("\t-- END --\n"); + //printf("PC=%08X LR=%08X\n", ppc_get_pc(), ppc_get_lr()); + + // End VBlank + GPU.EndVBlank(); + TileGen.EndVBlank(); + IRQ.Assert(0x0D); + } + + // Run the PowerPC for the active display part of the frame + ppc_execute(dispCycles); + //printf("PC=%08X LR=%08X\n", ppc_get_pc(), ppc_get_lr()); + + ppcTicks = CThread::GetTicks() - start; +} + +void CModel3::SyncGPUs(void) +{ + UINT32 start = CThread::GetTicks(); + + syncSize = GPU.SyncSnapshots() + TileGen.SyncSnapshots(); + gpusReady = true; + + syncTicks = CThread::GetTicks() - start; +} + +void CModel3::RenderFrame(void) +{ + UINT32 start = CThread::GetTicks(); + + // Render frame + TileGen.BeginFrame(); + GPU.BeginFrame(); + GPU.RenderFrame(); + GPU.EndFrame(); + TileGen.EndFrame(); + + renderTicks = CThread::GetTicks() - start; +} + +bool CModel3::RunSoundBoardFrame(void) +{ + UINT32 start = CThread::GetTicks(); + + bool bufferFull = SoundBoard.RunFrame(); + + sndTicks = CThread::GetTicks() - start; + + return bufferFull; +} + +void CModel3::RunDriveBoardFrame(void) +{ + UINT32 start = CThread::GetTicks(); + + DriveBoard.RunFrame(); + + drvTicks = CThread::GetTicks() - start; +} + bool CModel3::StartThreads(void) { if 
(startedThreads) return true; // Create synchronization objects - sndBrdThreadSync = CThread::CreateSemaphore(1); + if (g_Config.gpuMultiThreaded) + { + ppcBrdThreadSync = CThread::CreateSemaphore(0); + if (ppcBrdThreadSync == NULL) + goto ThreadError; + } + sndBrdThreadSync = CThread::CreateSemaphore(0); if (sndBrdThreadSync == NULL) goto ThreadError; sndBrdNotifyLock = CThread::CreateMutex(); @@ -1974,7 +2113,7 @@ bool CModel3::StartThreads(void) goto ThreadError; if (DriveBoard.IsAttached()) { - drvBrdThreadSync = CThread::CreateSemaphore(1); + drvBrdThreadSync = CThread::CreateSemaphore(0); if (drvBrdThreadSync == NULL) goto ThreadError; } @@ -1985,6 +2124,14 @@ bool CModel3::StartThreads(void) if (notifySync == NULL) goto ThreadError; + // Create PPC main board thread, if multi-threading GPU + if (g_Config.gpuMultiThreaded) + { + ppcBrdThread = CThread::CreateThread(StartMainBoardThread, this); + if (ppcBrdThread == NULL) + goto ThreadError; + } + // Create sound board thread (sync'd or unsync'd) if (syncSndBrdThread) sndBrdThread = CThread::CreateThread(StartSoundBoardThreadSyncd, this); @@ -1993,15 +2140,15 @@ bool CModel3::StartThreads(void) if (sndBrdThread == NULL) goto ThreadError; - // Create drive board thread (sync'd), if drive board is attached + // Create drive board thread, if drive board is attached if (DriveBoard.IsAttached()) { - drvBrdThread = CThread::CreateThread(StartDriveBoardThreadSyncd, this); + drvBrdThread = CThread::CreateThread(StartDriveBoardThread, this); if (drvBrdThread == NULL) goto ThreadError; } - // Set audio callback if unsync'd + // Set audio callback if sound board thread is unsync'd if (!syncSndBrdThread) SetAudioCallback(AudioCallback, this); @@ -2026,7 +2173,7 @@ bool CModel3::PauseThreads(void) // Wait for all threads to finish their processing pausedThreads = true; - while (sndBrdThreadRunning || drvBrdThreadRunning) + while (ppcBrdThreadRunning || sndBrdThreadRunning || drvBrdThreadRunning) { if 
(!notifySync->Wait(notifyLock)) goto ThreadError; @@ -2043,11 +2190,27 @@ ThreadError: return false; } -void CModel3::ResumeThreads(void) +bool CModel3::ResumeThreads(void) { - // No need to use any locking here + if (!startedThreads) + return true; + + // Enter notify critical section + if (!notifyLock->Lock()) + goto ThreadError; + + // Let all threads know that they can continue running pausedThreads = false; - return; + + // Leave notify critical section + if (!notifyLock->Unlock()) + goto ThreadError; + return true; + +ThreadError: + ErrorLog("Threading error in CModel3::ResumeThreads: %s\nSwitching back to single-threaded mode.\n", CThread::GetLastError()); + g_Config.multiThreaded = false; + return false; } void CModel3::StopThreads(void) @@ -2055,7 +2218,7 @@ void CModel3::StopThreads(void) if (!startedThreads) return; - // If sound board not sync'd then remove callback + // If sound board thread is unsync'd then remove audio callback if (!syncSndBrdThread) SetAudioCallback(NULL, NULL); @@ -2068,8 +2231,13 @@ void CModel3::StopThreads(void) void CModel3::DeleteThreadObjects(void) { - // Delete (which in turn kills) sound board and drive board threads + // Delete (which in turn kills) PPC main board, sound board and drive board threads // Note that can do so here safely because threads will always be waiting on their semaphores when this method is called + if (ppcBrdThread != NULL) + { + delete ppcBrdThread; + ppcBrdThread = NULL; + } if (sndBrdThread != NULL) { delete sndBrdThread; @@ -2082,6 +2250,11 @@ void CModel3::DeleteThreadObjects(void) } // Delete synchronization objects + if (ppcBrdThreadSync != NULL) + { + delete ppcBrdThreadSync; + ppcBrdThreadSync = NULL; + } if (sndBrdThreadSync != NULL) { delete sndBrdThreadSync; @@ -2114,9 +2287,28 @@ void CModel3::DeleteThreadObjects(void) } } +void CModel3::DumpTimings(void) +{ + printf("PPC:%3ums%c render:%3ums%c sync:%4uK%c%3ums%c snd:%3ums%c drv:%3ums%c frame:%3ums%c\n", + ppcTicks, (ppcTicks > 
renderTicks ? '!' : ','), + renderTicks, (renderTicks > ppcTicks ? '!' : ','), + syncSize / 1024, (syncSize / 1024 > 128 ? '!' : ','), syncTicks, (syncTicks > 1 ? '!' : ','), + sndTicks, (sndTicks > 10 ? '!' : ','), + drvTicks, (drvTicks > 10 ? '!' : ','), + frameTicks, (frameTicks > 16 ? '!' : ' ')); +} + +int CModel3::StartMainBoardThread(void *data) +{ + // Call method on CModel3 to run PPC main board thread + CModel3 *model3 = (CModel3*)data; + model3->RunMainBoardThread(); + return 0; +} + int CModel3::StartSoundBoardThread(void *data) { - // Call method on CModel3 to run unsync'd sound board thread + // Call method on CModel3 to run sound board thread (unsync'd) CModel3 *model3 = (CModel3*)data; model3->RunSoundBoardThread(); return 0; @@ -2124,20 +2316,70 @@ int CModel3::StartSoundBoardThread(void *data) int CModel3::StartSoundBoardThreadSyncd(void *data) { - // Call method on CModel3 to run sync'd sound board thread + // Call method on CModel3 to run sound board thread (sync'd) CModel3 *model3 = (CModel3*)data; model3->RunSoundBoardThreadSyncd(); return 0; } -int CModel3::StartDriveBoardThreadSyncd(void *data) +int CModel3::StartDriveBoardThread(void *data) { - // Call method on CModel3 to run sync'd drive board thread + // Call method on CModel3 to run drive board thread CModel3 *model3 = (CModel3*)data; - model3->RunDriveBoardThreadSyncd(); + model3->RunDriveBoardThread(); return 0; } +void CModel3::RunMainBoardThread(void) +{ + for (;;) + { + bool wait = true; + while (wait) + { + // Wait on PPC main board thread semaphore + if (!ppcBrdThreadSync->Wait()) + goto ThreadError; + + // Enter notify critical section + if (!notifyLock->Lock()) + goto ThreadError; + + // Check threads not paused + if (!pausedThreads) + { + wait = false; + ppcBrdThreadRunning = true; + } + + // Leave notify critical section + if (!notifyLock->Unlock()) + goto ThreadError; + } + + // Process a single frame for PPC main board + RunMainBoardFrame(); + + // Enter notify critical 
section + if (!notifyLock->Lock()) + goto ThreadError; + + // Let other threads know processing has finished + ppcBrdThreadRunning = false; + ppcBrdThreadDone = true; + if (!notifySync->SignalAll()) + goto ThreadError; + + // Leave notify critical section + if (!notifyLock->Unlock()) + goto ThreadError; + } + +ThreadError: + ErrorLog("Threading error in RunMainBoardThread: %s\nSwitching back to single-threaded mode.\n", CThread::GetLastError()); + g_Config.multiThreaded = false; +} + void CModel3::AudioCallback(void *data) { // Call method on CModel3 to wake sound board thread @@ -2151,7 +2393,7 @@ void CModel3::WakeSoundBoardThread(void) if (!sndBrdNotifyLock->Lock()) goto ThreadError; - // Signal to sound board that it should start processing again + // Signal to sound board thread that it should start processing again if (!sndBrdNotifySync->Signal()) goto ThreadError; @@ -2200,11 +2442,22 @@ void CModel3::RunSoundBoardThread(void) goto ThreadError; } - // Keep processing frames until audio buffer is full - bool repeat = true; - // NOTE - performs an unlocked read of pausedThreads here, but this is okay - while (!pausedThreads && !SoundBoard.RunFrame()) + // Keep processing frames until paused or audio buffer is full + while (true) { + // Enter main notify critical section + bool paused; + if (!notifyLock->Lock()) + goto ThreadError; + + paused = pausedThreads; + + // Leave main notify critical section + if (!notifyLock->Unlock()) + goto ThreadError; + + if (paused || RunSoundBoardFrame()) + break; //printf("Rerunning sound board\n"); } @@ -2256,7 +2509,7 @@ void CModel3::RunSoundBoardThreadSyncd(void) } // Process a single frame for sound board - SoundBoard.RunFrame(); + RunSoundBoardFrame(); // Enter notify critical section if (!notifyLock->Lock()) @@ -2278,7 +2531,7 @@ ThreadError: g_Config.multiThreaded = false; } -void CModel3::RunDriveBoardThreadSyncd(void) +void CModel3::RunDriveBoardThread(void) { for (;;) { @@ -2306,7 +2559,7 @@ void 
CModel3::RunDriveBoardThreadSyncd(void) } // Process a single frame for drive board - DriveBoard.RunFrame(); + RunDriveBoardFrame(); // Enter notify critical section if (!notifyLock->Lock()) @@ -2324,70 +2577,10 @@ void CModel3::RunDriveBoardThreadSyncd(void) } ThreadError: - ErrorLog("Threading error in RunDriveBoardThreadSyncd: %s\nSwitching back to single-threaded mode.\n", CThread::GetLastError()); + ErrorLog("Threading error in RunDriveBoardThread: %s\nSwitching back to single-threaded mode.\n", CThread::GetLastError()); g_Config.multiThreaded = false; } -void CModel3::RunMainBoardFrame(void) -{ - // Compute display and VBlank timings - unsigned frameCycles = g_Config.GetPowerPCFrequency()*1000000/60; - unsigned vblCycles = (unsigned) ((float) frameCycles * 2.5f/100.0f); // 2.5% vblank (ridiculously short and wrong but bigger values cause flicker in Daytona) - unsigned dispCycles = frameCycles - vblCycles; - - // Run the PowerPC for the active display part of the frame - ppc_execute(dispCycles); - //printf("PC=%08X LR=%08X\n", ppc_get_pc(), ppc_get_lr()); - - // VBlank - TileGen.BeginFrame(); - GPU.BeginFrame(); - GPU.RenderFrame(); - IRQ.Assert(0x02); - ppc_execute(vblCycles); - //printf("PC=%08X LR=%08X\n", ppc_get_pc(), ppc_get_lr()); - - /* - * Sound: - * - * Bit 0x20 of the MIDI control port appears to enable periodic interrupts, - * which are used to send MIDI commands. Often games will write 0x27, send - * a series of commands, and write 0x06 to stop. Other games, like Star - * Wars Trilogy and Sega Rally 2, will enable interrupts at the beginning - * by writing 0x37 and will disable/enable interrupts to control command - * output. 
- */ - //printf("\t-- BEGIN (Ctrl=%02X, IRQEn=%02X, IRQPend=%02X) --\n", midiCtrlPort, IRQ.ReadIRQEnable()&0x40, IRQ.ReadIRQState()); - int irqCount = 0; - while ((midiCtrlPort&0x20)) - //while (midiCtrlPort == 0x27) // 27 triggers IRQ sequence, 06 stops it - { - // Don't waste time firing MIDI interrupts if game has disabled them - if ((IRQ.ReadIRQEnable()&0x40) == 0) - break; - - // Process MIDI interrupt - IRQ.Assert(0x40); - ppc_execute(200); // give PowerPC time to acknowledge IRQ - IRQ.Deassert(0x40); - ppc_execute(200); // acknowledge that IRQ was deasserted (TODO: is this really needed?) - - ++irqCount; - if (irqCount > 128) - { - //printf("\tMIDI FIFO OVERFLOW! (IRQEn=%02X, IRQPend=%02X)\n", IRQ.ReadIRQEnable()&0x40, IRQ.ReadIRQState()); - break; - } - } - //printf("\t-- END --\n"); - //printf("PC=%08X LR=%08X\n", ppc_get_pc(), ppc_get_lr()); - - // End frame - GPU.EndFrame(); - TileGen.EndFrame(); - IRQ.Assert(0x0D); -} - void CModel3::Reset(void) { // Clear memory (but do not modify backup RAM!) 
@@ -2422,6 +2615,15 @@ void CModel3::Reset(void) if (DriveBoard.IsAttached()) DriveBoard.Reset(); + + gpusReady = false; + ppcTicks = 0; + syncSize = 0; + syncTicks = 0; + renderTicks = 0; + sndTicks = 0; + drvTicks = 0; + frameTicks = 0; DebugLog("Model 3 reset\n"); } @@ -2964,13 +3166,17 @@ CModel3::CModel3(void) startedThreads = false; pausedThreads = false; + ppcBrdThread = NULL; sndBrdThread = NULL; drvBrdThread = NULL; + ppcBrdThreadRunning = false; + ppcBrdThreadDone = false; sndBrdThreadRunning = false; sndBrdThreadDone = false; drvBrdThreadRunning = false; drvBrdThreadDone = false; syncSndBrdThread = false; + ppcBrdThreadSync = NULL; sndBrdThreadSync = NULL; drvBrdThreadSync = NULL; notifyLock = NULL; diff --git a/Src/Model3/Model3.h b/Src/Model3/Model3.h index d214741..cca2292 100644 --- a/Src/Model3/Model3.h +++ b/Src/Model3/Model3.h @@ -28,6 +28,7 @@ #ifndef INCLUDED_MODEL3_H #define INCLUDED_MODEL3_H + /* * CModel3Config: * @@ -36,7 +37,8 @@ class CModel3Config { public: - bool multiThreaded; // Multi-threading (enabled if true) + bool multiThreaded; // Multi-threaded (enabled if true) + bool gpuMultiThreaded; // Multi-threaded rendering (enabled if true) // PowerPC clock frequency in MHz (minimum: 1 MHz) inline void SetPowerPCFrequency(unsigned f) @@ -57,6 +59,7 @@ public: CModel3Config(void) { multiThreaded = true; // enable by default + gpuMultiThreaded = true; // enable by default ppcFrequency = 50*1000000; // 50 MHz } @@ -313,7 +316,14 @@ public: * * Flags that any paused threads should resume running. */ - void ResumeThreads(void); + bool ResumeThreads(void); + + /* + * DumpTimings(void): + * + * Prints all timings for the most recent frame to the console, for debugging purposes. 
+ */ + void DumpTimings(void); /* * CModel3(void): @@ -342,21 +352,28 @@ private: void WriteSystemRegister(unsigned reg, UINT8 data); void Patch(void); - void RunMainBoardFrame(void); // Runs the main board (PPC) for a frame + void RunMainBoardFrame(void); // Runs PPC main board for a frame + void SyncGPUs(void); // Sync's up GPUs in preparation for rendering - must be called when PPC is not running + void RenderFrame(void); // Renders current frame + bool RunSoundBoardFrame(void); // Runs sound board for a frame + void RunDriveBoardFrame(void); // Runs drive board for a frame + bool StartThreads(void); // Starts all threads void StopThreads(void); // Stops all threads void DeleteThreadObjects(void); // Deletes all threads and synchronization objects - static int StartSoundBoardThread(void *data); // Callback to start unsync'd sound board thread - static int StartSoundBoardThreadSyncd(void *data); // Callback to start sync'd sound board thread - static int StartDriveBoardThreadSyncd(void *data); // Callback to start sync'd drive board thread + static int StartMainBoardThread(void *data); // Callback to start PPC main board thread + static int StartSoundBoardThread(void *data); // Callback to start sound board thread (unsync'd) + static int StartSoundBoardThreadSyncd(void *data); // Callback to start sound board thread (sync'd) + static int StartDriveBoardThread(void *data); // Callback to start drive board thread static void AudioCallback(void *data); // Audio buffer callback void WakeSoundBoardThread(void); // Used by audio callback to wake sound board thread when not sync'd with PPC thread - void RunSoundBoardThread(void); // Runs sound board thread unsync'd with PPC thread, ie at full speed - void RunSoundBoardThreadSyncd(void); // Runs sound board thread sync'd in step with PPC thread - void RunDriveBoardThreadSyncd(void); // Runs drive board thread sync'd in step with PPC thread + void RunMainBoardThread(void); // Runs PPC main board thread (sync'd in step 
with render thread) + void RunSoundBoardThread(void); // Runs sound board thread (unsync'd with render thread, ie at full speed) + void RunSoundBoardThreadSyncd(void); // Runs sound board thread (sync'd in step with render thread) + void RunDriveBoardThread(void); // Runs drive board thread (sync'd in step with render thread) // Game and hardware information const struct GameInfo *Game; @@ -397,17 +414,22 @@ private: PPC_FETCH_REGION PPCFetchRegions[3]; // Multiple threading + bool gpusReady; // True if GPUs are ready to render bool startedThreads; // True if threads have been created and started bool pausedThreads; // True if threads are currently paused - bool syncSndBrdThread; // True if sound board thread should be sync'd with PPC thread + bool syncSndBrdThread; // True if sound board thread should be sync'd in step with render thread + CThread *ppcBrdThread; // PPC main board thread CThread *sndBrdThread; // Sound board thread CThread *drvBrdThread; // Drive board thread + bool ppcBrdThreadRunning; // Flag to indicate PPC main board thread is currently processing + bool ppcBrdThreadDone; // Flag to indicate PPC main board thread has finished processing bool sndBrdThreadRunning; // Flag to indicate sound board thread is currently processing bool sndBrdThreadDone; // Flag to indicate sound board thread has finished processing bool drvBrdThreadRunning; // Flag to indicate drive board thread is currently processing bool drvBrdThreadDone; // Flag to indicate drive board thread has finished processing // Thread synchronization objects + CSemaphore *ppcBrdThreadSync; CSemaphore *sndBrdThreadSync; CMutex *sndBrdNotifyLock; CCondVar *sndBrdNotifySync; @@ -427,6 +449,15 @@ private: CSoundBoard SoundBoard; // Sound board CDSB *DSB; // Digital Sound Board (type determined dynamically at load time) CDriveBoard DriveBoard; // Drive board + + // Frame timings + UINT32 ppcTicks; + UINT32 syncSize; + UINT32 syncTicks; + UINT32 renderTicks; + UINT32 sndTicks; + UINT32 drvTicks; 
+ UINT32 frameTicks; }; diff --git a/Src/Model3/Real3D.cpp b/Src/Model3/Real3D.cpp index b3e56bd..ea29d42 100644 --- a/Src/Model3/Real3D.cpp +++ b/Src/Model3/Real3D.cpp @@ -44,14 +44,30 @@ #include #include "Supermodel.h" -// Offsets of memory regions within Real3D memory pool -#define OFFSET_8C 0 // 4 MB, culling RAM low (at 0x8C000000) -#define OFFSET_8E 0x400000 // 1 MB, culling RAM high (at 0x8E000000) -#define OFFSET_98 0x500000 // 4 MB, polygon RAM (at 0x98000000) -#define OFFSET_TEXRAM 0x900000 // 8 MB, texture RAM -#define OFFSET_TEXFIFO 0x1100000 // 1 MB, texture FIFO -#define MEMORY_POOL_SIZE (0x400000+0x100000+0x400000+0x800000+0x100000) +// Macros that divide memory regions into pages and mark them as dirty when they are written to +#define PAGE_WIDTH 12 +#define PAGE_SIZE (1<<PAGE_WIDTH) +#define DIRTY_SIZE(arraySize) (1+(arraySize-1)/(8*PAGE_SIZE)) +#define MARK_DIRTY(dirtyArray, addr) dirtyArray[addr>>(PAGE_WIDTH+3)] |= 1<<((addr>>PAGE_WIDTH)&7) +// Offsets of memory regions within Real3D memory pool +#define OFFSET_8C 0x0000000 // 4 MB, culling RAM low (at 0x8C000000) +#define OFFSET_8E 0x0400000 // 1 MB, culling RAM high (at 0x8E000000) +#define OFFSET_98 0x0500000 // 4 MB, polygon RAM (at 0x98000000) +#define OFFSET_TEXRAM 0x0900000 // 8 MB, texture RAM +#define OFFSET_TEXFIFO 0x1100000 // 1 MB, texture FIFO +#define MEM_POOL_SIZE_RW (0x400000+0x100000+0x400000+0x800000+0x100000) +#define OFFSET_8C_RO 0x1200000 // 4 MB, culling RAM low (at 0x8C000000) [read-only snapshot] +#define OFFSET_8E_RO 0x1600000 // 1 MB, culling RAM high (at 0x8E000000) [read-only snapshot] +#define OFFSET_98_RO 0x1700000 // 4 MB, polygon RAM (at 0x98000000) [read-only snapshot] +#define OFFSET_TEXRAM_RO 0x1B00000 // 8 MB, texture RAM [read-only snapshot] +#define MEM_POOL_SIZE_RO (0x400000+0x100000+0x400000+0x800000) +#define OFFSET_8C_DIRTY 0x2300000 +#define OFFSET_8E_DIRTY (OFFSET_8C_DIRTY+DIRTY_SIZE(0x400000)) +#define OFFSET_98_DIRTY (OFFSET_8E_DIRTY+DIRTY_SIZE(0x100000)) +#define OFFSET_TEXRAM_DIRTY (OFFSET_98_DIRTY+DIRTY_SIZE(0x400000)) +#define MEM_POOL_SIZE_DIRTY 
(DIRTY_SIZE(MEM_POOL_SIZE_RO)) +#define MEMORY_POOL_SIZE (MEM_POOL_SIZE_RW+MEM_POOL_SIZE_RO+MEM_POOL_SIZE_DIRTY) /****************************************************************************** Save States @@ -61,7 +77,7 @@ void CReal3D::SaveState(CBlockFile *SaveState) { SaveState->NewBlock("Real3D", __FILE__); - SaveState->Write(memoryPool, MEMORY_POOL_SIZE); + SaveState->Write(memoryPool, MEM_POOL_SIZE_RW); // Don't write out read-only snapshots or dirty page arrays SaveState->Write(&fifoIdx, sizeof(fifoIdx)); SaveState->Write(&vromTextureAddr, sizeof(vromTextureAddr)); SaveState->Write(&vromTextureHeader, sizeof(vromTextureHeader)); @@ -90,8 +106,11 @@ void CReal3D::LoadState(CBlockFile *SaveState) return; } - SaveState->Read(memoryPool, MEMORY_POOL_SIZE); - Render3D->UploadTextures(0,0,2048,2048); + SaveState->Read(memoryPool, MEM_POOL_SIZE_RW); + // If multi-threaded, update read-only snapshots too + if (g_Config.gpuMultiThreaded) + UpdateSnapshots(true); + Render3D->UploadTextures(0, 0, 2048, 2048); SaveState->Read(&fifoIdx, sizeof(fifoIdx)); SaveState->Read(&vromTextureAddr, sizeof(vromTextureAddr)); SaveState->Read(&vromTextureHeader, sizeof(vromTextureHeader)); @@ -117,23 +136,111 @@ void CReal3D::LoadState(CBlockFile *SaveState) Rendering ******************************************************************************/ -void CReal3D::RenderFrame(void) +void CReal3D::BeginVBlank(void) { - //if (commandPortWritten) - Render3D->RenderFrame(); + status |= 2; // VBlank bit +} + +void CReal3D::EndVBlank(void) +{ + error = false; // clear error (just needs to be done once per frame) + status &= ~2; +} + +UINT32 CReal3D::SyncSnapshots(void) +{ + // Update read-only copy of command port flag + commandPortWrittenRO = commandPortWritten; commandPortWritten = false; + + if (!g_Config.gpuMultiThreaded) + return 0; + + // Update read-only queue + queuedUploadTexturesRO = queuedUploadTextures; + queuedUploadTextures.clear(); + + // Update read-only snapshots + return 
UpdateSnapshots(false); +} + +UINT32 CReal3D::UpdateSnapshot(bool copyWhole, UINT8 *src, UINT8 *dst, unsigned size, UINT8 *dirty) +{ + unsigned dirtySize = DIRTY_SIZE(size); + if (copyWhole) + { + // If updating whole region, then just copy all data in one go + memcpy(dst, src, size); + memset(dirty, 0, dirtySize); + return size; + } + else + { + // Otherwise, loop through dirty pages array to find out what needs to be updated and copy only those parts + UINT32 copied = 0; + UINT8 *pSrc = src; + UINT8 *pDst = dst; + for (unsigned i = 0; i < dirtySize; i++) + { + UINT8 d = dirty[i]; + if (d) + { + for (unsigned j = 0; j < 8; j++) + { + if (d&1) + { + // If not at very end of region, then copy an extra 4 bytes to allow for a possible 32-bit overlap + UINT32 toCopy = (i < dirtySize - 1 || j < 7 ? PAGE_SIZE + 4 : PAGE_SIZE); + memcpy(pDst, pSrc, toCopy); + copied += toCopy; + } + d >>= 1; + pSrc += PAGE_SIZE; + pDst += PAGE_SIZE; + } + dirty[i] = 0; + } + else + { + pSrc += 8 * PAGE_SIZE; + pDst += 8 * PAGE_SIZE; + } + } + return copied; + } +} + +UINT32 CReal3D::UpdateSnapshots(bool copyWhole) +{ + // Update all memory region snapshots + UINT32 cullLoCopied = UpdateSnapshot(copyWhole, (UINT8*)cullingRAMLo, (UINT8*)cullingRAMLoRO, 0x400000, cullingRAMLoDirty); + UINT32 cullHiCopied = UpdateSnapshot(copyWhole, (UINT8*)cullingRAMHi, (UINT8*)cullingRAMHiRO, 0x100000, cullingRAMHiDirty); + UINT32 polyCopied = UpdateSnapshot(copyWhole, (UINT8*)polyRAM, (UINT8*)polyRAMRO, 0x400000, polyRAMDirty); + UINT32 textureCopied = UpdateSnapshot(copyWhole, (UINT8*)textureRAM, (UINT8*)textureRAMRO, 0x800000, textureRAMDirty); + //printf("Read3D copied - cullLo:%4uK, cullHi:%4uK, poly:%4uK, texture:%4uK\n", cullLoCopied / 1024, cullHiCopied / 1024, polyCopied / 1024, textureCopied / 1024); + return cullLoCopied + cullHiCopied + polyCopied + textureCopied; } void CReal3D::BeginFrame(void) { - status |= 2; // VBlank bit + // If multi-threaded, perform now any queued texture uploads to 
renderer before rendering begins + if (g_Config.gpuMultiThreaded) + { + for (vector::iterator it = queuedUploadTexturesRO.begin(), end = queuedUploadTexturesRO.end(); it != end; it++) + Render3D->UploadTextures(it->x, it->y, it->width, it->height); + } + Render3D->BeginFrame(); } +void CReal3D::RenderFrame(void) +{ + //if (commandPortWrittenRO) + Render3D->RenderFrame(); +} + void CReal3D::EndFrame(void) { - error = false; // clear error (just needs to be done once per frame) - status &= ~2; Render3D->EndFrame(); } @@ -528,7 +635,12 @@ void CReal3D::StoreTexture(unsigned xPos, unsigned yPos, unsigned width, unsigne for (yy = 0; yy < 8; yy++) { for (xx = 0; xx < 8; xx++) + { + if (g_Config.gpuMultiThreaded) + MARK_DIRTY(textureRAMDirty, destOffset * 2); textureRAM[destOffset++] = texData[decode[(yy*8+xx)^1]]; + } + destOffset += 2048-8; // next line } texData += 8*8; // next tile @@ -554,7 +666,11 @@ void CReal3D::StoreTexture(unsigned xPos, unsigned yPos, unsigned width, unsigne { for (xx = 0; xx < 8; xx += 2) { + if (g_Config.gpuMultiThreaded) + MARK_DIRTY(textureRAMDirty, destOffset * 2); textureRAM[destOffset++] = texData[decode[(yy^1)*8+((xx+0)^1)]/2]>>8; + if (g_Config.gpuMultiThreaded) + MARK_DIRTY(textureRAMDirty, destOffset * 2); textureRAM[destOffset++] = texData[decode[(yy^1)*8+((xx+1)^1)]/2]&0xFF; } @@ -564,6 +680,21 @@ void CReal3D::StoreTexture(unsigned xPos, unsigned yPos, unsigned width, unsigne } } } + + // Signal to renderer that textures have changed + // TO-DO: mipmaps? What if a game writes non-mipmap textures to mipmap area? 
+ if (g_Config.gpuMultiThreaded) + { + // If multi-threaded, then queue calls to UploadTextures for render thread to perform at beginning of next frame + QueuedUploadTextures upl; + upl.x = xPos; + upl.y = yPos; + upl.width = width; + upl.height = height; + queuedUploadTextures.push_back(upl); + } + else + Render3D->UploadTextures(xPos, yPos, width, height); } // Texture data will be in little endian format @@ -651,11 +782,6 @@ void CReal3D::UploadTexture(UINT32 header, UINT16 *texData) //printf("unknown texture format %02X\n", header>>24); break; } - - // Signal to renderer that textures have changed - // TO-DO: mipmaps? What if a game writes non-mipmap textures to mipmap area? - //Render3D->UploadTextures(x,y,width,height); - Render3D->UploadTextures(0,0,2048,2048); // TO-DO: should not have to upload all 2048x2048 texels } @@ -736,16 +862,22 @@ void CReal3D::WriteTexturePort(unsigned reg, UINT32 data) void CReal3D::WriteLowCullingRAM(UINT32 addr, UINT32 data) { + if (g_Config.gpuMultiThreaded) + MARK_DIRTY(cullingRAMLoDirty, addr); cullingRAMLo[addr/4] = data; } void CReal3D::WriteHighCullingRAM(UINT32 addr, UINT32 data) { + if (g_Config.gpuMultiThreaded) + MARK_DIRTY(cullingRAMHiDirty, addr); cullingRAMHi[addr/4] = data; } void CReal3D::WritePolygonRAM(UINT32 addr, UINT32 data) { + if (g_Config.gpuMultiThreaded) + MARK_DIRTY(polyRAMDirty, addr); polyRAM[addr/4] = data; } @@ -807,7 +939,11 @@ void CReal3D::Reset(void) error = false; commandPortWritten = false; - + commandPortWrittenRO = false; + + queuedUploadTextures.clear(); + queuedUploadTexturesRO.clear(); + fifoIdx = 0; status = 0; vromTextureAddr = 0; @@ -817,8 +953,9 @@ void CReal3D::Reset(void) dmaStatus = 0; dmaUnknownReg = 0; - memset(memoryPool, 0, MEMORY_POOL_SIZE); - + unsigned memSize = (g_Config.gpuMultiThreaded ? 
MEMORY_POOL_SIZE : MEM_POOL_SIZE_RW); + memset(memoryPool, 0, memSize); + DebugLog("Real3D reset\n"); } @@ -830,8 +967,15 @@ void CReal3D::Reset(void) void CReal3D::AttachRenderer(CRender3D *Render3DPtr) { Render3D = Render3DPtr; - Render3D->AttachMemory(cullingRAMLo,cullingRAMHi,polyRAM,vrom,textureRAM); + + // If multi-threaded, attach read-only snapshots to renderer instead of real ones + if (g_Config.gpuMultiThreaded) + Render3D->AttachMemory(cullingRAMLoRO, cullingRAMHiRO, polyRAMRO, vrom, textureRAMRO); + else + Render3D->AttachMemory(cullingRAMLo, cullingRAMHi, polyRAM, vrom, textureRAM); + Render3D->SetStep(step); + DebugLog("Real3D attached a Render3D object\n"); } @@ -859,7 +1003,8 @@ void CReal3D::SetStep(int stepID) bool CReal3D::Init(const UINT8 *vromPtr, CBus *BusObjectPtr, CIRQ *IRQObjectPtr, unsigned dmaIRQBit) { - float memSizeMB = (float)MEMORY_POOL_SIZE/(float)0x100000; + unsigned memSize = (g_Config.gpuMultiThreaded ? MEMORY_POOL_SIZE : MEM_POOL_SIZE_RW); + float memSizeMB = (float)memSize/(float)0x100000; // IRQ and bus objects Bus = BusObjectPtr; @@ -867,20 +1012,34 @@ bool CReal3D::Init(const UINT8 *vromPtr, CBus *BusObjectPtr, CIRQ *IRQObjectPtr, dmaIRQ = dmaIRQBit; // Allocate all Real3D RAM regions - memoryPool = new(std::nothrow) UINT8[MEMORY_POOL_SIZE]; + memoryPool = new(std::nothrow) UINT8[memSize]; if (NULL == memoryPool) return ErrorLog("Insufficient memory for Real3D object (needs %1.1f MB).", memSizeMB); - // Set up pointers + // Set up main pointers cullingRAMLo = (UINT32 *) &memoryPool[OFFSET_8C]; cullingRAMHi = (UINT32 *) &memoryPool[OFFSET_8E]; polyRAM = (UINT32 *) &memoryPool[OFFSET_98]; textureRAM = (UINT16 *) &memoryPool[OFFSET_TEXRAM]; textureFIFO = (UINT32 *) &memoryPool[OFFSET_TEXFIFO]; + // If multi-threaded, set up pointers for read-only snapshots and dirty page arrays too + if (g_Config.gpuMultiThreaded) + { + cullingRAMLoRO = (UINT32 *) &memoryPool[OFFSET_8C_RO]; + cullingRAMHiRO = (UINT32 *) 
&memoryPool[OFFSET_8E_RO]; + polyRAMRO = (UINT32 *) &memoryPool[OFFSET_98_RO]; + textureRAMRO = (UINT16 *) &memoryPool[OFFSET_TEXRAM_RO]; + cullingRAMLoDirty = (UINT8 *) &memoryPool[OFFSET_8C_DIRTY]; + cullingRAMHiDirty = (UINT8 *) &memoryPool[OFFSET_8E_DIRTY]; + polyRAMDirty = (UINT8 *) &memoryPool[OFFSET_98_DIRTY]; + textureRAMDirty = (UINT8 *) &memoryPool[OFFSET_TEXRAM_DIRTY]; + } + // VROM pointer passed to us vrom = (UINT32 *) vromPtr; + DebugLog("Initialized Real3D (allocated %1.1f MB)\n", memSizeMB); return OKAY; } diff --git a/Src/Model3/Real3D.h b/Src/Model3/Real3D.h index a7e1bec..d9e9ab7 100644 --- a/Src/Model3/Real3D.h +++ b/Src/Model3/Real3D.h @@ -29,6 +29,20 @@ #ifndef INCLUDED_REAL3D_H #define INCLUDED_REAL3D_H +/* + * QueuedUploadTextures: + * + * When rendering is multi-threaded, this struct is used to represent a postponed + * call to CRender3D::UploadTextures that will be performed by the render thread + * at the beginning of the next frame, rather than directly in the PPC thread. + */ +struct QueuedUploadTextures +{ + unsigned x; + unsigned y; + unsigned width; + unsigned height; +}; /* * CReal3D: @@ -60,28 +74,56 @@ public: * SaveState Block file to load state information from. */ void LoadState(CBlockFile *SaveState); + + /* + * BeginVBlank(void): + * + * Must be called before the VBlank starts. + */ + void BeginVBlank(void); /* - * RenderFrame(void): + * EndVBlank(void) * - * Traverses the scene database and renders a frame. Must be called after - * BeginFrame() but before EndFrame(). + * Must be called after the VBlank finishes. */ - void RenderFrame(void); - + void EndVBlank(void); + + /* + * SyncSnapshots(void): + * + * Syncs the read-only memory snapshots with the real ones so that rendering + * of the current frame can begin in the render thread. Must be called at the + * end of each frame when both the render thread and the PPC thread have finished + * their work. 
If multi-threaded rendering is not enabled, then this method does + * nothing. + */ + UINT32 SyncSnapshots(void); + /* * BeginFrame(void): * - * Prepare to render a new frame. Must be called once per frame prior to - * drawing anything. + * Prepares to render a new frame. Must be called once per frame prior to + * drawing anything and must only access read-only snapshots and variables + * since it may be running in a separate thread. */ void BeginFrame(void); + /* + * RenderFrame(void): + * + * Traverses the scene database and renders a frame. Must be called after + * BeginFrame() but before EndFrame() and must only access read-only snapshots + * and variables since it may be running in a separate thread. + */ + void RenderFrame(void); + /* * EndFrame(void): * - * Signals the end of rendering for this frame. Must be called last during - * the frame. + * Signals the end of rendering for this frame. Must be called last during + * the frame and must only access read-only snapshots and variables since it + * may be running in a separate thread. 
*/ void EndFrame(void); @@ -342,7 +384,9 @@ private: unsigned Shift(UINT8 *data, unsigned numBits); void StoreTexture(unsigned xPos, unsigned yPos, unsigned width, unsigned height, UINT16 *texData, unsigned bytesPerTexel); void UploadTexture(UINT32 header, UINT16 *texData); - + UINT32 UpdateSnapshots(bool copyWhole); + UINT32 UpdateSnapshot(bool copyWhole, UINT8 *src, UINT8 *dst, unsigned size, UINT8 *dirty); + // Renderer attached to the Real3D CRender3D *Render3D; @@ -353,18 +397,34 @@ private: // Error flag (to limit errors to once per frame) bool error; // true if an error occurred this frame - + // Real3D memory - UINT8 *memoryPool; // all memory allocated here - UINT32 *cullingRAMLo; // 4MB of culling RAM at 8C000000 - UINT32 *cullingRAMHi; // 1MB of culling RAM at 8E000000 - UINT32 *polyRAM; // 4MB of polygon RAM at 98000000 - UINT16 *textureRAM; // 8MB of internal texture RAM - UINT32 *textureFIFO; // 1MB texture FIFO at 0x94000000 - unsigned fifoIdx; // index into texture FIFO + UINT8 *memoryPool; // all memory allocated here + UINT32 *cullingRAMLo; // 4MB of culling RAM at 8C000000 + UINT32 *cullingRAMHi; // 1MB of culling RAM at 8E000000 + UINT32 *polyRAM; // 4MB of polygon RAM at 98000000 + UINT16 *textureRAM; // 8MB of internal texture RAM + UINT32 *textureFIFO; // 1MB texture FIFO at 0x94000000 + unsigned fifoIdx; // index into texture FIFO UINT32 vromTextureAddr; // VROM texture port address data UINT32 vromTextureHeader; // VROM texture port header data + // Read-only snapshots + UINT32 *cullingRAMLoRO; // 4MB of culling RAM at 8C000000 [read-only snapshot] + UINT32 *cullingRAMHiRO; // 1MB of culling RAM at 8E000000 [read-only snapshot] + UINT32 *polyRAMRO; // 4MB of polygon RAM at 98000000 [read-only snapshot] + UINT16 *textureRAMRO; // 8MB of internal texture RAM [read-only snapshot] + + // Arrays to keep track of dirty pages in memory regions + UINT8 *cullingRAMLoDirty; + UINT8 *cullingRAMHiDirty; + UINT8 *polyRAMDirty; + UINT8 *textureRAMDirty; 
+ + // Queued texture uploads + vector queuedUploadTextures; + vector queuedUploadTexturesRO; // Read-only copy of queue + // Big endian bus object for DMA memory access CBus *Bus; @@ -383,6 +443,7 @@ private: // Command port bool commandPortWritten; + bool commandPortWrittenRO; // Read-only copy of flag // Status and command registers UINT32 status; @@ -394,7 +455,6 @@ private: unsigned tapIDSize; // size of ID data in bits unsigned tapTDO; // bit shifted out to TDO int tapState; // current state - }; diff --git a/Src/Model3/TileGen.cpp b/Src/Model3/TileGen.cpp index d3ef61d..385b2a6 100644 --- a/Src/Model3/TileGen.cpp +++ b/Src/Model3/TileGen.cpp @@ -34,6 +34,23 @@ #include #include "Supermodel.h" +// Macros that divide memory regions into pages and mark them as dirty when they are written to +#define PAGE_WIDTH 10 +#define PAGE_SIZE (1<>(PAGE_WIDTH+3)] |= 1<<((addr>>PAGE_WIDTH)&7) + +// Offsets of memory regions within TileGen memory pool +#define OFFSET_VRAM 0x000000 +#define OFFSET_PAL 0x120000 +#define MEM_POOL_SIZE_RW (0x120000+0x020000) +#define OFFSET_VRAM_RO 0x140000 // [read-only snapshot] +#define OFFSET_PAL_RO 0x260000 // [read-only snapshot] +#define MEM_POOL_SIZE_RO (0x120000+0x020000) +#define OFFSET_VRAM_DIRTY 0x280000 +#define OFFSET_PAL_DIRTY (OFFSET_VRAM_DIRTY+DIRTY_SIZE(0x120000)) +#define MEM_POOL_SIZE_DIRTY (DIRTY_SIZE(MEM_POOL_SIZE_RO)) +#define MEMORY_POOL_SIZE (MEM_POOL_SIZE_RW+MEM_POOL_SIZE_RO+MEM_POOL_SIZE_DIRTY) /****************************************************************************** Save States @@ -42,7 +59,7 @@ void CTileGen::SaveState(CBlockFile *SaveState) { SaveState->NewBlock("Tile Generator", __FILE__); - SaveState->Write(memoryPool, 0x100000+0x20000); + SaveState->Write(vram, 0x120000); // Don't write out palette, read-only snapshots or dirty page arrays, just VRAM SaveState->Write(regs, sizeof(regs)); } @@ -55,16 +72,17 @@ void CTileGen::LoadState(CBlockFile *SaveState) } // Load memory one word at a time - for (int i = 
0; i < (0x100000+0x20000); i += 4) + for (int i = 0; i < 0x120000; i += 4) { UINT32 data; - - SaveState->Read(&data, sizeof(data)); - Render2D->WriteVRAM(i, data); - *(UINT32 *) &memoryPool[i] = data; - } + SaveState->Read(&data, sizeof(data)); + WriteRAM(i, data); + } SaveState->Read(regs, sizeof(regs)); + // If multi-threaded, update read-only snapshots too + if (g_Config.gpuMultiThreaded) + UpdateSnapshots(true); } @@ -72,9 +90,8 @@ void CTileGen::LoadState(CBlockFile *SaveState) Rendering ******************************************************************************/ -void CTileGen::BeginFrame(void) +void CTileGen::BeginVBlank(void) { - Render2D->BeginFrame(); /* printf("08: %X\n", regs[0x08/4]); printf("0C: %X\n", regs[0x0C/4]); @@ -88,25 +105,144 @@ void CTileGen::BeginFrame(void) */ } +void CTileGen::EndVBlank(void) +{ + // +} + +UINT32 CTileGen::SyncSnapshots(void) +{ + if (!g_Config.gpuMultiThreaded) + return 0; + + // Update read-only snapshots + return UpdateSnapshots(false); +} + +UINT32 CTileGen::UpdateSnapshot(bool copyWhole, UINT8 *src, UINT8 *dst, unsigned size, UINT8 *dirty) +{ + unsigned dirtySize = DIRTY_SIZE(size); + if (copyWhole) + { + // If updating whole region, then just copy all data in one go + memcpy(dst, src, size); + memset(dirty, 0, dirtySize); + return size; + } + else + { + // Otherwise, loop through dirty pages array to find out what needs to be updated and copy only those parts + UINT32 copied = 0; + UINT8 *pSrc = src; + UINT8 *pDst = dst; + for (unsigned i = 0; i < dirtySize; i++) + { + UINT8 d = dirty[i]; + if (d) + { + for (unsigned j = 0; j < 8; j++) + { + if (d&1) + { + // If not at very end of region, then copy an extra 4 bytes to allow for a possible 32-bit overlap + UINT32 toCopy = (i < dirtySize - 1 || j < 7 ? 
PAGE_SIZE + 4 : PAGE_SIZE); + memcpy(pDst, pSrc, toCopy); + copied += toCopy; + } + d >>= 1; + pSrc += PAGE_SIZE; + pDst += PAGE_SIZE; + } + dirty[i] = 0; + } + else + { + pSrc += 8 * PAGE_SIZE; + pDst += 8 * PAGE_SIZE; + } + } + return copied; + } +} + +UINT32 CTileGen::UpdateSnapshots(bool copyWhole) +{ + // Update all memory region snapshots + UINT32 palCopied = UpdateSnapshot(copyWhole, (UINT8*)pal, (UINT8*)palRO, 0x020000, palDirty); + UINT32 vramCopied = UpdateSnapshot(copyWhole, (UINT8*)vram, (UINT8*)vramRO, 0x120000, vramDirty); + memcpy(regsRO, regs, sizeof(regs)); // Always copy whole of regs buffer + //printf("TileGen copied - pal:%4uK, vram:%4uK, regs:%uK\n", palCopied / 1024, vramCopied / 1024, sizeof(regs) / 1024); + return palCopied + vramCopied + sizeof(regs); +} + +void CTileGen::BeginFrame(void) +{ + // NOTE: Render2D->WriteVRAM(addr, data) is no longer being called for RAM addresses that are written + // to and instead this class relies upon the fact that Render2D currently marks everything as dirty + // with every frame. If this were to change in the future then code to handle marking the correct + // parts of the renderer as dirty would need to be added here. 
+ + Render2D->BeginFrame(); +} + void CTileGen::EndFrame(void) { Render2D->EndFrame(); } - /****************************************************************************** Emulation Functions ******************************************************************************/ UINT32 CTileGen::ReadRAM(unsigned addr) { - return *(UINT32 *) &memoryPool[addr]; + return *(UINT32 *) &vram[addr]; } void CTileGen::WriteRAM(unsigned addr, UINT32 data) { - Render2D->WriteVRAM(addr,data); // inform renderer of update first - *(UINT32 *) &memoryPool[addr] = data; + if (g_Config.gpuMultiThreaded) + MARK_DIRTY(vramDirty, addr); + *(UINT32 *) &vram[addr] = data; + + // Update palette if required + if (addr >= 0x100000) + { + addr -= 0x100000; + unsigned color = addr/4; // color index + if (g_Config.gpuMultiThreaded) + MARK_DIRTY(palDirty, addr); + WritePalette(color, data); + } +} + +void CTileGen::InitPalette(void) +{ + for (int i = 0; i < 0x20000/4; i++) + { + WritePalette(i, vram[0x100000/4 + i]); + if (g_Config.gpuMultiThreaded) + palRO[i] = pal[i]; + } +} + +void CTileGen::WritePalette(unsigned color, UINT32 data) +{ + UINT8 r, g, b, a; + + a = 0xFF * ((data>>15)&1); // decode the RGBA (make alpha 0xFF or 0x00) + a = ~a; // invert it (set on Model 3 means clear pixel) + + if ((data&0x8000)) + r = g = b = 0; + else + { + b = (data>>7)&0xF8; + g = (data>>2)&0xF8; + r = (data<<3)&0xF8; + } + + pal[color] = (a<<24)|(b<<16)|(g<<8)|r; } void CTileGen::WriteRegister(unsigned reg, UINT32 data) @@ -136,8 +272,13 @@ void CTileGen::WriteRegister(unsigned reg, UINT32 data) void CTileGen::Reset(void) { + unsigned memSize = (g_Config.gpuMultiThreaded ? 
MEMORY_POOL_SIZE : MEM_POOL_SIZE_RW); + memset(memoryPool, 0, memSize); memset(regs, 0, sizeof(regs)); - memset(memoryPool, 0, 0x120000); + memset(regsRO, 0, sizeof(regsRO)); + + InitPalette(); + DebugLog("Tile Generator reset\n"); } @@ -149,22 +290,48 @@ void CTileGen::Reset(void) void CTileGen::AttachRenderer(CRender2D *Render2DPtr) { Render2D = Render2DPtr; - Render2D->AttachVRAM(memoryPool); - Render2D->AttachRegisters(regs); + + // If multi-threaded, attach read-only snapshots to renderer instead of real ones + if (g_Config.gpuMultiThreaded) + { + Render2D->AttachVRAM(vramRO); + Render2D->AttachPalette(palRO); + Render2D->AttachRegisters(regsRO); + } + else + { + Render2D->AttachVRAM(vram); + Render2D->AttachPalette(pal); + Render2D->AttachRegisters(regs); + } + DebugLog("Tile Generator attached a Render2D object\n"); } -#define MEMORY_POOL_SIZE 0x120000 bool CTileGen::Init(CIRQ *IRQObjectPtr) { - float memSizeMB = (float)MEMORY_POOL_SIZE/(float)0x100000; + unsigned memSize = (g_Config.gpuMultiThreaded ? 
MEMORY_POOL_SIZE : MEM_POOL_SIZE_RW); + float memSizeMB = (float)memSize/(float)0x100000; - // Allocate all memory for ROMs and PPC RAM - memoryPool = new(std::nothrow) UINT8[MEMORY_POOL_SIZE]; + // Allocate all memory for all TileGen RAM regions + memoryPool = new(std::nothrow) UINT8[memSize]; if (NULL == memoryPool) return ErrorLog("Insufficient memory for tile generator object (needs %1.1f MB).", memSizeMB); + // Set up main pointers + vram = (UINT8 *) &memoryPool[OFFSET_VRAM]; + pal = (UINT32 *) &memoryPool[OFFSET_PAL]; + + // If multi-threaded, set up pointers for read-only snapshots and dirty page arrays too + if (g_Config.gpuMultiThreaded) + { + vramRO = (UINT8 *) &memoryPool[OFFSET_VRAM_RO]; + palRO = (UINT32 *) &memoryPool[OFFSET_PAL_RO]; + vramDirty = (UINT8 *) &memoryPool[OFFSET_VRAM_DIRTY]; + palDirty = (UINT8 *) &memoryPool[OFFSET_PAL_DIRTY]; + } + // Hook up the IRQ controller IRQ = IRQObjectPtr; diff --git a/Src/Model3/TileGen.h b/Src/Model3/TileGen.h index f0d9b7a..4d1e0de 100644 --- a/Src/Model3/TileGen.h +++ b/Src/Model3/TileGen.h @@ -58,22 +58,49 @@ public: */ void LoadState(CBlockFile *SaveState); + /* + * BeginVBlank(void): + * + * Must be called before the VBlank starts. + */ + void BeginVBlank(void); + + /* + * EndVBlank(void) + * + * Must be called after the VBlank finishes. + */ + void EndVBlank(void); + + /* + * SyncSnapshots(void): + * + * Syncs the read-only memory snapshots with the real ones so that rendering + * of the current frame can begin in the render thread. Must be called at the + * end of each frame when both the render thread and the PPC thread have finished + * their work. If multi-threaded rendering is not enabled, then this method does + * nothing. + */ + UINT32 SyncSnapshots(void); + /* * BeginFrame(void): * - * Prepare to render a new frame. Must be called once per frame prior to - * drawing anything. + * Prepares to render a new frame. 
Must be called once per frame prior to + * drawing anything and must only access read-only snapshots and variables + * since it may be running in a separate thread. */ void BeginFrame(void); - + /* * EndFrame(void): * - * Signals the end of rendering for this frame. Must be called last during - * the frame. + * Signals the end of rendering for this frame. Must be called last during + * the frame and must only access read-only snapshots and variables since it + * may be running in a separate thread. */ void EndFrame(void); - + /* * ReadRAM(addr): * @@ -163,14 +190,32 @@ public: ~CTileGen(void); private: + // Private member functions + void InitPalette(void); + void WritePalette(unsigned color, UINT32 data); + UINT32 UpdateSnapshots(bool copyWhole); + UINT32 UpdateSnapshot(bool copyWhole, UINT8 *src, UINT8 *dst, unsigned size, UINT8 *dirty); + CIRQ *IRQ; // IRQ controller the tile generator is attached to CRender2D *Render2D; // 2D renderer the tile generator is attached to // Tile generator VRAM UINT8 *memoryPool; // all memory allocated here + UINT8 *vram; // 1.8MB of VRAM + UINT32 *pal; // 0x20000 byte (32K colors) palette + + // Read-only snapshots + UINT8 *vramRO; // 1.8MB of VRAM [read-only snapshot] + UINT32 *palRO; // 0x20000 byte (32K colors) palette [read-only snapshot] + // Arrays to keep track of dirty pages in memory regions + UINT8 *vramDirty; + UINT8 *palDirty; + // Registers UINT32 regs[64]; + UINT32 regsRO[64]; // Read-only copy of registers + }; diff --git a/Src/OSD/SDL/Main.cpp b/Src/OSD/SDL/Main.cpp index 0f076a3..dbca5e4 100644 --- a/Src/OSD/SDL/Main.cpp +++ b/Src/OSD/SDL/Main.cpp @@ -360,6 +360,8 @@ static void ApplySettings(CINIFile *INI, const char *section) // Model 3 if (OKAY == INI->Get(section, "MultiThreaded", x)) g_Config.multiThreaded = x ? true : false; + if (OKAY == INI->Get(section, "GPUMultiThreaded", x)) + g_Config.gpuMultiThreaded = x ? 
true : false; if (OKAY == INI->Get(section, "PowerPCFrequency", x)) g_Config.SetPowerPCFrequency(x); @@ -381,7 +383,7 @@ static void ApplySettings(CINIFile *INI, const char *section) #ifdef SUPERMODEL_WIN32 if (OKAY == INI->Get(section, "ForceFeedback", x)) g_Config.forceFeedback = x ? true : false; -#endif +#endif // SUPERMODEL_WIN32 // OSD INI->Get(section, "XResolution", g_Config.xRes); @@ -457,6 +459,7 @@ static void LogConfig(void) // CModel3Config InfoLog("\tMultiThreaded = %d", g_Config.multiThreaded); + InfoLog("\tGPUMultiThreaded = %d", g_Config.gpuMultiThreaded); InfoLog("\tPowerPCFrequency = %d", g_Config.GetPowerPCFrequency()); // CSoundBoardConfig @@ -709,8 +712,9 @@ int Supermodel(const char *zipFile, CInputs *Inputs, CINIFile *CmdLine) unsigned fpsFramesElapsed, framesElapsed; unsigned showCrosshairs = 0; // bit 1: player 1 crosshair, bit 0: player 2 bool gameHasLightguns = false; - bool quit = 0; - bool paused = 0; + bool quit = false; + bool paused = false; + bool dumpTimings = false; // Initialize and load ROMs if (OKAY != Model3->Init()) @@ -791,7 +795,7 @@ int Supermodel(const char *zipFile, CInputs *Inputs, CINIFile *CmdLine) // Poll the inputs if (!Inputs->Poll(Model3->GetGameInfo(), xOffset, yOffset, xRes, yRes)) - quit = 1; + quit = true; #ifdef SUPERMODEL_DEBUGGER bool processUI = true; @@ -802,12 +806,12 @@ int Supermodel(const char *zipFile, CInputs *Inputs, CINIFile *CmdLine) // Check if debugger requests exit or pause if (Debugger->CheckExit()) { - quit = 1; + quit = true; processUI = false; } else if (Debugger->CheckPause()) { - paused = 1; + paused = true; processUI = false; } } @@ -819,7 +823,7 @@ int Supermodel(const char *zipFile, CInputs *Inputs, CINIFile *CmdLine) if (Inputs->uiExit->Pressed()) { // Quit emulator - quit = 1; + quit = true; } else if (Inputs->uiReset->Pressed()) { @@ -979,9 +983,12 @@ int Supermodel(const char *zipFile, CInputs *Inputs, CINIFile *CmdLine) // Dump input states 
Inputs->DumpState(Model3->GetGameInfo()); } + else if (Inputs->uiDumpTimings->Pressed()) + { + dumpTimings = !dumpTimings; + } else if (Inputs->uiSelectCrosshairs->Pressed() && gameHasLightguns) { - showCrosshairs++; switch ((showCrosshairs&3)) { @@ -1042,6 +1049,9 @@ int Supermodel(const char *zipFile, CInputs *Inputs, CINIFile *CmdLine) startTicks = currentTicks; } } + + if (dumpTimings && !paused) + Model3->DumpTimings(); } // Make sure all threads are paused before shutting down