Supermodel/Src/Graphics/New3D/New3D.cpp

#include "New3D.h"
#include "Vec.h"
#include <cmath>
#include <algorithm>
#include <limits>
#include <cstring>
#include <unordered_map>
#include "R3DFloat.h"
#include "Util/BitCast.h"

// Vertex capacities used to size the shared vertex buffer: the constructor
// allocates (MAX_RAM_VERTS + MAX_ROM_VERTS) FVertex entries, and RenderFrame
// uploads the per-frame RAM vertices at an offset of MAX_ROM_VERTS entries.
#define MAX_RAM_VERTS 300000
#define MAX_ROM_VERTS 1500000

// Evaluates to (2*B + 1) / 255 as a float
#define BYTE_TO_FLOAT(B)	((2.0f * (B) + 1.0f) * (float)(1.0/255.0))

namespace New3D {

// Constructs the renderer.
//
// config   - configuration node; passed to the shader and scroll fog helpers and
//            queried for the "QuadRendering" option
// gameName - stored in m_gameName
//
// Emulated memory pointers start out null (they are assigned in AttachMemory).
// The constructor issues OpenGL calls (texture, VAO and VBO creation), so
// presumably a GL context must already be current - TODO confirm with caller.
CNew3D::CNew3D(const Util::Config::Node &config, const std::string& gameName) :
	m_r3dShader(config),
	m_r3dScrollFog(config),
	m_gameName(gameName),
	m_textureBuffer(0),
	m_vao(0)
{
	m_cullingRAMLo	= nullptr;
	m_cullingRAMHi	= nullptr;
	m_polyRAM		= nullptr;
	m_vrom			= nullptr;
	m_textureRAM	= nullptr;
	m_sunClamp		= true;
	m_shadeIsSigned = true;
	m_numPolyVerts	= 3;				// default: polygons are emitted as triangles
	m_primType		= GL_TRIANGLES;

	// optional quad mode: 4 vertices per polygon, drawn with the lines-adjacency primitive
	if (config["QuadRendering"].ValueAs<bool>()) {
		m_numPolyVerts	= 4;
		m_primType		= GL_LINES_ADJACENCY;
	}

	m_r3dShader.LoadShader();
	glUseProgram(0);

	// setup our texture memory: one 2048x2048 single channel 16-bit unsigned integer texture,
	// nearest filtering, repeat wrapping. Contents are filled in later by UploadTextures.

	glGenTextures(1, &m_textureBuffer);
	glBindTexture(GL_TEXTURE_2D, m_textureBuffer);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
	glTexImage2D(GL_TEXTURE_2D, 0, GL_R16UI, 2048, 2048, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, nullptr);	// allocate storage only, no data uploaded

	// set up our vertex buffer memory: a single buffer big enough for both the RAM and ROM vertex budgets

	glGenVertexArrays(1, &m_vao);
	glBindVertexArray(m_vao);
	m_vbo.Create(GL_ARRAY_BUFFER, GL_DYNAMIC_DRAW, sizeof(FVertex) * (MAX_RAM_VERTS + MAX_ROM_VERTS));
	m_vbo.Bind(true);

	// enable every vertex attribute the shader consumes (locations are looked up by name)
	glEnableVertexAttribArray(m_r3dShader.GetVertexAttribPos("inVertex"));
	glEnableVertexAttribArray(m_r3dShader.GetVertexAttribPos("inNormal"));
	glEnableVertexAttribArray(m_r3dShader.GetVertexAttribPos("inTexCoord"));
	glEnableVertexAttribArray(m_r3dShader.GetVertexAttribPos("inColour"));
	glEnableVertexAttribArray(m_r3dShader.GetVertexAttribPos("inFaceNormal"));
	glEnableVertexAttribArray(m_r3dShader.GetVertexAttribPos("inFixedShade"));

	// describe the interleaved FVertex layout: every attribute uses a stride of sizeof(FVertex)
	// and the byte offset of the matching FVertex member (the position starts at offset 0).
	// inColour is the only attribute stored as normalised unsigned bytes, the rest are floats.
	glVertexAttribPointer(m_r3dShader.GetVertexAttribPos("inVertex"), 4, GL_FLOAT, GL_FALSE, sizeof(FVertex), 0);
	glVertexAttribPointer(m_r3dShader.GetVertexAttribPos("inNormal"), 3, GL_FLOAT, GL_FALSE, sizeof(FVertex), (void*)offsetof(FVertex, normal));
	glVertexAttribPointer(m_r3dShader.GetVertexAttribPos("inTexCoord"), 2, GL_FLOAT, GL_FALSE, sizeof(FVertex), (void*)offsetof(FVertex, texcoords));
	glVertexAttribPointer(m_r3dShader.GetVertexAttribPos("inColour"), 4, GL_UNSIGNED_BYTE, GL_TRUE, sizeof(FVertex), (void*)offsetof(FVertex, faceColour));
	glVertexAttribPointer(m_r3dShader.GetVertexAttribPos("inFaceNormal"), 3, GL_FLOAT, GL_FALSE, sizeof(FVertex), (void*)offsetof(FVertex, faceNormal));
	glVertexAttribPointer(m_r3dShader.GetVertexAttribPos("inFixedShade"), 1, GL_FLOAT, GL_FALSE, sizeof(FVertex), (void*)offsetof(FVertex, fixedShade));

	// leave nothing bound
	glBindVertexArray(0);
	m_vbo.Bind(false);
}

// Releases the GL objects created by the constructor: the vertex buffer, the
// vertex array object, the texture memory texture and the shader.
CNew3D::~CNew3D()
{
	m_vbo.Destroy();

	// both handles are initialised to 0 by the constructor; only delete real objects
	if (m_vao != 0) {
		glDeleteVertexArrays(1, &m_vao);
	}
	m_vao = 0;

	if (m_textureBuffer != 0) {
		glDeleteTextures(1, &m_textureBuffer);
	}
	m_textureBuffer = 0;

	m_r3dShader.UnloadShader();
}

// Stores the pointers to the emulated memory regions the renderer reads from.
// The pointers are only remembered here, nothing is copied.
void CNew3D::AttachMemory(const UINT32 *cullingRAMLoPtr, const UINT32 *cullingRAMHiPtr, const UINT32 *polyRAMPtr, const UINT32 *vromPtr, const UINT16 *textureRAMPtr)
{
	// scene graph (culling) memory
	m_cullingRAMHi = cullingRAMHiPtr;
	m_cullingRAMLo = cullingRAMLoPtr;

	// model data
	m_vrom = vromPtr;
	m_polyRAM = polyRAMPtr;

	// texture sheet
	m_textureRAM = textureRAMPtr;
}

// Selects the hardware stepping. Only 0x10, 0x15, 0x20 and 0x21 are accepted;
// anything else falls back to 0x10. The stepping decides the culling node
// word offset and the fixed point scale applied to vertices.
void CNew3D::SetStepping(int stepping)
{
	switch (stepping)
	{
	case 0x10:
	case 0x15:
	case 0x20:
	case 0x21:
		m_step = stepping;
		break;
	default:
		m_step = 0x10;		// unknown stepping, treat as Step 1.0
		break;
	}

	const bool laterThanStep10 = (m_step > 0x10);

	// culling nodes are 10 words (offset 0) on later steppings, 8 words (offset 2) on Step 1.0
	m_offset = laterThanStep10 ? 0 : 2;

	// vertices are 13.11 fixed point on later steppings, 17.7 on Step 1.0
	m_vertexFactor = laterThanStep10 ? (1.0f / 2048.0f) : (1.0f / 128.0f);
}

// Records the output geometry and (re)creates the frame buffers.
//
// xOffset, yOffset             - position of the 3D output within the display
// xRes, yRes                   - resolution of the 3D output
// totalXResParam/totalYResParam - full display resolution, used for the FBOs
//
// Always returns OKAY.
bool CNew3D::Init(unsigned xOffset, unsigned yOffset, unsigned xRes, unsigned yRes, unsigned totalXResParam, unsigned totalYResParam)
{
	// where the 3D image sits inside the physical display area
	m_xOffs = xOffset;
	m_yOffs = yOffset;
	m_xRes = xRes;
	m_yRes = yRes;

	// scale factors relative to a 496x384 image
	m_xRatio = xRes * (float)(1.0 / 496.0);
	m_yRatio = yRes * (float)(1.0 / 384.0);

	m_totalXRes = totalXResParam;
	m_totalYRes = totalYResParam;

	// drop any frame buffers from a previous call before creating new ones
	m_r3dFrameBuffers.DestroyFBO();
	m_r3dFrameBuffers.CreateFBO(totalXResParam, totalYResParam);

	return OKAY;
}

void CNew3D::UploadTextures(unsigned level, unsigned x, unsigned y, unsigned width, unsigned height)
{
	glBindTexture(GL_TEXTURE_2D, m_textureBuffer);
	glPixelStorei(GL_UNPACK_ALIGNMENT, 2);

	for (unsigned i = 0; i < height; i++) {
		glTexSubImage2D(GL_TEXTURE_2D, 0, x, y + i, width, 1, GL_RED_INTEGER, GL_UNSIGNED_SHORT, m_textureRAM + ((y + i) * 2048) + x);
	}
}

void CNew3D::DrawScrollFog()
{
	// this is my best guess at the logic based upon what games are doing
	//
	// ocean hunter		- every viewport has scroll fog values set. Must start with lowest priority layers as the higher ones sometimes are garbage
	// scud race		- first viewports in priority layer missing scroll values. The latter ones all contain valid scroll values.
	// daytona			- doesn't seem to use scroll fog at all. Will set scroll values for the first viewports, the end ones contain no scroll values. End credits have scroll fog, but constrained to the viewport
	// vf3				- first viewport only has it set. But set with highest select value ?? Rest of the viewports in priority layer contain a lower select value
	// sega bassfishing	- first viewport in priority 1 sets scroll value. The rest all contain the wrong value + a higher select value ..
	// spikeout final	- 2nd viewport in the priority layer has scroll values set, none of the others do. It also uses the highest select value

	// I think the basic logic is this: the real3d picks the highest scroll fog value, starting from the lowest priority layer.
	// If it finds a value for priority layer 0 for example, it then bails out looking for any more.
	// Fogging seems to be constrained to whatever the viewport is that is set.
	// Scroll fog needs a density or start value to work, but these can come from another viewport if the fog colour is the same

	Node* nodePtr = nullptr;

	for (int i = 0; i < 4 && !nodePtr; i++) {
		for (auto &n : m_nodes) {
			if (n.viewport.priority == i) {
				if (n.viewport.scrollFog > 0.f) {

					// check to see if we have a higher scroll fog value
					if (nodePtr) {
						if (nodePtr->viewport.scrollFog < n.viewport.scrollFog) {
							nodePtr = &n;
						}

						continue;
					}

					nodePtr = &n;
				}
			}
		}
	}

	if (nodePtr) {

		// interate nodes to see if any viewports with that fog colour actually set a fog density or start value
		// if both of these are zero fogging is effectively disabled

		for (auto& n : m_nodes) {

			if (nodePtr->viewport.fogParams[0] == n.viewport.fogParams[0] &&
				nodePtr->viewport.fogParams[1] == n.viewport.fogParams[1] &&
				nodePtr->viewport.fogParams[2] == n.viewport.fogParams[2])
			{
				// check to see if we have a fog start or density value

				if (n.viewport.fogParams[3] > 0.0f || n.viewport.fogParams[4] > 0.0f || n.viewport.scrollAtt > 0.0f) {

					float rgba[4];
					auto& vp = nodePtr->viewport;
					rgba[0] = vp.fogParams[0];
					rgba[1] = vp.fogParams[1];
					rgba[2] = vp.fogParams[2];
					rgba[3] = vp.scrollFog;
					glViewport(vp.x, vp.y, vp.width, vp.height);
					m_r3dScrollFog.DrawScrollFog(rgba, n.viewport.scrollAtt, n.viewport.fogParams[6], n.viewport.spotFogColor, n.viewport.spotEllipse);
					break;
				}
			}
		}
	}
}

void CNew3D::DrawAmbientFog()
{
	// logic here is still not totally understood
	// some games are setting fog ambient which seems to darken the 2d background layer too when scroll fogging is not set
	// The logic is something like tileGenColour * fogAmbient
	// If fogAmbient = 1.0 it's a no-op. Lower values darken the image
	// Does this work with scroll fog? Well technically scroll fog already takes into account the fog ambient as it darkens the fog colour

	// Let's pick the lowest fog ambient value
	// Check for fog density or a fog start value, otherwise the effect seems to be disabled (lost world)

	float fogAmbient = 1.0f;
	Node* nodePtr = nullptr;

	for (auto& n : m_nodes) {

		// check to see if we have a fog density or fog start
		if (n.viewport.fogParams[3] <= 0.0f && n.viewport.fogParams[4] <= 0.0f) {
			continue;
		}

		if (n.viewport.scrollAtt > 0.0f) {
			continue;		// scroll attenuation indicates scroll fog layer
		}

		if (n.viewport.fogParams[6] < fogAmbient) {
			nodePtr = &n;
			fogAmbient = n.viewport.fogParams[6];
		}
	}

	if (nodePtr) {
		auto& vp = nodePtr->viewport;
		float rgba[] = { 0.0f, 0.0f, 0.0f, 1.0f - fogAmbient };
		glViewport(vp.x, vp.y, vp.width, vp.height);
		m_r3dScrollFog.DrawScrollFog(rgba, 0.0f, 1.0f, vp.spotFogColor, vp.spotEllipse); // we assume spot light is not used
	}
}

bool CNew3D::RenderScene(int priority, bool renderOverlay, Layer layer)
{
	glActiveTexture(GL_TEXTURE0);
	glBindTexture(GL_TEXTURE_2D, m_textureBuffer);

	bool hasOverlay = false;		// (high priority polys)

	for (auto &n : m_nodes) {

		if (n.viewport.priority != priority || n.models.empty()) {
			continue;
		}

		CalcViewport(&n.viewport, std::abs(m_nfPairs[priority].zNear*0.96f), std::abs(m_nfPairs[priority].zFar*1.05f));	// make planes 5% bigger
		glViewport(n.viewport.x, n.viewport.y, n.viewport.width, n.viewport.height);

		m_r3dShader.SetViewportUniforms(&n.viewport);

		for (auto &m : n.models) {

			bool matrixLoaded = false;

			if (m.meshes->empty()) {
				continue;
			}

			for (auto &mesh : *m.meshes) {

				if (mesh.highPriority) {
					hasOverlay = true;
				}

				if (!mesh.Render(layer, m.alpha)) continue;
				if (mesh.highPriority != renderOverlay) continue;

				if (!matrixLoaded) {
					m_r3dShader.SetModelStates(&m);
					matrixLoaded = true;		// do this here to stop loading matrices we don't need. Ie when rendering non transparent etc
				}

				m_r3dShader.SetMeshUniforms(&mesh);
				glDrawArrays(m_primType, mesh.vboOffset, mesh.vertexCount);
			}
		}
	}

	return hasOverlay;
}

// Returns true when no viewport of the given priority layer contains any
// model, i.e. there is nothing to draw for that layer.
bool CNew3D::SkipLayer(int layer)
{
	return std::none_of(m_nodes.begin(), m_nodes.end(), [layer](const auto &node) {
		return node.viewport.priority == layer && !node.models.empty();
	});
}

// Binds the vertex data and shader and puts GL into the state used for
// drawing 3D geometry.
void CNew3D::SetRenderStates()
{
	// geometry source and shader program
	m_vbo.Bind(true);
	glBindVertexArray(m_vao);
	m_r3dShader.SetShader(true);

	glActiveTexture(GL_TEXTURE0);

	// depth testing on, depth writes on
	glEnable(GL_DEPTH_TEST);
	glDepthFunc(GL_LEQUAL);
	glDepthMask(GL_TRUE);

	// face culling is emulated in the shader
	glDisable(GL_CULL_FACE);

	// stencil testing on, all stencil bits writable
	glEnable(GL_STENCIL_TEST);
	glStencilMask(0xFF);

	// blending stays off, but the blend function is preset
	glDisable(GL_BLEND);
	glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
}

// Undoes the bindings made by SetRenderStates and turns the stencil test off.
void CNew3D::DisableRenderStates()
{
	glDisable(GL_STENCIL_TEST);

	m_r3dShader.SetShader(false);

	m_vbo.Bind(false);
	glBindVertexArray(0);
}

// Renders one complete frame:
//  1. resets per-frame state (near/far pairs, line of sight buffers, node list, RAM vertex buffer)
//  2. walks the scene graph from address 0x800000 to build the node/model lists
//  3. uploads the RAM vertices and any ROM vertices not yet in the VBO
//  4. draws ambient fog and scroll fog, then each priority layer 0..3 in up to two passes
//     (normal polys, then high priority "overlay" polys if any were found)
//  5. composites the frame buffers
void CNew3D::RenderFrame(void)
{
	// start each priority layer with the widest possible near/far pair
	for (int i = 0; i < 4; i++) {
		m_nfPairs[i].zNear = -std::numeric_limits<float>::max();
		m_nfPairs[i].zFar  =  std::numeric_limits<float>::max();
	}

	{
		// swap the line of sight buffers under the lock and clear the new back buffer
		std::lock_guard<std::mutex> guard(m_losMutex);
		std::swap(m_losBack, m_losFront);
		for (int i = 0; i < 4; i++) {
			m_losBack->value[i] = 0;
		}
	}

	// release any resources from last frame
	m_polyBufferRam.clear();		// clear dynamic model memory buffer
	m_nodes.clear();				// memory will grow during the object life time, that's fine, no need to shrink to fit
	m_modelMat.Release();			// would hope we wouldn't need this but no harm in checking
	m_nodeAttribs.Reset();

	RenderViewport(0x800000);						// build model structure, starting at the first viewport node

	// RAM (dynamic) vertices live after the ROM region, i.e. at an offset of MAX_ROM_VERTS vertices
	m_vbo.Bind(true);
	m_vbo.BufferSubData(MAX_ROM_VERTS*sizeof(FVertex), m_polyBufferRam.size()*sizeof(FVertex), m_polyBufferRam.data());	// upload all the dynamic data to GPU in one go

	if (!m_polyBufferRom.empty()) {

		// sync rom memory with vbo: only the bytes the VBO does not hold yet need uploading
		int romBytes	= (int)m_polyBufferRom.size() * sizeof(FVertex);
		int vboBytes	= m_vbo.GetSize();
		int size		= romBytes - vboBytes;

		if (size) {
			// check we haven't blown up the memory buffers
			// we will lose rom models for 1 frame if this happens, not the end of the world, as it probably won't ever happen anyway
			if (m_polyBufferRom.size() >= MAX_ROM_VERTS) {
				m_polyBufferRom.clear();
				m_romMap.clear();
				m_vbo.Reset();
			}
			else {
				// append the new vertices, starting at the first one the VBO does not contain
				m_vbo.AppendData(size, &m_polyBufferRom[vboBytes / sizeof(FVertex)]);
			}
		}
	}

	m_r3dFrameBuffers.SetFBO(Layer::colour);		// colour will draw to all 3 buffers. For regular opaque pixels the transparent layers will be essentially masked
	glClear(GL_COLOR_BUFFER_BIT);

	DrawAmbientFog();
	DrawScrollFog();								// fog layer if applicable must be drawn here

	for (int pri = 0; pri <= 3; pri++) {

		if (SkipLayer(pri)) continue;				// nothing to draw in this priority layer

		// pass 0 draws the regular polys, pass 1 the high priority (overlay) polys
		for (int i = 0; i < 2; i++) {

			bool renderOverlay = (i == 1);

			SetRenderStates();

			m_r3dFrameBuffers.SetFBO(Layer::colour);

			// depth and stencil are cleared for every pass
			glClear(GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);

			// colour layer first, with alpha discard enabled in the shader
			m_r3dShader.DiscardAlpha(true);
			m_r3dShader.SetLayer(Layer::colour);
			bool hasOverlay = RenderScene(pri, renderOverlay, Layer::colour);

			// line of sight values are only processed after the regular (non overlay) pass
			if (!renderOverlay) {
				ProcessLos(pri);
			}

			// the two transparency layers are drawn with a strict depth comparison
			glDepthFunc(GL_LESS);

			m_r3dShader.DiscardAlpha(false);

			m_r3dShader.SetLayer(Layer::trans1);
			m_r3dFrameBuffers.SetFBO(Layer::trans1);
			RenderScene(pri, renderOverlay, Layer::trans1);

			m_r3dShader.SetLayer(Layer::trans2);
			m_r3dFrameBuffers.SetFBO(Layer::trans2);
			RenderScene(pri, renderOverlay, Layer::trans2);

			DisableRenderStates();

			if (!hasOverlay) break;								// no high priority polys, so no second pass
		}
	}

	// unbind the layer targets and draw the combined result
	m_r3dFrameBuffers.SetFBO(Layer::none);
	m_r3dFrameBuffers.Draw();
}

// Start of frame hook. This renderer has no work to do here.
void CNew3D::BeginFrame()
{
	// intentionally empty
}

// End of frame hook. This renderer has no work to do here.
void CNew3D::EndFrame()
{
	// intentionally empty
}

/******************************************************************************
Real3D Address Translation

Functions that interpret word-granular Real3D addresses and return pointers.
******************************************************************************/

// Translates 24-bit culling RAM addresses
const UINT32* CNew3D::TranslateCullingAddress(UINT32 addr)
{
	// Maps a word address to a pointer into culling RAM, or returns a null
	// pointer when the address is in neither region.
	const UINT32 wordAddr = addr & 0x00FFFFFF;	// only 24 bits are significant

	// low culling RAM: 0x000000 - 0x0FFFFF
	if (wordAddr < 0x100000) {
		return &m_cullingRAMLo[wordAddr];
	}

	// high culling RAM: 0x800000 - 0x83FFFF
	if ((wordAddr >= 0x800000) && (wordAddr < 0x840000)) {
		return &m_cullingRAMHi[wordAddr & 0x3FFFF];
	}

	return nullptr;
}

// Translates model references
const UINT32* CNew3D::TranslateModelAddress(UINT32 modelAddr)
{
	// Word addresses below 0x100000 refer to polygon RAM, everything else is
	// used as a direct index into VROM.
	const UINT32 wordAddr = modelAddr & 0x00FFFFFF;	// only 24 bits are significant

	return (wordAddr < 0x100000) ? &m_polyRAM[wordAddr] : &m_vrom[wordAddr];
}

// Appends a model to the node currently being built (the last entry of
// m_nodes) and fills it from the model data at modelAddr. Static VROM models
// share their mesh list through m_romMap so they are only cached once.
// Always returns true.
bool CNew3D::DrawModel(UINT32 modelAddr)
{
	const UINT32* const modelData = TranslateModelAddress(modelAddr);

	// construct the new model in place at the end of the current node's list
	auto& nodeModels = m_nodes.back().models;
	nodeModels.emplace_back();
	Model* const model = &nodeModels.back();

	bool needsCaching = true;

	if (IsVROMModel(modelAddr) && !IsDynamicModel((UINT32*)modelData)) {

		// look the model up in the rom cache; a missing entry is created holding a null pointer
		auto& cacheSlot = m_romMap[modelAddr];

		if (cacheSlot) {
			model->meshes = cacheSlot;		// reuse the meshes built earlier
			needsCaching = false;
		}
		else {
			cacheSlot = std::make_shared<std::vector<Mesh>>();
			model->meshes = cacheSlot;		// the rom map and the model share the new mesh list
		}

		model->dynamic = false;
	}
	else {
		// not cacheable: this model owns a private mesh list
		model->meshes = std::make_shared<std::vector<Mesh>>();
	}

	// snapshot the current model matrix
	for (int element = 0; element < 16; element++) {
		model->modelMat[element] = m_modelMat.currentMatrix[element];
	}

	// snapshot the node attributes that apply to this model
	model->alpha			= m_nodeAttribs.currentModelAlpha;
	model->scale			= m_nodeAttribs.currentModelScale;
	model->page				= m_nodeAttribs.currentPage;
	model->textureOffsetX	= m_nodeAttribs.currentTexOffsetX;
	model->textureOffsetY	= m_nodeAttribs.currentTexOffsetY;

	if (needsCaching) {
		CacheModel(model, modelData);
	}

	// unless the node is known to be fully inside the frustum, clip to work out the Z range
	// (clipped values are not stored)
	if (m_nodeAttribs.currentClipStatus != Clip::INSIDE) {
		ClipModel(model);
	}

	return true;
}

/*
	0x00:   x------- -------- -------- --------	Is UF ref
			-x------ -------- -------- --------	Is 3D model
			--x----- -------- -------- --------	Is point
			---x---- -------- -------- --------	Is point ref
			----x--- -------- -------- --------	Is animation
			-----x-- -------- -------- --------	Is billboard
			------x- -------- -------- --------	Child is billboard
			-------x -------- -------- --------	Extra child pointer needed
			-------- xxxxx--- -------- --------	Spare (unknown if used)
			-------- -----xxx xxxxxx-- --------	Node ID
			-------- -------- ------x- --------	Discard 1
			-------- -------- -------x --------	Discard 2

			-------- -------- -------- x-------	Reset matrix
			-------- -------- -------- -x------	Use child pointer
			-------- -------- -------- --x-----	Use sibling pointer
			-------- -------- -------- ---x----	No matrix
			-------- -------- -------- ----x---	Indirect child
			-------- -------- -------- -----x--	Valid color table
			-------- -------- -------- ------xx	Node type(0 = viewport, 1 = root node, 2 = culling node)

	0x01, 0x02 only present on Step 1.5+

	0x01:   xxxxxxxx xxxxxxxx xxxxxxxx xxxxxx--	Model scale (float32) last 2 bits are control words
			-------- -------- -------- ------x- Disable culling
			-------- -------- -------- -------x	Valid model scale

	0x02 :	-------- -------- x------- --------	Texture replace
			-------- -------- -x------ --------	Switch bank
			-------- -------- --xxxxxx x-------	X offset
			-------- -------- -------- -xxxxxxx	Y offset

	0x03 :	xxxxxxxx xxxxx--- -------- --------	Color table address 1
			-------- -----xxx xxxx---- --------	LOD table pointer
			-------- -------- ----xxxx xxxxxxxx	Node matrix

	0x04:   Translation X coordinate
	0x05:   Translation Y coordinate
	0x06:   Translation Z coordinate

	0x07:   xxxx---- -------- -------- -------- Color table address 2
			-----x-- -------- -------- -------- Sibling table
			------x- -------- -------- -------- Point
			-------x -------- -------- -------- Leaf node
			-------- xxxxxxxx xxxxxxxx xxxxxxxx Child pointer

	0x08:   xxxxxxx- -------- -------- -------- Color table address 3
			-------x -------- -------- -------- Null sibling
			-------- xxxxxxxx xxxxxxxx xxxxxxxx Sibling pointer

	0x09:   xxxxxxxx xxxxxxxx -------- -------- Blend radius
			-------- -------- xxxxxxxx xxxxxxxx Culling radius
*/

// Processes the culling node at word address addr.
//
// Order of operations: the sibling link is followed first (before this node's
// own attributes are pushed), then this node's colour table address, model
// scale / culling flag / texture offsets, translation or matrix, frustum clip
// test and LOD alpha are applied, and finally the child link is followed
// (another node, a model, a pointer list, or an entry of a 4-element LOD table).
// The node attribute stack and matrix stack are restored before returning.
//
// Returns early without touching any state when the attribute stack limit is
// reached, the address does not translate, the node is a viewport node, or
// both discard bits are set.
void CNew3D::DescendCullingNode(UINT32 addr)
{
	enum class NodeType { undefined = -1, viewport = 0, rootNode = 1, cullingNode = 2 };

	const UINT32	*node, *lodPtr;
	UINT32			matrixOffset, child1Ptr, sibling2Ptr;
	BBox			bbox;
	UINT16			uCullRadius;
	float			fCullRadius;
	UINT16			uBlendRadius;
	float			fBlendRadius;
	UINT8			lodTablePointer;
	NodeType		nodeType;
	bool			resetMatrix;

	// guard against runaway recursion
	if (m_nodeAttribs.StackLimit()) {
		return;
	}

	node = TranslateCullingAddress(addr);

	if (NULL == node) {
		return;
	}

	// Extract known fields
	// (m_offset is 2 on Step 1.0, where words 0x01 and 0x02 are absent, so later words move down by two)
	nodeType		= (NodeType)(node[0x00] & 3);
	child1Ptr		= node[0x07 - m_offset] & 0x7FFFFFF;	// mask colour table bits
	sibling2Ptr		= node[0x08 - m_offset] & 0x1FFFFFF;	// mask colour table bits
	matrixOffset	= node[0x03 - m_offset] & 0xFFF;
	resetMatrix		= (node[0x0] & 0x80) > 0;
	lodTablePointer = (node[0x03 - m_offset] >> 12) & 0x7F;

	// check our node type
	if (nodeType == NodeType::viewport) {
		return;												// viewport nodes aren't rendered
	}

	// node discard
	if ((0x300 & node[0]) == 0x300) {						// why 2 bits for node discard? Sega rally uses this
		return;
	}

	// parse siblings
	if ((node[0x00] & 0x07) != 0x06) {						// colour table seems to indicate no siblings
		if (!(sibling2Ptr & 0x1000000) && sibling2Ptr) {	// skip null siblings (bit 24 set, or pointer is zero)
			DescendCullingNode(sibling2Ptr);				// no need to mask bit, would already be zero
		}
	}

	// valid colour table: the address is assembled from bits spread over three node words
	if ((node[0x00] & 0x04)) {
		m_colorTableAddr = ((node[0x03 - m_offset] >> 19) << 0) | ((node[0x07 - m_offset] >> 28) << 13) | ((node[0x08 - m_offset] >> 25) << 17);
		m_colorTableAddr &= 0x000FFFFF; // clamp to 4MB (in words) range
	}

	m_nodeAttribs.Push();	// save current attribs

	if (!m_offset) {		// Step 1.5+

		// bit 0: model scale is valid
		if (node[0x01] & 1)
			m_nodeAttribs.currentModelScale = Util::Uint32AsFloat(node[0x01] & ~3);	// mask out control bits

		// bit 1: disable culling
		if (node[0x01] & 2)
			m_nodeAttribs.currentDisableCulling = true;

		// apply texture offsets, else retain current ones
		if ((node[0x02] & 0x8000))	{
			int tx = 32 * ((node[0x02] >> 7) & 0x3F);
			int ty = 32 * (node[0x02] & 0x1F);
			m_nodeAttribs.currentTexOffsetX	= tx;
			m_nodeAttribs.currentTexOffsetY = ty;
			m_nodeAttribs.currentPage = (node[0x02] & 0x4000) >> 14;
		}
	}

	// Apply matrix and translation
	m_modelMat.PushMatrix();

	// apply translation vector (bit 0x10 set: words 0x04-0x06 hold x, y, z)
	if (node[0x00] & 0x10) {
		float x = Util::Uint32AsFloat(node[0x04 - m_offset]);
		float y = Util::Uint32AsFloat(node[0x05 - m_offset]);
		float z = Util::Uint32AsFloat(node[0x06 - m_offset]);
		m_modelMat.Translate(x, y, z);
	}
	// multiply matrix, if specified
	else if (matrixOffset) {
		MultMatrix(matrixOffset,m_modelMat);
	}

	if (resetMatrix) {
		ResetMatrix(m_modelMat);
	}

	// low half of the last word is the culling radius, high half the blend radius (both 16-bit floats)
	uCullRadius = node[9 - m_offset] & 0xFFFF;
	fCullRadius = R3DFloat::GetFloat16(uCullRadius);

	uBlendRadius = node[9 - m_offset] >> 16;
	fBlendRadius = R3DFloat::GetFloat16(uBlendRadius);

	// once a parent is fully inside the frustum its children need no further clip tests
	if (m_nodeAttribs.currentClipStatus != Clip::INSIDE) {

		if (uCullRadius != R3DFloat::Pro16BitMax) {

			// build a box from the culling radius, transform it by the current matrix and test it against the frustum planes
			CalcBox(fCullRadius, bbox);
			TransformBox(m_modelMat, bbox);

			m_nodeAttribs.currentClipStatus = ClipBox(bbox, m_planes);

			if (m_nodeAttribs.currentClipStatus == Clip::INSIDE) {
				CalcBoxExtents(bbox);
			}
		}
		else {
			// maximum radius value: no clip test is done for this node
			m_nodeAttribs.currentClipStatus = Clip::NOT_SET;
		}
	}

	// NOTE(review): divides by |currentMatrix[14]| with no zero check, so the result can be inf/NaN - confirm this is acceptable
	float LODscale = fBlendRadius * m_nodeAttribs.currentModelScale / std::abs(m_modelMat.currentMatrix[14]);

	// NOTE(review): m_LODBlendTable is assigned in RenderViewport from TranslateCullingAddress, which can
	// return NULL; it is dereferenced here without a check - confirm a bad table address cannot reach this point
	const LODFeatureType& lodTableEntry = m_LODBlendTable->table[lodTablePointer];

	if (m_nodeAttribs.currentDisableCulling)
	{
		m_nodeAttribs.currentModelAlpha = 1.0f;
	}
	else
	{
		// alpha is derived from the last LOD entry's blend factor and delete size, clamped to 0..1
		float nodeAlpha = lodTableEntry.lod[3].blendFactor * (LODscale - lodTableEntry.lod[3].deleteSize);
		nodeAlpha = std::clamp(nodeAlpha, 0.0f, 1.0f);
		m_nodeAttribs.currentModelAlpha *= nodeAlpha;	// alpha of each node multiplies by the alpha of its parent
	}

	// skip the child link when the node is outside the frustum or fully transparent
	if (m_nodeAttribs.currentClipStatus != Clip::OUTSIDE && m_nodeAttribs.currentModelAlpha > 0.0f) {

		// Descend down first link
		if ((node[0x00] & 0x08))	// 4-element LOD table
		{
			lodPtr = TranslateCullingAddress(child1Ptr);

			// determine which LOD to use; we do not currently blend between LODs
			// (first of entries 0..2 whose delete size is not above LODscale, otherwise entry 3)
			int modelLOD;
			for (modelLOD = 0; modelLOD < 3; modelLOD++)
			{
				if (LODscale >= lodTableEntry.lod[modelLOD].deleteSize)
					break;
			}

			if (NULL != lodPtr) {
				// bit 29 of word 0x03 selects whether the table entries are culling nodes or models
				if ((node[0x03 - m_offset] & 0x20000000)) {
					DescendCullingNode(lodPtr[modelLOD] & 0xFFFFFF);
				}
				else {
					DrawModel(lodPtr[modelLOD] & 0xFFFFFF);
				}
			}
		}
		else {
			DescendNodePtr(child1Ptr);
		}

	}

	m_modelMat.PopMatrix();

	// Restore the node attributes saved by Push() above (texture offsets, scale, alpha, clip status, ...)
	m_nodeAttribs.Pop();
}

// Follows a node link. The low 24 bits are the target address, bits 24 and 26
// select what the link points at: a culling node, a model or a pointer list.
// Null links and the remaining bit combination are ignored.
void CNew3D::DescendNodePtr(UINT32 nodeAddr)
{
	const UINT32 target = nodeAddr & 0x00FFFFFF;

	if (target == 0) {
		return;		// null link
	}

	const UINT32 linkType = (nodeAddr >> 24) & 0x5;		// pointer type encoded in upper 8 bits

	if (linkType == 0x00) {
		DescendCullingNode(target);
	}
	else if (linkType == 0x01) {
		DrawModel(target);
	}
	else if (linkType == 0x04) {
		DescendPointerList(target);
	}
}

// Walks a list of culling node pointers stored in culling RAM at addr and
// descends into each entry. Bit 24 of an entry stops the walk before the entry
// is processed, bit 25 stops it after.
void CNew3D::DescendPointerList(UINT32 addr)
{
	const UINT32* const listStart = TranslateCullingAddress(addr);

	if (listStart == nullptr) {
		return;
	}

	for (const UINT32* entry = listStart; ; ++entry) {

		if (*entry & 0x01000000) {
			break;	// empty list
		}

		// the upper 8 bits are cleared so the entry is always processed as a culling node
		DescendCullingNode(*entry & 0x00FFFFFF);

		if (*entry & 0x02000000) {
			break;	// list end
		}
	}
}


/******************************************************************************
Matrix Stack
******************************************************************************/

// Macro to generate column-major (OpenGL) index from y,x subscripts
#define CMINDEX(y,x)	((x)*4+(y))	// arguments are parenthesised so expression arguments such as CMINDEX(r, c+1) index correctly

/*
* MultMatrix():
*
* Multiplies the matrix stack by the specified Real3D matrix. The matrix
* index is a 12-bit number specifying a matrix number relative to the base.
* The base matrix MUST be set up before calling this function.
*/
void CNew3D::MultMatrix(UINT32 matrixOffset, Mat4& mat)
{
	// No matrix base available (LA Machineguns): leave mat untouched.
	// This must be tested before the source pointer is formed, because
	// offsetting a null pointer is undefined behaviour even if the result is
	// never dereferenced.
	if (m_matrixBasePtr == nullptr)
		return;

	// each Real3D matrix occupies 12 consecutive floats
	const float	*src = &m_matrixBasePtr[matrixOffset * 12];
	GLfloat		m[4*4];

	// words 3..11 fill the upper-left 3x3 row by row, words 0..2 fill the
	// fourth column, and the bottom row is fixed at 0 0 0 1
	m[CMINDEX(0, 0)] = src[3];
	m[CMINDEX(0, 1)] = src[4];
	m[CMINDEX(0, 2)] = src[5];
	m[CMINDEX(0, 3)] = src[0];
	m[CMINDEX(1, 0)] = src[6];
	m[CMINDEX(1, 1)] = src[7];
	m[CMINDEX(1, 2)] = src[8];
	m[CMINDEX(1, 3)] = src[1];
	m[CMINDEX(2, 0)] = src[9];
	m[CMINDEX(2, 1)] = src[10];
	m[CMINDEX(2, 2)] = src[11];
	m[CMINDEX(2, 3)] = src[2];
	m[CMINDEX(3, 0)] = 0.0;
	m[CMINDEX(3, 1)] = 0.0;
	m[CMINDEX(3, 2)] = 0.0;
	m[CMINDEX(3, 3)] = 1.0;

	mat.MultMatrix(m);
}

/*
* InitMatrixStack():
*
* Initializes the modelview (model space -> view space) matrix stack and
* Real3D coordinate system. These are the last transforms to be applied (and
* the first to be defined on the stack) before projection.
*
* Model 3 games tend to define the following unusual base matrix:
*
*		0	0	-1	0
*		1	0	0	0
*		0	-1	0	0
*		0	0	0	1
*
* When this is multiplied by a column vector, the output is:
*
*		-Z
*		X
*		-Y
*		1
*
* My theory is that the Real3D GPU accepts vectors in Z,X,Y order. The games
* store everything as X,Y,Z and perform the translation at the end. The Real3D
* also has Y and Z coordinates opposite of the OpenGL convention. This
* function inserts a compensating matrix to undo these things.
*
* NOTE: This function assumes we are in GL_MODELVIEW matrix mode.
*/

void CNew3D::InitMatrixStack(UINT32 matrixBaseAddr, Mat4& mat)
{
	// Compensating matrix, stored column-major (one column per line below).
	// It converts vectors back from the Model 3 Z,X,Y ordering and into
	// OpenGL viewspace (-Y,-Z). Written out in rows it reads:
	//
	//		 0	 1	 0	 0
	//		 0	 0	-1	 0
	//		-1	 0	 0	 0
	//		 0	 0	 0	 1
	GLfloat compensate[4 * 4] = {
		 0.0f,  0.0f, -1.0f,  0.0f,		// column 0
		 1.0f,  0.0f,  0.0f,  0.0f,		// column 1
		 0.0f, -1.0f,  0.0f,  0.0f,		// column 2
		 0.0f,  0.0f,  0.0f,  1.0f		// column 3
	};

	mat.LoadMatrix(compensate);

	// Remember where the matrix table lives, then apply matrix #0 (coordinate system matrix)
	m_matrixBasePtr = (float *)TranslateCullingAddress(matrixBaseAddr);
	MultMatrix(0, mat);
}

// what this does is to set the rotation back to zero, whilst keeping the position and scale of the current matrix
void CNew3D::ResetMatrix(Mat4& mat)
{
	// Work on a copy of the current matrix: build a matrix that undoes its
	// rotation, then multiply it onto the stack.
	float inv[16];
	memcpy(inv, mat.currentMatrix, sizeof(inv));

	// transposing the upper 3x3 inverts the rotation, so the product below cancels it out
	std::swap(inv[1], inv[4]);
	std::swap(inv[2], inv[8]);
	std::swap(inv[6], inv[9]);

	// no translation in the correction matrix
	inv[12] = 0;
	inv[13] = 0;
	inv[14] = 0;
	inv[15] = 1;

	// normalise each of the three columns, this removes the scaling, otherwise we'd apply it twice
	for (int col = 0; col < 3; col++) {
		float* c = &inv[col * 4];
		const float len = std::sqrt((c[0] * c[0]) + (c[1] * c[1]) + (c[2] * c[2]));

		c[0] /= len;
		c[1] /= len;
		c[2] /= len;
	}

	mat.MultMatrix(inv);
}

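// Walks the viewport list starting at addr: builds a node (viewport state plus models) for each enabled viewport, then continues with the next viewport in the list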
void CNew3D::RenderViewport(UINT32 addr)
{
	// Processes the viewport node at word address addr. If the viewport is enabled
	// (bit 0x20 of word 0 clear) a Node is appended to m_nodes, its Viewport is filled
	// in from the viewport words and the scene graph below it is traversed, which
	// fills the node's model list. The function then recurses into the next viewport
	// given by word 1, unless that word equals 0x01000000.
	// Nothing is drawn here; drawing happens later from m_nodes.

	static const GLfloat	color[8][3] =
	{											// RGB1 color translation
		{ 0.0f, 0.0f, 0.0f },	// off
		{ 0.0f, 0.0f, 1.0f },	// blue
		{ 0.0f, 1.0f, 0.0f },	// green
		{ 0.0f, 1.0f, 1.0f },	// cyan
		{ 1.0f, 0.0f, 0.0f }, 	// red
		{ 1.0f, 0.0f, 1.0f },	// purple
		{ 1.0f, 1.0f, 0.0f },	// yellow
		{ 1.0f, 1.0f, 1.0f }	// white
	};

	// null link
	if ((addr & 0x00FFFFFF) == 0) {
		return;
	}

	// Translate address and obtain pointer
	const uint32_t *vpnode = TranslateCullingAddress(addr);

	if (NULL == vpnode) {
		return;
	}

	if (!(vpnode[0] & 0x20)) {	// only if viewport enabled

		// create node object
		m_nodes.emplace_back(Node());
		m_nodes.back().models.reserve(2048);				// create space for models

		// get pointer to its viewport
		Viewport *vp = &m_nodes.back().viewport;

		vp->priority	= (vpnode[0] >> 3) & 0x3;
		vp->select		= (vpnode[0] >> 8) & 0x3;
		vp->number		= (vpnode[0] >> 10);
		m_currentPriority = vp->priority;

		// Fetch viewport parameters, rounded to the nearest integer (TO-DO: would rounding make a difference?)
		vp->vpX			= (int)(((vpnode[0x1A] & 0xFFFF) * (float)(1.0 / 16.0)) + 0.5f);		// viewport X (12.4 fixed point)
		vp->vpY			= (int)(((vpnode[0x1A] >> 16) * (float)(1.0 / 16.0)) + 0.5f);			// viewport Y (12.4)
		vp->vpWidth		= (int)(((vpnode[0x14] & 0xFFFF) * (float)(1.0 / 4.0)) + 0.5f);		// width (14.2)
		vp->vpHeight	= (int)(((vpnode[0x14] >> 16) * (float)(1.0 / 4.0)) + 0.5f);			// height (14.2)

		uint32_t matrixBase	= vpnode[0x16] & 0xFFFFFF;							// matrix base address

		// LOD blend table used by DescendCullingNode. NOTE(review): this is NULL if the address does not translate
		m_LODBlendTable = (LODBlendTable*)TranslateCullingAddress(vpnode[0x17] & 0xFFFFFF);

		/*
		vp->angle_left		= -atan2f(Util::Uint32AsFloat(vpnode[12]),  Util::Uint32AsFloat(vpnode[13]));	// These values work out as the normals for the clipping planes.
		vp->angle_right		=  atan2f(Util::Uint32AsFloat(vpnode[16]), -Util::Uint32AsFloat(vpnode[17]));	// Sometimes these values (dirt devils,lost world) are totally wrong
		vp->angle_top		=  atan2f(Util::Uint32AsFloat(vpnode[14]),  Util::Uint32AsFloat(vpnode[15]));	// and don't work for the frustum values exactly.
		vp->angle_bottom	= -atan2f(Util::Uint32AsFloat(vpnode[18]), -Util::Uint32AsFloat(vpnode[19]));	// Perhaps they are just used for culling and not rendering.
		*/

		float cv = Util::Uint32AsFloat(vpnode[0x8]);	// 1/(left-right)
		float cw = Util::Uint32AsFloat(vpnode[0x9]);	// 1/(top-bottom)
		float io = Util::Uint32AsFloat(vpnode[0xa]);	// top / bottom (ratio) - ish
		float jo = Util::Uint32AsFloat(vpnode[0xb]);	// left / right (ratio)

		// frustum edges derived from the four values above
		vp->angle_left		= (0.0f - jo) / cv;
		vp->angle_right		= (1.0f - jo) / cv;
		vp->angle_bottom	= -(1.0f - io)/ cw;
		vp->angle_top		= -(0.0f - io)/ cw;

		// calculate the frustum shape, near/far pair are dummy values
		CalcViewport(vp, 1.f, 1000.f);

		// calculate frustum planes
		CalcFrustumPlanes(m_planes, vp->projectionMatrix);	// we need to calc a 'projection matrix' to get the correct frustum planes for clipping

		// Lighting (note that sun vector points toward sun -- away from vertex)
		vp->lightingParams[0] =  Util::Uint32AsFloat(vpnode[0x05]);							// sun X
		vp->lightingParams[1] = -Util::Uint32AsFloat(vpnode[0x06]);							// sun Y (- to convert to ogl coordinate system)
		vp->lightingParams[2] = -Util::Uint32AsFloat(vpnode[0x04]);							// sun Z (- to convert to ogl coordinate system)
		vp->lightingParams[3] = std::max(0.f, std::min(Util::Uint32AsFloat(vpnode[0x07]), 1.0f));	// sun intensity (clamp to 0-1)
		vp->lightingParams[4] = (float)((vpnode[0x24] >> 8) & 0xFF) * (float)(1.0 / 255.0);	// ambient intensity
		vp->lightingParams[5] = 0.0f;	// reserved

		vp->sunClamp		= m_sunClamp;
		vp->intensityClamp	= (m_step == 0x10);		// just step 1.0 ?
		vp->hardwareStep	= m_step;

		// Spotlight
		int spotColorIdx = (vpnode[0x20] >> 11) & 7;									// spotlight color index
		int spotFogColorIdx = (vpnode[0x20] >> 8) & 7;									// spotlight on fog color index
		vp->spotEllipse[0] = (float)(INT16)(vpnode[0x1E] & 0xFFFF) * (float)(1.0 / 8.0);// spotlight X position (13.3 fixed point)
		vp->spotEllipse[1] = (float)(INT16)(vpnode[0x1D] & 0xFFFF) * (float)(1.0 / 8.0);// spotlight Y
		vp->spotEllipse[2] = (float)((vpnode[0x1E] >> 16) & 0xFFFF);					// spotlight X size (16-bit)
		vp->spotEllipse[3] = (float)((vpnode[0x1D] >> 16) & 0xFFFF);					// spotlight Y size

		vp->spotRange[0] = 1.0f / Util::Uint32AsFloat(vpnode[0x21]);					// spotlight start (reciprocal of the stored value)
		vp->spotRange[1] = Util::Uint32AsFloat(vpnode[0x1F]);							// spotlight extent

		vp->spotColor[0] = color[spotColorIdx][0];										// spotlight color
		vp->spotColor[1] = color[spotColorIdx][1];
		vp->spotColor[2] = color[spotColorIdx][2];

		vp->spotFogColor[0] = color[spotFogColorIdx][0];								// spotlight color on fog
		vp->spotFogColor[1] = color[spotFogColorIdx][1];
		vp->spotFogColor[2] = color[spotFogColorIdx][2];

		// spotlight is specified in terms of physical resolution
		vp->spotEllipse[1] = 384.0f - vp->spotEllipse[1];								// flip Y position

		// Avoid division by zero
		vp->spotEllipse[2] = std::max(1.0f, vp->spotEllipse[2]);
		vp->spotEllipse[3] = std::max(1.0f, vp->spotEllipse[3]);

		// convert the stored sizes into 2047 / size, rounded
		vp->spotEllipse[2] = std::roundf(2047.0f / vp->spotEllipse[2]);
		vp->spotEllipse[3] = std::roundf(2047.0f / vp->spotEllipse[3]);

		// Scale the spotlight to the OpenGL viewport
		vp->spotEllipse[0] = vp->spotEllipse[0] * m_xRatio + (float)m_xOffs;
		vp->spotEllipse[1] = vp->spotEllipse[1] * m_yRatio + (float)m_yOffs;
		vp->spotEllipse[2] *= m_xRatio;
		vp->spotEllipse[3] *= m_yRatio;

		// Line of sight position (two 16-bit halves divided by 16 and rounded to the nearest integer)
		vp->losPosX = (int)(((vpnode[0x1c] & 0xFFFF) / 16.0f) + 0.5f);					// x position
		vp->losPosY = (int)(((vpnode[0x1c] >> 16) / 16.0f) + 0.5f);						// y position 0 starts from the top

		// Fog
		vp->fogParams[0] = (float)((vpnode[0x22] >> 16) & 0xFF) * (float)(1.0 / 255.0);	// fog color R
		vp->fogParams[1] = (float)((vpnode[0x22] >> 8) & 0xFF) * (float)(1.0 / 255.0);	// fog color G
		vp->fogParams[2] = (float)((vpnode[0x22] >> 0) & 0xFF) * (float)(1.0 / 255.0);	// fog color B
		vp->fogParams[3] = std::abs(Util::Uint32AsFloat(vpnode[0x23]));					// fog density	- ocean hunter uses negative values, but looks the same
		vp->fogParams[4] = (float)(INT16)(vpnode[0x25] & 0xFFFF)* (float)(1.0 / 255.0);	// fog start

		// Avoid Infinite and NaN values for Star Wars Trilogy
		// (all seven entries are zeroed here; entries 5 and 6 are then set again below)
		if (std::isinf(vp->fogParams[3]) || std::isnan(vp->fogParams[3])) {
			for (int i = 0; i < 7; i++) vp->fogParams[i] = 0.0f;
		}

		vp->fogParams[5] = (float)((vpnode[0x24] >> 16) & 0xFF) * (float)(1.0 / 255.0);	// fog attenuation
		vp->fogParams[6] = (float)((vpnode[0x25] >> 16) & 0xFF) * (float)(1.0 / 255.0);	// fog ambient

		vp->scrollFog = (float)(vpnode[0x20] & 0xFF) * (float)(1.0 / 255.0);				// scroll fog
		vp->scrollAtt = (float)(vpnode[0x24] & 0xFF) * (float)(1.0 / 255.0);				// scroll attenuation

		// Reset the node attributes (texture offsets etc) before proceeding
		m_nodeAttribs.Reset();

		// Set up coordinate system and base matrix
		InitMatrixStack(matrixBase, m_modelMat);

		// Descend down the node link. Need to start with a culling node because that defines our culling radius.
		// (links whose type bits select a model or a pointer list are not followed)
		auto childptr = vpnode[0x02];
		if (((childptr >> 24) & 0x5) == 0) {
			DescendNodePtr(vpnode[0x02]);
		}
	}

	// render next viewport (0x01000000 marks the end of the viewport list)
	if (vpnode[0x01] != 0x01000000) {
		RenderViewport(vpnode[0x01]);
	}
}

void CNew3D::CopyVertexData(const R3DPoly& r3dPoly, std::vector<FVertex>& vertexArray)
{
	// both lemans 24 and dirt devils are rendering some totally transparent polys as the first object in each viewport
	// in dirt devils it's parallel to the camera so is completely invisible, but breaks our depth calculation
	// in lemans 24 its a sort of diamond shape, but never leaves a hole in the transparent geometry so must be being skipped by the h/w
	if (r3dPoly.faceColour[3] == 0) {
		return;
	}

	if (m_numPolyVerts==4) {
		if (r3dPoly.number == 4) {
			vertexArray.emplace_back(r3dPoly, 0);		// construct directly inside container without copy
			vertexArray.emplace_back(r3dPoly, 1);
			vertexArray.emplace_back(r3dPoly, 2);
			vertexArray.emplace_back(r3dPoly, 3);

			// check for identical points (ie forced triangle) and replace with average point
			// if we don't do this our quad code falls apart
			FVertex* v = (&vertexArray.back()) - 3;

			for (int i = 0; i < 4; i++) {

				int next1 = (i + 1) % 4;
				int next2 = (i + 2) % 4;

				if (FVertex::Equal(v[i], v[next1])) {
					FVertex::Average(v[next1], v[next2], v[next1]);
					break;
				}
			}
		}
		else {
			vertexArray.emplace_back(r3dPoly, 0);
			vertexArray.emplace_back(r3dPoly, 1);
			vertexArray.emplace_back(r3dPoly, 2);
			vertexArray.emplace_back(r3dPoly, 0, 2);	// last point is an average of 0 and 2
		}
	}
	else {
		vertexArray.emplace_back(r3dPoly, 0);
		vertexArray.emplace_back(r3dPoly, 1);
		vertexArray.emplace_back(r3dPoly, 2);

		if (r3dPoly.number == 4) {
			vertexArray.emplace_back(r3dPoly, 0);
			vertexArray.emplace_back(r3dPoly, 2);
			vertexArray.emplace_back(r3dPoly, 3);
		}
	}
}

// Converts raw 16-bit texture coordinates into floating point coordinates: each input is
// multiplied by uvScale and then divided by the texture dimension (width for U, height for V).
// Results are written to uOut / vOut.
void CNew3D::GetCoordinates(int width, int height, UINT16 uIn, UINT16 vIn, float uvScale, float& uOut, float& vOut)
{
	const float scaledU = uIn * uvScale;
	const float scaledV = vIn * uvScale;

	uOut = scaledU / width;
	vOut = scaledV / height;
}

// Returns the texture format to use for a polygon.
// With contour enabled, formats 1-4 are remapped to 8-11; every other combination
// returns originalFormat unchanged.
int CNew3D::GetTexFormat(int originalFormat, bool contour)
{
	// these formats are identical to 1-4, except they lose the 4 bit alpha part when contour is enabled
	const bool hasContourVariant = (originalFormat >= 1) && (originalFormat <= 4);

	if (contour && hasContourVariant) {
		return originalFormat + 7;
	}

	return originalFormat;	// the same
}

// Copies the render state described by polygon header ph into currentMesh.
// Texture related fields (format, position, size, wrap modes, micro texture) are only
// written when the header has texturing enabled.
void CNew3D::SetMeshValues(SortingMesh *currentMesh, PolyHeader &ph)
{
	SortingMesh& mesh = *currentMesh;

	// general attributes
	mesh.textured		= ph.TexEnabled();
	mesh.alphaTest		= ph.AlphaTest();
	mesh.textureAlpha	= ph.TextureAlpha();
	mesh.polyAlpha		= ph.PolyAlpha();
	mesh.lighting		= ph.LightEnabled();
	mesh.fixedShading	= ph.FixedShading() && !ph.SmoothShading();
	mesh.highPriority	= ph.HighPriority();
	mesh.transLSelect	= ph.TranslucencyPatternSelect();
	mesh.layered		= ph.Layered();
	mesh.specular		= ph.SpecularEnabled();
	mesh.shininess		= ph.Shininess();
	mesh.specularValue	= ph.SpecularValue();
	mesh.fogIntensity	= ph.LightModifier();
	mesh.translatorMap	= ph.TranslatorMap();
	mesh.noLosReturn	= ph.NoLosReturn();

	// nothing more to do for untextured polygons
	if (!mesh.textured) {
		return;
	}

	mesh.format = GetTexFormat(ph.TexFormat(), ph.AlphaTest());

	if (mesh.format == 7) {
		mesh.alphaTest = false;	// alpha test is a 1 bit test, this format needs a lower threshold, since it has 16 levels of transparency
	}

	// texture location and size
	mesh.x				= ph.X();
	mesh.y				= ph.Y();
	mesh.width			= ph.TexWidth();
	mesh.height			= ph.TexHeight();
	mesh.microTexture	= ph.MicroTexture();
	mesh.inverted		= ph.TranslatorMapOffset() == 2;

	// wrap modes
	bool smoothU = ph.TexSmoothU();
	bool smoothV = ph.TexSmoothV();

	if (ph.AlphaTest()) {
		// smooth wrap makes no sense for alpha tested polys with pixel dilate
		smoothU = false;
		smoothV = false;
	}

	// mirrored/repeating axis, each with a smooth and a clamped variant
	const auto wrapFor = [](bool mirrored, bool smooth) -> Mesh::TexWrapMode {
		if (mirrored) {
			return smooth ? Mesh::TexWrapMode::mirror : Mesh::TexWrapMode::mirrorClamp;
		}
		return smooth ? Mesh::TexWrapMode::repeat : Mesh::TexWrapMode::repeatClamp;
	};

	mesh.wrapModeU = wrapFor(ph.TexUMirror(), smoothU);
	mesh.wrapModeV = wrapFor(ph.TexVMirror(), smoothV);

	// micro texture
	if (mesh.microTexture) {

		static const float microTexScale[] = { 2.f, 4.f, 16.f, 256.f };

		mesh.microTextureID		= ph.MicroTextureID();
		mesh.microTextureScale	= microTexScale[ph.MicroTextureMinLOD()];
	}
}

// Decodes the polygon list starting at data and caches it in model m.
//
// Polygons are grouped by PolyHeader::Hash(); each distinct hash becomes one mesh. Once the
// whole list has been decoded, the vertices of every mesh are appended to m_polyBufferRam
// (m->dynamic) or m_polyBufferRom (otherwise) and the mesh is pushed onto m->meshes with
// its vboOffset / vertexCount filled in.
//
// m:    model receiving the meshes; m->dynamic selects the destination vertex buffer
// data: first polygon header of the model; a null pointer is ignored
//
// Decoding stops at the first header whose word 6 is zero, or when PolyHeader::NextPoly()
// returns false. m_prev / m_prevTexCoords are updated after every polygon so that the next
// polygon can reuse vertices.
void CNew3D::CacheModel(Model *m, const UINT32 *data)
{
	if (data == nullptr)
		return;

	// Raw (un-normalised) texture coordinates of the current polygon. Zero-initialised because all
	// four slots are copied into m_prevTexCoords after each polygon, even when it only has three vertices.
	UINT16			texCoords[4][2] = {};
	PolyHeader		ph;
	UINT64			lastHash	= -1;
	SortingMesh*	currentMesh = nullptr;

	// polygons are bucketed by attribute hash; every bucket ends up as one mesh
	std::unordered_map<UINT64, SortingMesh> sMap;

	ph = data;
	int numTriangles = ph.NumTrianglesTotal();

	// Cache all polygons
	do {

		R3DPoly		p;					// current polygon
		float		uvScale;

		if (ph.header[6] == 0) {
			break;
		}

		// create a hash value based on poly attributes -todo add more attributes
		auto hash = ph.Hash();

		// The null check covers a first polygon whose hash happens to equal the lastHash sentinel
		if (hash != lastHash || currentMesh == nullptr) {

			auto found = sMap.find(hash);

			if (found == sMap.end()) {

				currentMesh = &sMap.insert({hash, SortingMesh()}).first->second;

				//make space for our vertices
				currentMesh->verts.reserve(numTriangles * 3);

				//set mesh values
				SetMeshValues(currentMesh, ph);
			}
			else {
				currentMesh = &found->second;
			}
		}

		// Obtain basic polygon parameters
		p.number	= ph.NumVerts();
		uvScale		= ph.UVScale();

		ph.FaceNormal(p.faceNormal);

		// Fetch reused vertices according to bitfield, then new verts
		int j = 0;
		for (int i = 0; i < 4; i++)		// up to 4 reused vertices
		{
			if (ph.SharedVertex(i))
			{
				p.v[j] = m_prev[i];

				texCoords[j][0] = m_prevTexCoords[i][0];
				texCoords[j][1] = m_prevTexCoords[i][1];

				//check if we need to recalc tex coords - will only happen if tex tiles are different + sharing vertices
				if (hash != lastHash) {
					if (currentMesh->textured) {
						GetCoordinates(currentMesh->width, currentMesh->height, texCoords[j][0], texCoords[j][1], uvScale, p.v[j].texcoords[0], p.v[j].texcoords[1]);
					}
				}

				j++;
			}
		}

		lastHash = hash;

		// copy face attributes

		if (!ph.PolyColor()) {
			// colour comes from the colour table in polygon RAM
			int colorIdx = ph.ColorIndex();
			p.faceColour[2] = (m_polyRAM[m_colorTableAddr + colorIdx] & 0xFF);
			p.faceColour[1] = ((m_polyRAM[m_colorTableAddr + colorIdx] >> 8) & 0xFF);
			p.faceColour[0] = ((m_polyRAM[m_colorTableAddr + colorIdx] >> 16) & 0xFF);
		}
		else {
			// colour is stored directly in header word 4
			p.faceColour[0] = ((ph.header[4] >> 24));
			p.faceColour[1] = ((ph.header[4] >> 16) & 0xFF);
			p.faceColour[2] = ((ph.header[4] >> 8) & 0xFF);
		}

		p.faceColour[3] = ph.Transparency();

		if (ph.Discard1() && !ph.Discard2()) {
			p.faceColour[3] /= 2;
		}

		// if we have flat shading, we can't re-use normals from shared vertices
		for (int i = 0; i < p.number && !ph.SmoothShading(); i++) {
			p.v[i].normal[0] = p.faceNormal[0];
			p.v[i].normal[1] = p.faceNormal[1];
			p.v[i].normal[2] = p.faceNormal[2];
		}

		UINT32* vData = ph.StartOfData();	// vertex data starts here

		// remaining vertices are new and defined here
		for (; j < p.number; j++)
		{
			// Fetch vertices (4 words per vertex: x, y, z, texture coordinates)
			UINT32 ix = vData[0];
			UINT32 iy = vData[1];
			UINT32 iz = vData[2];
			UINT32 it = vData[3];

			// Decode vertices: the upper 24 bits of each word hold the signed coordinate
			p.v[j].pos[0] = (((INT32)ix) >> 8) * m_vertexFactor;
			p.v[j].pos[1] = (((INT32)iy) >> 8) * m_vertexFactor;
			p.v[j].pos[2] = (((INT32)iz) >> 8) * m_vertexFactor;
			p.v[j].pos[3] = 1.0f;

			// Per vertex normals, taken from the low byte of each coordinate word
			if (ph.SmoothShading()) {
				p.v[j].normal[0] = BYTE_TO_FLOAT((INT8)(ix & 0xFF));
				p.v[j].normal[1] = BYTE_TO_FLOAT((INT8)(iy & 0xFF));
				p.v[j].normal[2] = BYTE_TO_FLOAT((INT8)(iz & 0xFF));
			}

			if (ph.FixedShading() && !ph.SmoothShading()) {			// fixed shading seems to be disabled if actual normals are set

				//==========
				float shade;
				//==========

				if (!m_shadeIsSigned) {
					shade = (ix & 0xFF) * (float)(1.0 / 255.0);
				}
				else {
					shade = BYTE_TO_FLOAT((INT8)(ix & 0xFF));
				}

				p.v[j].fixedShade = shade;
			}

			float texU = 0;
			float texV = 0;

			// tex coords
			if (currentMesh->textured) {
				GetCoordinates(currentMesh->width, currentMesh->height, (UINT16)(it >> 16), (UINT16)(it & 0xFFFF), uvScale, texU, texV);
			}

			p.v[j].texcoords[0] = texU;
			p.v[j].texcoords[1] = texV;

			//cache un-normalised tex coordinates
			texCoords[j][0] = (UINT16)(it >> 16);
			texCoords[j][1] = (UINT16)(it & 0xFFFF);

			vData += 4;
		}

		// check if we need to double up vertices for two sided lighting
		if (ph.DoubleSided() && !ph.Discard()) {

			R3DPoly tempP = p;

			// flip normals
			V3::inverse(tempP.faceNormal);

			for (int i2 = 0; i2 < tempP.number; i2++) {
				V3::inverse(tempP.v[i2].normal);
			}

			CopyVertexData(tempP, currentMesh->verts);
		}

		// Copy this polygon into the model buffer
		if (!ph.Discard()) {
			CopyVertexData(p, currentMesh->verts);
		}

		// Copy current vertices into previous vertex array
		for (int i = 0; i < 4; i++) {
			m_prev[i] = p.v[i];
			m_prevTexCoords[i][0] = texCoords[i][0];
			m_prevTexCoords[i][1] = texCoords[i][1];
		}

	} while (ph.NextPoly());

	//sorted the data, now copy to main data structures

	// we know how many meshes we have to reserve appropriate space
	m->meshes->reserve(sMap.size());

	// dynamic models go to the RAM buffer, whose VBO offsets start after the MAX_ROM_VERTS ROM vertices
	std::vector<FVertex>&	polyBuffer	= m->dynamic ? m_polyBufferRam : m_polyBufferRom;
	const int				vboBase		= m->dynamic ? MAX_ROM_VERTS : 0;

	for (auto& it : sMap) {

		SortingMesh& mesh = it.second;

		// calculate VBO values for current mesh
		mesh.vboOffset		= (int)polyBuffer.size() + vboBase;
		mesh.vertexCount	= (int)mesh.verts.size();

		// copy poly data to main buffer
		polyBuffer.insert(polyBuffer.end(), mesh.verts.begin(), mesh.verts.end());

		//copy the temp mesh into the model structure
		//this will lose the associated vertex data, which is now copied to the main buffer anyway
		m->meshes->push_back(mesh);
	}
}

// Walks the polygon list starting at data and reports whether the model has to be treated
// as dynamic: true as soon as a polygon header has bit 1 of word 1 clear.
// Returns false for a null pointer, or when the list ends without such a polygon.
bool CNew3D::IsDynamicModel(UINT32 *data)
{
	if (!data) {
		return false;
	}

	PolyHeader poly(data);

	for (;;) {

		const bool usesPalette = (poly.header[1] & 2) == 0;		// model has rgb colour palette

		if (usesPalette) {
			return true;
		}

		// a zero in word 6 ends the list (checked after the palette test)
		if (poly.header[6] == 0) {
			return false;
		}

		if (!poly.NextPoly()) {
			return false;
		}
	}
}

// Returns true when modelAddr is at or above 0x100000, false for lower addresses.
bool CNew3D::IsVROMModel(UINT32 modelAddr)
{
	const UINT32 firstVROMAddr = 0x100000;

	return !(modelAddr < firstVROMAddr);
}

// Fills p[0..4] with the frustum planes derived from a 16 element matrix.
// Planes 0-3 (left, right, bottom, top) combine elements 3/7/11/15 with elements
// e/e+4/e+8/e+12 (e = 0 for left/right, 1 for bottom/top) and are normalised.
// Plane 4 (front) is the fixed plane (0, 0, -1, 0) and does not depend on the matrix.
void CNew3D::CalcFrustumPlanes(Plane p[5], const float* matrix)
{
	// add == true  : matrix[3 + 4k] + matrix[element + 4k]
	// add == false : matrix[3 + 4k] - matrix[element + 4k]
	const auto extract = [matrix](Plane& plane, int element, bool add) {
		if (add) {
			plane.a = matrix[3] + matrix[element];
			plane.b = matrix[7] + matrix[element + 4];
			plane.c = matrix[11] + matrix[element + 8];
			plane.d = matrix[15] + matrix[element + 12];
		}
		else {
			plane.a = matrix[3] - matrix[element];
			plane.b = matrix[7] - matrix[element + 4];
			plane.c = matrix[11] - matrix[element + 8];
			plane.d = matrix[15] - matrix[element + 12];
		}
		plane.Normalise();
	};

	extract(p[0], 0, true);		// Left Plane
	extract(p[1], 0, false);	// Right Plane
	extract(p[2], 1, true);		// Bottom Plane
	extract(p[3], 1, false);	// Top Plane

	// Front Plane (not normalised, already unit length)
	Plane& front = p[4];
	front.a = 0.f;
	front.b = 0.f;
	front.c = -1.f;
	front.d = 0.f;
}

// Initialises box with the 8 corners of an axis aligned cube centred on the origin,
// with each coordinate at +/- distance and w set to 1.
void CNew3D::CalcBox(float distance, BBox& box)
{
	// extent[0] = negative side, extent[1] = positive side
	const float extent[2] = { -distance, distance };

	// which side each corner lies on, per axis (x, y, z)
	static const unsigned char corners[8][3] = {
		{ 0, 0, 1 },	//bottom left front
		{ 0, 0, 0 },	//bottom left back
		{ 1, 0, 0 },	//bottom right back
		{ 1, 0, 1 },	//bottom right front
		{ 0, 1, 1 },	//top left front
		{ 0, 1, 0 },	//top left back
		{ 1, 1, 0 },	//top right back
		{ 1, 1, 1 },	//top right front
	};

	for (int c = 0; c < 8; c++) {
		box.points[c][0] = extent[corners[c][0]];
		box.points[c][1] = extent[corners[c][1]];
		box.points[c][2] = extent[corners[c][2]];
		box.points[c][3] = 1.f;
	}
}

// Multiplies the 4 component vector in by the 16 element matrix and stores the result in out:
// out[i] = in[0]*matrix[i] + in[1]*matrix[4+i] + in[2]*matrix[8+i] + in[3]*matrix[12+i].
// out is written while in is still being read, so the two should not be the same array.
void CNew3D::MultVec(const float matrix[16], const float in[4], float out[4])
{
	for (int comp = 0; comp < 4; comp++) {

		const float* m = matrix + comp;		// elements comp, comp+4, comp+8, comp+12

		out[comp] =
			in[0] * m[0] +
			in[1] * m[4] +
			in[2] * m[8] +
			in[3] * m[12];
	}
}

// Transforms the 8 corners of box in place by matrix m (via MultVec).
// Only x, y and z of each corner are overwritten; the stored w component is left untouched.
void CNew3D::TransformBox(const float *m, BBox& box)
{
	for (int corner = 0; corner < 8; corner++) {

		float transformed[4];
		MultVec(m, box.points[corner], transformed);

		for (int axis = 0; axis < 3; axis++) {
			box.points[corner][axis] = transformed[axis];
		}
	}
}

// Classifies box against the 5 planes.
//
// Returns Clip::INSIDE when all 8 corners are on the non-negative side of every plane,
// Clip::OUTSIDE when no corner is inside and all corners lie on the negative side of at
// least one plane, and Clip::INTERCEPT in every other case.
Clip CNew3D::ClipBox(const BBox& box, Plane planes[5])
{
	// pass 1: count the corners that are inside all 5 planes
	int insideCorners = 0;

	for (int corner = 0; corner < 8; corner++) {

		bool insideAll = true;

		for (int pl = 0; pl < 5; pl++) {
			if (!(planes[pl].DistanceToPoint(box.points[corner]) >= 0.f)) {
				insideAll = false;
				break;
			}
		}

		if (insideAll) {
			insideCorners++;
		}
	}

	if (insideCorners == 8) {
		return Clip::INSIDE;
	}

	if (insideCorners > 0) {
		return Clip::INTERCEPT;
	}

	// pass 2: no corner is inside the frustum
	// if every corner is behind one and the same plane the box cannot be visible

	for (int pl = 0; pl < 5; pl++) {

		bool anyInFront = false;

		for (int corner = 0; corner < 8; corner++) {
			if (planes[pl].DistanceToPoint(box.points[corner]) >= 0.f) {
				anyInFront = true;
				break;
			}
		}

		if (!anyInFront) {
			return Clip::OUTSIDE;
		}
	}

	// corners are outside, but the box still straddles the view frustum
	return Clip::INTERCEPT;
}

// Widens the near/far pair of the current priority level (m_nfPairs[m_currentPriority])
// using the z value of every box corner with negative z. zNear takes the maximum and
// zFar the minimum of the values seen.
void CNew3D::CalcBoxExtents(const BBox& box)
{
	auto& range = m_nfPairs[m_currentPriority];

	for (int corner = 0; corner < 8; corner++) {

		const float z = box.points[corner][2];

		if (!(z < 0.f)) {
			continue;		// corners with z >= 0 do not contribute
		}

		range.zNear	= std::max(z, range.zNear);
		range.zFar	= std::min(z, range.zFar);
	}
}

// Clips clipPoly, in place, against planes[0..3]; planes[4] is not used here.
//
// For each plane the vertex list is walked edge by edge: vertices on the non-negative side
// are kept and a new vertex is inserted wherever an edge crosses the plane. The polygon is
// ping-ponged between clipPoly and a local scratch polygon; because an even number of planes
// is processed, the final result ends up back in clipPoly.
void CNew3D::ClipPolygon(ClipPoly& clipPoly, Plane planes[5])
{
	ClipPoly	scratch;
	ClipPoly*	src = &clipPoly;
	ClipPoly*	dst = &scratch;

	for (int planeIdx = 0; planeIdx < 4; planeIdx++) {

		Plane& plane = planes[planeIdx];

		// state of the first vertex; carried from edge to edge below
		float	curDot		= plane.DotProduct(src->list[0].pos);
		bool	curInside	= (curDot + plane.d) >= 0.f;

		dst->count = 0;

		for (int j = 0; j < src->count; j++) {

			if (curInside) {
				dst->list[dst->count++] = src->list[j];
			}

			// the last vertex connects back to the first one
			const int k = (j + 1 >= src->count) ? 0 : j + 1;

			const float	nextDot		= plane.DotProduct(src->list[k].pos);
			const bool	nextInside	= (nextDot + plane.d) >= 0.f;

			// we have an intersection
			if (curInside != nextInside) {

				const float u = (curDot + plane.d) / (curDot - nextDot);

				const float*	from	= src->list[j].pos;
				const float*	to		= src->list[k].pos;
				float*			hit		= dst->list[dst->count].pos;

				// only x, y, z of the new vertex are written
				for (int axis = 0; axis < 3; axis++) {
					hit[axis] = from[axis] + ((to[axis] - from[axis]) * u);
				}

				dst->count++;
			}

			curDot		= nextDot;
			curInside	= nextInside;
		}

		// output of this plane is the input of the next one
		std::swap(src, dst);
	}
}

void CNew3D::ClipModel(const Model *m)
{
	//===============================
	ClipPoly				clipPoly;
	std::vector<FVertex>*	vertices;
	int						offset;
	//===============================

	if (m->dynamic) {
		vertices = &m_polyBufferRam;
		offset = MAX_ROM_VERTS;
	}
	else {
		vertices = &m_polyBufferRom;
		offset = 0;
	}

	for (const auto &mesh : *m->meshes) {

		int start = mesh.vboOffset - offset;

		for (int i = 0; i < mesh.vertexCount; i += m_numPolyVerts) {							// inc to next poly

			for (int j = 0; j < m_numPolyVerts; j++) {
				MultVec(m->modelMat, (*vertices)[start + i + j].pos, clipPoly.list[j].pos);		// copy all 3 of 4  our transformed vertices into our clip poly struct
			}

			clipPoly.count = m_numPolyVerts;

			ClipPolygon(clipPoly, m_planes);

			for (int j = 0; j < clipPoly.count; j++) {
				if (clipPoly.list[j].pos[2] < 0.f) {
					m_nfPairs[m_currentPriority].zNear = std::max(clipPoly.list[j].pos[2], m_nfPairs[m_currentPriority].zNear);
					m_nfPairs[m_currentPriority].zFar  = std::min(clipPoly.list[j].pos[2], m_nfPairs[m_currentPriority].zFar);
				}
			}
		}
	}
}

// Computes the GL viewport rectangle (vp->x, y, width, height) and the projection matrix
// of vp for the given near/far distances.
//
// far values above 1e30 are replaced by near * 1000000, and near is raised to at least
// far / 1000000. Viewports that start at (0,0) and are at least 495x383 are treated as full
// screen: they span the whole window width and have their horizontal frustum planes scaled
// by the window / viewable-area aspect ratio.
void CNew3D::CalcViewport(Viewport* vp, float near, float far)
{
	if (far > 1e30f) {
		far = near * 1000000.f;				// fix for ocean hunter which passes some FLT_MAX for a few matrices. HW must have some safe guard for these
	}

	// if we get really close to zero somehow, we will have almost no depth precision
	const float minNear = far / 1000000.f;
	if (near < minNear) {
		near = minNear;
	}

	// we need to calc the shape of the projection frustum for culling
	const float left	= near * vp->angle_left;
	const float right	= near * vp->angle_right;
	const float top		= near * vp->angle_top;
	const float bottom	= near * vp->angle_bottom;

	vp->projectionMatrix.LoadIdentity();	// reset matrix

	// vertical placement is the same for both kinds of viewport
	vp->y		= m_yOffs + (int)((float)(384 - (vp->vpY + vp->vpHeight))*m_yRatio);
	vp->height	= (int)((float)vp->vpHeight*m_yRatio);

	const bool coversScreen = (vp->vpX == 0) && (vp->vpWidth >= 495) && (vp->vpY == 0) && (vp->vpHeight >= 383);

	if (!coversScreen) {

		// partial viewport: scale and offset into the viewable area, no aspect correction
		vp->x		= m_xOffs + (int)((float)vp->vpX*m_xRatio);
		vp->width	= (int)((float)vp->vpWidth*m_xRatio);

		vp->projectionMatrix.Frustum(left, right, bottom, top, near, far);
		return;
	}

	/*
	 * Compute aspect ratio correction factor. "Window" refers to the full GL
	 * viewport (i.e., totalXRes x totalYRes). "Viewable area" is the effective
	 * Model 3 screen (xRes x yRes). In non-wide-screen, non-stretch mode, this
	 * is intended to replicate the 496x384 display and may in general be
	 * smaller than the window. The rest of the window appears to have a
	 * border, which is created by a scissor box.
	 *
	 * In wide-screen mode, we want to expand the frustum horizontally to fill
	 * the window. We want the aspect ratio to be correct. To accomplish this,
	 * the viewable area is set *the same* as in non-wide-screen mode (e.g.,
	 * often smaller than the window) but glScissor() is set by the OSD layer's
	 * screen setup code to reveal the entire window.
	 *
	 * In stretch mode, the window and viewable area are both set the same,
	 * which means there will be no aspect ratio correction and the display
	 * will stretch to fill the entire window while keeping the view frustum
	 * the same as a 496x384 Model 3 display. The display will be distorted.
	 */
	const float windowAR		= (float)m_totalXRes / (float)m_totalYRes;
	const float viewableAreaAR	= (float)m_xRes / (float)m_yRes;

	// Will expand horizontal frustum planes only in non-stretch mode (wide-
	// screen and non-wide-screen modes have identical resolution parameters
	// and only their scissor box differs)
	const float correction = windowAR / viewableAreaAR;

	vp->x		= 0;
	vp->width	= m_totalXRes;

	vp->projectionMatrix.Frustum(left*correction, right*correction, bottom, top, near, far);
}

// Enables or disables sun clamping by storing the flag in m_sunClamp.
// The constructor defaults m_sunClamp to true; this setter now honours its argument
// instead of unconditionally forcing the flag back to true.
void CNew3D::SetSunClamp(bool enable)
{
	m_sunClamp = enable;
}

void CNew3D::SetSignedShade(bool enable)
{
	m_shadeIsSigned = enable;
}

// Returns the line of sight value stored for the given layer in the front buffer.
// The read is done while holding m_losMutex. layer is not range checked here.
float CNew3D::GetLosValue(int layer)
{
	// we always write to the 'back' buffer, and the software reads from the front
	// then they get swapped
	std::lock_guard<std::mutex> lock(m_losMutex);

	const float losValue = m_losFront->value[layer];
	return losValue;
}

// Maps a line of sight position given in real3d 496x384 coordinates to our viewport:
// the y axis is flipped (384 - inY), then both axes are scaled by m_xRatio / m_yRatio
// and offset by m_xOffs / m_yOffs.
void CNew3D::TranslateLosPosition(int inX, int inY, int& outX, int& outY)
{
	const int flippedY = 384 - inY;

	const int scaledX = int(inX * m_xRatio);
	const int scaledY = int(flippedY * m_yRatio);

	outX = m_xOffs + scaledX;
	outY = m_yOffs + scaledY;
}

// Performs the line of sight read back for one priority level.
//
// The first node in m_nodes whose viewport has the requested priority and a non-zero line of
// sight position is used: depth and stencil are read from the framebuffer at that position,
// the depth is converted to 1/z using the near/far pair of the priority level, and the
// result is stored in m_losBack->value[priority].
// Returns true when a value was written, false when no matching viewport exists.
bool CNew3D::ProcessLos(int priority)
{
	for (const auto &n : m_nodes) {

		const auto& view = n.viewport;

		if (view.priority != priority) {
			continue;
		}

		// a position of (0,0) means no line of sight test for this viewport
		if (!(view.losPosX || view.losPosY)) {
			continue;
		}

		int losX, losY;
		TranslateLosPosition(view.losPosX, view.losPosY, losX, losY);

		float depth;
		glReadPixels(losX, losY, 1, 1, GL_DEPTH_COMPONENT, GL_FLOAT, &depth);

		// window depth [0,1] -> [-1,1]
		depth = 2.0f * depth - 1.0f;

		const float zNear	= m_nfPairs[priority].zNear;
		const float zFar	= m_nfPairs[priority].zFar;

		float zVal = 2.0f * zNear * zFar / (zFar + zNear - depth * (zFar - zNear));

		// real3d test program indicates that return values are 1/zVal
		zVal = 1.0f / zVal;

		GLubyte stencilVal;
		glReadPixels(losX, losY, 1, 1, GL_STENCIL_INDEX, GL_UNSIGNED_BYTE, &stencilVal);

		// apply our mask to stencil, because layered poly attributes use the lower bits
		stencilVal &= 0x80;

		// if the stencil val is zero that means we've hit sky or whatever, if it hits a 1 we've hit geometry
		// the real3d returns 1 in the top bit of the float if the line of sight test passes (ie doesn't hit geometry)
		// NOTE(review): the code below flags the lowest bit of the first byte of the float in memory

		auto zValP = reinterpret_cast<unsigned char*>(&zVal);	// this is legal in c++, casting to int technically isn't

		const bool hitNothing = (stencilVal == 0);

		if (hitNothing) {
			zValP[0] |= 1;		// set first bit to 1
		}
		else {
			zValP[0] &= 0xFE;	// set first bit to zero
		}

		m_losBack->value[priority] = zVal;

		return true;
	}

	return false;
}

} // New3D