/*****************************************************************************
The Dark Mod GPL Source Code

This file is part of the The Dark Mod Source Code, originally based
on the Doom 3 GPL Source Code as published in 2011.

The Dark Mod Source Code is free software: you can redistribute it
and/or modify it under the terms of the GNU General Public License as
published by the Free Software Foundation, either version 3 of the License,
or (at your option) any later version. For details, see LICENSE.TXT.

Project: The Dark Mod (http://www.thedarkmod.com/)

******************************************************************************/
#include "precompiled.h"
#include "ParallelJobList.h"

/*
================================================================================================

	Job and Job-List names

================================================================================================
*/

// stgatilov: GetSystemTimePreciseAsFileTime has too much overhead
// luckily, these time measurements are only used for computing thread-private deltas
// and they are only used for diagnostics, so they can be a bit inexact...
// Two timer backends: raw CPU clock ticks (cheap, enabled) or microseconds.
// JobTimer_Clock yields the current timestamp, JobTimer_Frequency the tick rate
// used by TimerClockToMicrosec() to convert deltas to microseconds.
#if 1
	#define JobTimer_Clock uint64( Sys_GetClockTicks() )
	#define JobTimer_Frequency uint64( Sys_ClockTicksPerSecond() )
#else
	#define JobTimer_Clock Sys_GetTimeMicroseconds()
	#define JobTimer_Frequency 1000000ull
#endif

// printable names for job lists, indexed by jobListId_t
// ASSERT_ENUM_STRING verifies at compile time that each enum has the asserted value
// NOTE(review): only two entries are present while JOBLIST_UTILITY is asserted at
// value 9 — indexing jobNames[JOBLIST_UTILITY] would read past the end of this
// array; confirm against the jobListId_t declaration
const char * jobNames[] = {
	ASSERT_ENUM_STRING( JOBLIST_RENDERER_FRONTEND,	0 ),
//	ASSERT_ENUM_STRING( JOBLIST_RENDERER_BACKEND,	1 ),
	ASSERT_ENUM_STRING( JOBLIST_UTILITY,			9 ),
};

// fixed-size global table of all job functions registered via RegisterJob(),
// used only to map a job function pointer back to a printable name
static const int MAX_REGISTERED_JOBS = 128;
struct registeredJob {
	jobRun_t		function;	// the job entry point
	const char *	name;		// printable name for diagnostics
} registeredJobs[MAX_REGISTERED_JOBS];
static int numRegisteredJobs;	// number of valid entries in registeredJobs

/*
========================
GetJobListName

Returns a printable name for the given job list id.
NOTE(review): jobNames[] currently holds fewer entries than the highest
asserted enum value ( JOBLIST_UTILITY == 9 ), so unchecked indexing could
read past the end of the table; return a fallback for out-of-range ids.
========================
*/
const char * GetJobListName( jobListId_t id ) {
	const int numNames = int( sizeof( jobNames ) / sizeof( jobNames[0] ) );
	if ( int( id ) < 0 || int( id ) >= numNames ) {
		return "unknown";
	}
	return jobNames[id];
}

/*
========================
IsRegisteredJob

Returns true if the given job function was previously added to the
global registration table through RegisterJob().
========================
*/
static bool IsRegisteredJob( jobRun_t function ) {
	bool found = false;
	for ( int i = 0; i < numRegisteredJobs && !found; i++ ) {
		found = ( registeredJobs[i].function == function );
	}
	return found;
}

/*
========================
RegisterJob

Adds a job function and its printable name to the global registration
table. Registering the same function twice is a no-op. The table has a
fixed capacity of MAX_REGISTERED_JOBS entries; further registrations
are dropped (asserting in debug builds) instead of writing out of bounds.
========================
*/
void RegisterJob( jobRun_t function, const char * name ) {
	if ( IsRegisteredJob( function ) ) {
		return;
	}
	// guard against overflowing the fixed-size registration table
	if ( numRegisteredJobs >= MAX_REGISTERED_JOBS ) {
		assert( false );
		return;
	}
	registeredJobs[numRegisteredJobs].function = function;
	registeredJobs[numRegisteredJobs].name = name;
	numRegisteredJobs++;
}

/*
========================
GetJobName

Maps a job function pointer back to the name it was registered with,
or "unknown" if it was never registered.
========================
*/
const char * GetJobName( jobRun_t function ) {
	const char * name = "unknown";
	for ( int i = 0; i < numRegisteredJobs; i++ ) {
		if ( registeredJobs[i].function == function ) {
			name = registeredJobs[i].name;
			break;
		}
	}
	return name;
}

/*
========================
idParallelJobRegistration::idParallelJobRegistration

Registers the job function on construction; presumably instantiated as a
global object so registration happens during static initialization —
confirm against the declaring header.
========================
*/
idParallelJobRegistration::idParallelJobRegistration( jobRun_t function, const char * name ) {
	RegisterJob( function, name );
}

// SPU local-store globals — PS3-era leftovers; nothing in this file reads or
// writes them (kept for link compatibility, presumably — confirm before removing)
int globalSpuLocalStoreActive;
void * globalSpuLocalStoreStart;
void * globalSpuLocalStoreEnd;
idSysMutex globalSpuLocalStoreMutex;

/*
================================================================================================

	PS3

================================================================================================
*/


/*
================================================================================================

idParallelJobList_Threads

================================================================================================
*/

// threshold used in RunJobsInternal to report individual jobs that run too long
static idCVar jobs_longJobMicroSec( "jobs_longJobMicroSec", "100000", CVAR_INTEGER, "print a warning for jobs that take more than this number of microseconds" );


// upper bound on worker threads; sizes the per-thread stat arrays in threadStats_t
const static int		MAX_THREADS	= 32;

// per-worker-thread progress bookkeeping for one job list being executed
struct threadJobListState_t {
								threadJobListState_t() {}
								threadJobListState_t( int _version ) : version( _version ) {}
	idParallelJobList_Threads *	jobList = NULL;			// the list this state tracks
	int							version = 0xFFFFFFFF;	// list version grabbed at submit; stale versions are ignored
	int							signalIndex = 0;		// current index into the list's signalJobCount
	int							lastJobIndex = 0;		// last job index scanned for signal/sync markers
	int							nextJobIndex = -1;		// index of the next job this thread will execute
};

// diagnostic timing/counting data for one submitted job list
struct threadStats_t {
	unsigned int	numExecutedJobs;				// jobs in the list (excluding dummy signal/sync jobs)
	unsigned int	numExecutedSyncs;				// synchronization points in the list
	uint64			submitTime;						// clock at Submit()
	uint64			startTime;						// clock when the first thread started running jobs
	uint64			endTime;						// clock when the last job completed
	uint64			waitTime;						// clock ticks spent spinning in Wait()
	uint64			threadExecTime[MAX_THREADS];	// per-thread ticks spent inside job functions
	uint64			threadTotalTime[MAX_THREADS];	// per-thread ticks spent inside RunJobs (exec + overhead)
};

// Internal implementation of a job list: owns the job array, the signal
// counters and all state shared between the submitting thread and the
// worker threads.
class idParallelJobList_Threads {
public:
							idParallelJobList_Threads( jobListId_t id, jobListPriority_t priority, unsigned int maxJobs, unsigned int maxSyncs );
							~idParallelJobList_Threads();

	//------------------------
	// These are called from the one thread that manages this list.
	//------------------------
	ID_INLINE void			AddJob( jobRun_t function, void * data );
	ID_INLINE void			InsertSyncPoint( jobSyncType_t syncType );
	void					Submit( idParallelJobList_Threads * waitForJobList_, int parallelism );
	void					Wait();
	bool					TryWait();
	bool					IsSubmitted() const;

	// diagnostic accessors; valid for the last completed run (stats are
	// copied from deferredThreadStats into threadStats in Wait())
	unsigned int			GetNumExecutedJobs() const { return threadStats.numExecutedJobs; }
	unsigned int			GetNumSyncs() const { return threadStats.numExecutedSyncs; }
	uint64					GetSubmitTimeMicroSec() const { return TimerClockToMicrosec( threadStats.submitTime ); }
	uint64					GetStartTimeMicroSec() const { return TimerClockToMicrosec( threadStats.startTime ); }
	uint64					GetFinishTimeMicroSec() const { return TimerClockToMicrosec( threadStats.endTime ); }
	uint64					GetWaitTimeMicroSec() const { return TimerClockToMicrosec( threadStats.waitTime ); }
	uint64					GetTotalProcessingTimeMicroSec() const;
	uint64					GetTotalWastedTimeMicroSec() const;
	uint64					GetUnitProcessingTimeMicroSec( int unit ) const;
	uint64					GetUnitWastedTimeMicroSec( int unit ) const;
	uint64					TimerClockToMicrosec( uint64 durationClocks ) const;

	jobListId_t				GetId() const { return listId; }
	jobListPriority_t		GetPriority() const { return listPriority; }
	int						GetVersion() { return version.GetValue(); }

	bool					WaitForOtherJobList();

	//------------------------
	// This is thread safe and called from the job threads.
	//------------------------
	enum runResult_t {
		RUN_OK			= 0,
		RUN_PROGRESS	= BIT( 0 ),	// at least one job was executed
		RUN_DONE		= BIT( 1 ),	// the whole list is finished (or the version was stale)
		RUN_STALLED		= BIT( 2 )	// blocked on a sync point or on another thread's fetch
	};

	int						RunJobs( unsigned int threadNum, threadJobListState_t & state, bool singleJob );

private:
	static const int		NUM_DONE_GUARDS = 4;	// cycle through 4 guards so we can cyclicly chain job lists

	bool					threaded;			// if false, Submit() runs all jobs on the calling thread
	bool					done;				// true while the list is idle (not submitted)
	bool					hasSignal;			// a SYNC_SIGNAL was inserted without a matching SYNC_SYNCHRONIZE yet
	jobListId_t				listId;
	jobListPriority_t		listPriority;
	unsigned int			maxJobs;			// capacity for real jobs (dummy jobs come on top)
	unsigned int			maxSyncs;			// capacity for sync points
	unsigned int			numSyncs;			// sync points inserted since the last Wait()
	int						lastSignalJob;		// job index at the most recent signal
	idSysInterlockedInteger * waitForGuard;		// guard of another list this one must wait on, or NULL
	idSysInterlockedInteger doneGuards[NUM_DONE_GUARDS];
	int						currentDoneGuard;
	idSysInterlockedInteger	version;			// bumped in Wait() so threads drop stale state
	struct job_t {
		jobRun_t	function;	// job entry point; Nop for dummy signal/sync/done jobs
		void *		data;		// job payload, or the address of a JOB_* tag for dummy jobs
		int			executed;	// set to 1 once the job has run
	};
	idList< job_t >		jobList;
	idList< idSysInterlockedInteger >	signalJobCount;		// remaining-job counters, one per signal plus one for the whole list
	idSysInterlockedInteger				currentJob;			// index of the next job to hand out
	idSysInterlockedInteger				fetchLock;			// cooperative lock around job fetching in RunJobsInternal
	idSysInterlockedInteger				numThreadsExecuting;	// threads currently inside RunJobs for this list

	threadStats_t						deferredThreadStats;	// stats being accumulated for the current run
	threadStats_t						threadStats;			// stats of the last completed run (published in Wait())
	uint64								timerFrequency;			// JobTimer ticks per second

	int						RunJobsInternal( unsigned int threadNum, threadJobListState_t & state, bool singleJob );

	static void				Nop( void * data ) {}

	// dummy jobs point their data at the address of one of these tags so the
	// run loop can identify them by pointer comparison
	static int				JOB_SIGNAL;
	static int				JOB_SYNCHRONIZE;
	static int				JOB_LIST_DONE;
};

// tag objects: only their addresses matter, values are never read
int idParallelJobList_Threads::JOB_SIGNAL;
int idParallelJobList_Threads::JOB_SYNCHRONIZE;
int idParallelJobList_Threads::JOB_LIST_DONE;

/*
========================
idParallelJobList_Threads::idParallelJobList_Threads

Creates an idle job list with storage preallocated for maxJobs jobs plus
the dummy jobs used for sync points and for list completion.
========================
*/
idParallelJobList_Threads::idParallelJobList_Threads( jobListId_t id, jobListPriority_t priority, unsigned int maxJobs, unsigned int maxSyncs ) :
	threaded( true ),
	done( true ),
	hasSignal( false ),
	listId( id ),
	listPriority( priority ),
	maxJobs( maxJobs ),
	maxSyncs( maxSyncs ),
	numSyncs( 0 ),
	lastSignalJob( 0 ),
	waitForGuard( NULL ),
	currentDoneGuard( 0 ),
	jobList() {

	assert( listPriority != JOBLIST_PRIORITY_NONE );

	// syncs go in as dummy jobs and one more to update the doneCount
	jobList.AssureSize( maxJobs + maxSyncs * 2 + 1 );
	jobList.SetNum( 0 );
	// need one extra for submit
	signalJobCount.AssureSize( maxSyncs + 1 );
	signalJobCount.SetNum( 0 );

	memset( &deferredThreadStats, 0, sizeof( threadStats_t ) );
	memset( &threadStats, 0, sizeof( threadStats_t ) );
	timerFrequency = JobTimer_Frequency;
}

/*
========================
idParallelJobList_Threads::~idParallelJobList_Threads

Blocks until any submitted run has fully completed so worker threads can
never touch a destroyed list.
========================
*/
idParallelJobList_Threads::~idParallelJobList_Threads() {
	Wait();
}

// when enabled, AddJob warns if the same (function, data) pair is queued twice
static idCVarBool jobs_debugCheck( "jobs_debugCheck", "0", CVAR_TOOL, "check job data integrity" );

/*
========================
idParallelJobList_Threads::AddJob

Appends a job for execution on the next Submit(). May only be called
while the list is idle ( done == true ). When the list is already at
maxJobs capacity the job is silently dropped and a warning is printed
once per session.
========================
*/
ID_INLINE void idParallelJobList_Threads::AddJob( jobRun_t function, void * data ) {
	assert( done );
	if ( int(maxJobs) == jobList.Num() ) {
		// warn only on the first overflow to avoid spamming the console
		static int runOnce = []() {
			common->Warning( "idParallelJobList_Threads overflow\n" );
			return 0;
		} ( );
		return;
	}
	// make sure there isn't already a job with the same function and data in the list
	if ( jobs_debugCheck ) {
		for ( int i = 0; i < jobList.Num(); i++ ) {
			if ( jobList[i].function == function && jobList[i].data == data ) {
				common->Warning( "jobs_debugCheck failed\n" );
			}
		}
	}
//	if ( 1 ) { // JDC: this never worked in tech5!  !jobList.IsFull() ) {
#if 1
		job_t & job = jobList.Alloc();
		job.function = function;
		job.data = data;
		job.executed = 0;
//	} else {
#else
		// debug output to show us what is overflowing
		int currentJobCount[MAX_REGISTERED_JOBS] = {};
		
		for ( int i = 0; i < jobList.Num(); ++i ) {
			const char * jobName = GetJobName( jobList[ i ].function );
			for ( int j = 0; j < numRegisteredJobs; ++j ) {
				if ( jobName == registeredJobs[ j ].name ) {
					currentJobCount[ j ]++;
					break;
				}
			}
		}

		// print the quantity of each job type
		for ( int i = 0; i < numRegisteredJobs; ++i ) {
			if ( currentJobCount[ i ] > 0 ) {
				idLib::Printf( "Job: %s, # %d", registeredJobs[ i ].name, currentJobCount[ i ] );
			}
		}
		idLib::Error( "Can't add job '%s', too many jobs %d", GetJobName( function ), jobList.Num() );
//	}
#endif
}

/*
========================
idParallelJobList_Threads::InsertSyncPoint

Inserts a dummy job acting either as a signal ( SYNC_SIGNAL ) or as a
barrier that waits on the previous signal ( SYNC_SYNCHRONIZE ).
May only be called while the list is idle.
========================
*/
ID_INLINE void idParallelJobList_Threads::InsertSyncPoint( jobSyncType_t syncType ) {
	assert( done );
	if ( syncType == SYNC_SIGNAL ) {
		assert( !hasSignal );
		if ( jobList.Num() == 0 ) {
			// nothing queued yet, so there is nothing to signal on
			return;
		}
		// record how many jobs were added since the previous signal
		signalJobCount.Alloc();
		signalJobCount[signalJobCount.Num() - 1].SetValue( jobList.Num() - lastSignalJob );
		lastSignalJob = jobList.Num();
		// append the dummy signal job
		job_t & signalJob = jobList.Alloc();
		signalJob.function = Nop;
		signalJob.data = & JOB_SIGNAL;
		hasSignal = true;
	} else if ( syncType == SYNC_SYNCHRONIZE ) {
		if ( hasSignal ) {
			// append the dummy synchronize job matching the open signal
			job_t & syncJob = jobList.Alloc();
			syncJob.function = Nop;
			syncJob.data = & JOB_SYNCHRONIZE;
			hasSignal = false;
			numSyncs++;
		}
	}
}

/*
========================
idParallelJobList_Threads::Submit

Marks the list as in-flight, appends the terminating dummy "list done"
job, and either hands the list to the job manager (threaded) or runs all
jobs synchronously on the calling thread. If waitForJobList is given,
worker threads will stall on that list's done-guard first.
========================
*/
void idParallelJobList_Threads::Submit( idParallelJobList_Threads * waitForJobList, int parallelism ) {
	assert( done );
	assert( numSyncs <= maxSyncs );
	assert( (unsigned int) jobList.Num() <= maxJobs + numSyncs * 2 );
	assert( fetchLock.GetValue() == 0 );

	done = false;
	currentJob.SetValue( 0 );

	// start a fresh stats record for this run; it is published to
	// threadStats when Wait() completes
	memset( &deferredThreadStats, 0, sizeof( deferredThreadStats ) );
	deferredThreadStats.numExecutedJobs = jobList.Num() - numSyncs * 2;
	deferredThreadStats.numExecutedSyncs = numSyncs;
	deferredThreadStats.submitTime = JobTimer_Clock;
	deferredThreadStats.startTime = 0;
	deferredThreadStats.endTime = 0;
	deferredThreadStats.waitTime = 0;

	if ( jobList.Num() == 0 ) {
		return;
	}

	if ( waitForJobList != NULL ) {
		// chain on the other list's currently active done-guard
		waitForGuard = & waitForJobList->doneGuards[waitForJobList->currentDoneGuard];
	} else {
		waitForGuard = NULL;
	}

	// advance to the next guard so lists can be cyclicly chained
	currentDoneGuard = ( currentDoneGuard + 1 ) & ( NUM_DONE_GUARDS - 1 );
	doneGuards[currentDoneGuard].SetValue( 1 );

	// counter for the jobs after the last signal; hit zero => list finished
	signalJobCount.Alloc();
	signalJobCount[signalJobCount.Num() - 1].SetValue( jobList.Num() - lastSignalJob );

	// terminating dummy job; recognized by its data pointer
	job_t & job = jobList.Alloc();
	job.function = Nop;
	job.data = & JOB_LIST_DONE;

	if ( threaded ) {
		// hand over to the manager (defined later in this file)
		void SubmitJobList( idParallelJobList_Threads * jobList, int parallelism );
		SubmitJobList( this, parallelism );
	} else {
		// run all the jobs right here
		threadJobListState_t state( GetVersion() );
		RunJobs( 0, state, false );
	}
}

/*
========================
idParallelJobList_Threads::Wait

Spins until all jobs of the submitted list have completed and all worker
threads have left RunJobs, then resets the list for reuse and publishes
the accumulated stats. Safe to call on an idle list (no-op apart from
the stats copy).
========================
*/
void idParallelJobList_Threads::Wait() {
	if ( jobList.Num() > 0 ) {
		// don't lock up but return if the job list was never properly submitted
		if ( !verify( !done && signalJobCount.Num() > 0 ) ) {
			return;
		}

		bool waited = false;
		uint64 waitStart = JobTimer_Clock;

		// the last signalJobCount entry counts all jobs after the last signal;
		// it reaching zero means every job has executed
		while ( signalJobCount[signalJobCount.Num() - 1].GetValue() > 0 ) {
			Sys_Yield();
			waited = true;
		}
		// bump the version so worker threads holding stale state bail out
		version.Increment();
		while ( numThreadsExecuting.GetValue() > 0 ) {
			Sys_Yield();
			waited = true;
		}

		// reset for the next batch of AddJob/InsertSyncPoint calls
		jobList.Clear();
		signalJobCount.Clear();
		numSyncs = 0;
		lastSignalJob = 0;

		uint64 waitEnd = JobTimer_Clock;
		deferredThreadStats.waitTime = waited ? ( waitEnd - waitStart ) : 0;
	}
	// publish the stats of the completed run for the diagnostic accessors
	memcpy( & threadStats, & deferredThreadStats, sizeof( threadStats ) );
	done = true;
}

/*
========================
idParallelJobList_Threads::TryWait

Non-blocking variant of Wait(): performs the wait only when all jobs
have already completed (or none were queued). Returns true if the list
is now idle.
========================
*/
bool idParallelJobList_Threads::TryWait() {
	bool finished = ( jobList.Num() == 0 );
	if ( !finished ) {
		// all jobs done once the final signal counter has reached zero
		finished = ( signalJobCount[signalJobCount.Num() - 1].GetValue() <= 0 );
	}
	if ( finished ) {
		Wait();
	}
	return finished;
}

/*
========================
idParallelJobList_Threads::IsSubmitted

True from the moment Submit() is called until Wait() completes.
========================
*/
bool idParallelJobList_Threads::IsSubmitted() const {
	return ( done == false );
}

/*
========================
idParallelJobList_Threads::GetTotalProcessingTimeMicroSec

Sum over all worker threads of the time spent inside job functions
during the last completed run.
========================
*/
uint64 idParallelJobList_Threads::GetTotalProcessingTimeMicroSec() const {
	uint64 totalClocks = 0;
	for ( int threadNum = 0; threadNum < MAX_THREADS; threadNum++ ) {
		totalClocks += threadStats.threadExecTime[threadNum];
	}
	return TimerClockToMicrosec( totalClocks );
}

/*
========================
idParallelJobList_Threads::GetTotalWastedTimeMicroSec

Sum over all worker threads of the RunJobs overhead (total time minus
time spent inside job functions) during the last completed run.
========================
*/
uint64 idParallelJobList_Threads::GetTotalWastedTimeMicroSec() const {
	uint64 wastedClocks = 0;
	for ( int threadNum = 0; threadNum < MAX_THREADS; threadNum++ ) {
		wastedClocks += threadStats.threadTotalTime[threadNum] - threadStats.threadExecTime[threadNum];
	}
	return TimerClockToMicrosec( wastedClocks );
}

/*
========================
idParallelJobList_Threads::GetUnitProcessingTimeMicroSec

Time one worker thread spent inside job functions during the last
completed run; 0 for an out-of-range thread index.
========================
*/
uint64 idParallelJobList_Threads::GetUnitProcessingTimeMicroSec( int unit ) const {
	const bool validUnit = ( unit >= 0 && unit < MAX_THREADS );
	if ( !validUnit ) {
		return 0;
	}
	return TimerClockToMicrosec( threadStats.threadExecTime[unit] );
}

/*
========================
idParallelJobList_Threads::GetUnitWastedTimeMicroSec

RunJobs overhead of one worker thread during the last completed run;
0 for an out-of-range thread index.
========================
*/
uint64 idParallelJobList_Threads::GetUnitWastedTimeMicroSec( int unit ) const {
	const bool validUnit = ( unit >= 0 && unit < MAX_THREADS );
	if ( !validUnit ) {
		return 0;
	}
	return TimerClockToMicrosec( threadStats.threadTotalTime[unit] - threadStats.threadExecTime[unit] );
}

/*
========================
idParallelJobList_Threads::TimerClockToMicrosec

Converts a JobTimer clock-tick duration into microseconds.
========================
*/
uint64 idParallelJobList_Threads::TimerClockToMicrosec( uint64 durationClocks ) const {
	const uint64 microsecPerSec = 1000000u;
	// on 5 GHz CPU it overflows in > 1 hour
	// more than enough for one joblist to finish
	assert( durationClocks < UINT64_MAX / microsecPerSec );
	return ( durationClocks * microsecPerSec ) / timerFrequency;
}

#ifndef _DEBUG
// last job that exceeded jobs_longJobMicroSec; volatile so the values survive
// optimization and can be inspected in a debugger / crash dump
volatile float longJobTime;
volatile jobRun_t longJobFunc;
volatile void * longJobData;
#endif

/*
========================
idParallelJobList_Threads::RunJobsInternal

Core worker loop. Fetches jobs with a cooperative fetchLock, honors the
dummy signal / synchronize / list-done jobs, executes real jobs and
accumulates timing stats. Returns a combination of runResult_t flags:
RUN_PROGRESS if at least one job ran, RUN_STALLED when blocked on a sync
point or on another thread's fetch, RUN_DONE when the list is finished
or the state's version is stale.
========================
*/
int idParallelJobList_Threads::RunJobsInternal( unsigned int threadNum, threadJobListState_t & state, bool singleJob ) {
	if ( state.version != version.GetValue() ) {
		// trying to run an old version of this list that is already done
		return RUN_DONE;
	}

	assert( threadNum < MAX_THREADS );

	if ( deferredThreadStats.startTime == 0 ) {
		deferredThreadStats.startTime = JobTimer_Clock;	// first time any thread is running jobs from this list
	}

	int result = RUN_OK;

	do {

		// run through all signals and syncs before the last job that has been or is being executed
		// this loop is really an optimization to minimize the time spent in the fetchLock section below
		for ( ; state.lastJobIndex < (int) currentJob.GetValue() && state.lastJobIndex < jobList.Num(); state.lastJobIndex++ ) {
			if ( jobList[state.lastJobIndex].data == & JOB_SIGNAL ) {
				state.signalIndex++;
				assert( state.signalIndex < signalJobCount.Num() );
			} else if ( jobList[state.lastJobIndex].data == & JOB_SYNCHRONIZE ) {
				assert( state.signalIndex > 0 );
				if ( signalJobCount[state.signalIndex - 1].GetValue() > 0 ) {
					// stalled on a synchronization point
					return ( result | RUN_STALLED );
				}
			} else if ( jobList[state.lastJobIndex].data == & JOB_LIST_DONE ) {
				if ( signalJobCount[signalJobCount.Num() - 1].GetValue() > 0 ) {
					// stalled on a synchronization point
					return ( result | RUN_STALLED );
				}
			}
		}

		// try to lock to fetch a new job
		// only the thread that incremented fetchLock from 0 to 1 owns the lock;
		// everyone else backs off and reports a stall
		if ( fetchLock.Increment() == 1 ) {

			// grab a new job
			state.nextJobIndex = currentJob.Increment() - 1;

			// run through any remaining signals and syncs (this should rarely iterate more than once)
			for ( ; state.lastJobIndex <= state.nextJobIndex && state.lastJobIndex < jobList.Num(); state.lastJobIndex++ ) {
				if ( jobList[state.lastJobIndex].data == & JOB_SIGNAL ) {
					state.signalIndex++;
					assert( state.signalIndex < signalJobCount.Num() );
				} else if ( jobList[state.lastJobIndex].data == & JOB_SYNCHRONIZE ) {
					assert( state.signalIndex > 0 );
					if ( signalJobCount[state.signalIndex - 1].GetValue() > 0 ) {
						// return this job to the list
						currentJob.Decrement();
						// release the fetch lock
						fetchLock.Decrement();
						// stalled on a synchronization point
						return ( result | RUN_STALLED );
					}
				} else if ( jobList[state.lastJobIndex].data == & JOB_LIST_DONE ) {
					if ( signalJobCount[signalJobCount.Num() - 1].GetValue() > 0 ) {
						// return this job to the list
						currentJob.Decrement();
						// release the fetch lock
						fetchLock.Decrement();
						// stalled on a synchronization point
						return ( result | RUN_STALLED );
					}
					// decrement the done count
					doneGuards[currentDoneGuard].Decrement();
				}
			}
			// release the fetch lock
			fetchLock.Decrement();
		} else {
			// release the fetch lock
			fetchLock.Decrement();
			// another thread is fetching right now so consider stalled
			return ( result | RUN_STALLED );
		}

		// if at the end of the job list we're done
		if ( state.nextJobIndex >= jobList.Num() ) {
			return ( result | RUN_DONE );
		}

		// execute the next job
		{
			uint64 jobStart = JobTimer_Clock;

			jobList[state.nextJobIndex].function( jobList[state.nextJobIndex].data );
			jobList[state.nextJobIndex].executed = 1;

			uint64 jobEnd = JobTimer_Clock;
			deferredThreadStats.threadExecTime[threadNum] += jobEnd - jobStart;

#ifndef _DEBUG
			// report individual jobs that exceed the jobs_longJobMicroSec budget
			if ( jobs_longJobMicroSec.GetInteger() > 0 && GetId() != JOBLIST_UTILITY ) {
				uint64 durationUs = TimerClockToMicrosec( jobEnd - jobStart );
				if ( durationUs > jobs_longJobMicroSec.GetInteger() ) {
					longJobTime = durationUs * ( 1.0f / 1000.0f );
					longJobFunc = jobList[state.nextJobIndex].function;
					longJobData = jobList[state.nextJobIndex].data;
					const char * jobName = GetJobName( jobList[state.nextJobIndex].function );
					const char * jobListName = GetJobListName( GetId() );
					idLib::Printf( "%1.1f milliseconds for a single '%s' job from job list %s on thread %d\n", longJobTime, jobName, jobListName, threadNum );
				}
			}
#endif
		}

		result |= RUN_PROGRESS;

		// decrease the job count for the current signal
		if ( signalJobCount[state.signalIndex].Decrement() == 0 ) {
			// if this was the very last job of the job list
			if ( state.signalIndex == signalJobCount.Num() - 1 ) {
				deferredThreadStats.endTime = JobTimer_Clock;
				return ( result | RUN_DONE );
			}
		}

	} while( ! singleJob );

	return result;
}

/*
========================
idParallelJobList_Threads::RunJobs
========================
*/
int idParallelJobList_Threads::RunJobs( unsigned int threadNum, threadJobListState_t & state, bool singleJob ) {
	uint64 start = JobTimer_Clock;

	numThreadsExecuting.Increment();

	int result = RunJobsInternal( threadNum, state, singleJob );

	numThreadsExecuting.Decrement();

	deferredThreadStats.threadTotalTime[threadNum] += JobTimer_Clock - start;

	return result;
}

/*
========================
idParallelJobList_Threads::WaitForOtherJobList

Returns true while the chained job list (set via Submit) has not
finished yet, i.e. this list should not be worked on.
========================
*/
bool idParallelJobList_Threads::WaitForOtherJobList() {
	return ( waitForGuard != NULL && waitForGuard->GetValue() > 0 );
}

/*
================================================================================================

idParallelJobList

================================================================================================
*/

/*
========================
idParallelJobList::idParallelJobList

Public facade: owns an idParallelJobList_Threads that holds the actual
implementation, plus a display color for diagnostics.
========================
*/
idParallelJobList::idParallelJobList( jobListId_t id, jobListPriority_t priority, unsigned int maxJobs, unsigned int maxSyncs, const idColor * color ) {
	assert( priority > JOBLIST_PRIORITY_NONE );
	jobListThreads = new idParallelJobList_Threads( id, priority, maxJobs, maxSyncs );
	this->color = color;
}

/*
========================
idParallelJobList::~idParallelJobList

Deletes the implementation object; its destructor waits for any
submitted run to finish first.
========================
*/
idParallelJobList::~idParallelJobList() {
	delete jobListThreads;
}

/*
========================
idParallelJobList::AddJob

Forwards to the implementation; asserts that the function was
registered via RegisterJob so it has a printable name.
========================
*/
void idParallelJobList::AddJob( jobRun_t function, void * data ) {
	assert( IsRegisteredJob( function ) );
	jobListThreads->AddJob( function, data );
}

/*
========================
idParallelJobList::AddJobSPURS

PS3 SPURS stub; always returns NULL on this platform.
========================
*/
CellSpursJob128 * idParallelJobList::AddJobSPURS() {
	return NULL;
}

/*
========================
idParallelJobList::InsertSyncPoint

Forwards to the implementation.
========================
*/
void idParallelJobList::InsertSyncPoint( jobSyncType_t syncType ) {
	jobListThreads->InsertSyncPoint( syncType );
}

/*
========================
idParallelJobList::Wait

Blocks until the submitted list has fully completed; the time spent
here shows up as idle in the CPU trace.
========================
*/
void idParallelJobList::Wait() {
	TRACE_CPU_SCOPE_COLOR( "ParallelJobList::Wait", TRACE_COLOR_IDLE )
	if ( jobListThreads != NULL ) {
		jobListThreads->Wait();
	}
}

/*
========================
idParallelJobList::TryWait

Non-blocking wait; returns true if the list is now idle (or there is
no implementation object).
========================
*/
bool idParallelJobList::TryWait() {
	if ( jobListThreads == NULL ) {
		return true;
	}
	return jobListThreads->TryWait();
}

/*
========================
idParallelJobList::Submit

Submits the queued jobs for execution; when waitForJobList is given,
this list will not start until that list has finished. A list cannot
wait on itself.
========================
*/
void idParallelJobList::Submit( idParallelJobList * waitForJobList, int parallelism ) {
	assert( waitForJobList != this );
	idParallelJobList_Threads * waitFor = NULL;
	if ( waitForJobList != NULL ) {
		waitFor = waitForJobList->jobListThreads;
	}
	jobListThreads->Submit( waitFor, parallelism );
}

// The accessors below are thin forwards to the idParallelJobList_Threads
// implementation object; the stats are valid for the last completed run.

/*
========================
idParallelJobList::IsSubmitted
========================
*/
bool idParallelJobList::IsSubmitted() const {
	return jobListThreads->IsSubmitted();
}

/*
========================
idParallelJobList::GetNumExecutedJobs
========================
*/
unsigned int idParallelJobList::GetNumExecutedJobs() const {
	return jobListThreads->GetNumExecutedJobs();
}

/*
========================
idParallelJobList::GetNumSyncs
========================
*/
unsigned int idParallelJobList::GetNumSyncs() const {
	return jobListThreads->GetNumSyncs();
}

/*
========================
idParallelJobList::GetSubmitTimeMicroSec
========================
*/
uint64 idParallelJobList::GetSubmitTimeMicroSec() const {
	return jobListThreads->GetSubmitTimeMicroSec();
}

/*
========================
idParallelJobList::GetStartTimeMicroSec
========================
*/
uint64 idParallelJobList::GetStartTimeMicroSec() const {
	return jobListThreads->GetStartTimeMicroSec();
}

/*
========================
idParallelJobList::GetFinishTimeMicroSec
========================
*/
uint64 idParallelJobList::GetFinishTimeMicroSec() const {
	return jobListThreads->GetFinishTimeMicroSec();
}

/*
========================
idParallelJobList::GetWaitTimeMicroSec
========================
*/
uint64 idParallelJobList::GetWaitTimeMicroSec() const {
	return jobListThreads->GetWaitTimeMicroSec();
}

/*
========================
idParallelJobList::GetTotalProcessingTimeMicroSec
========================
*/
uint64 idParallelJobList::GetTotalProcessingTimeMicroSec() const {
	return jobListThreads->GetTotalProcessingTimeMicroSec();
}

/*
========================
idParallelJobList::GetTotalWastedTimeMicroSec
========================
*/
uint64 idParallelJobList::GetTotalWastedTimeMicroSec() const {
	return jobListThreads->GetTotalWastedTimeMicroSec();
}

/*
========================
idParallelJobList::GetUnitProcessingTimeMicroSec
========================
*/
uint64 idParallelJobList::GetUnitProcessingTimeMicroSec( int unit ) const {
	return jobListThreads->GetUnitProcessingTimeMicroSec( unit );
}

/*
========================
idParallelJobList::GetUnitWastedTimeMicroSec
========================
*/
uint64 idParallelJobList::GetUnitWastedTimeMicroSec( int unit ) const {
	return jobListThreads->GetUnitWastedTimeMicroSec( unit );
}

/*
========================
idParallelJobList::GetId
========================
*/
jobListId_t idParallelJobList::GetId() const {
	return jobListThreads->GetId();
}

/*
================================================================================================

idJobThread

================================================================================================
*/

const int JOB_THREAD_STACK_SIZE		= 256 * 1024;	// same size as the SPU local store

// a (job list, version) pair queued for a worker thread; the version lets the
// thread detect that a list was already waited on and reused
struct threadJobList_t {
	idParallelJobList_Threads *	jobList;
	int							version;
};

// when set, worker threads re-evaluate list priorities after every single job
// (except for HIGH priority lists, which are run through in one go)
static idCVar jobs_prioritize( "jobs_prioritize", "1", CVAR_BOOL | CVAR_NOCHEAT, "prioritize job lists" );

// a single worker thread that pulls job lists from a cyclic queue and runs
// their jobs according to list priority
class idJobThread : public idSysThread {
public:
								idJobThread();
								~idJobThread();

	void						Start( core_t core, unsigned int threadNum );

	void						AddJobList( idParallelJobList_Threads * jobList );

private:
	threadJobListState_t		jobLists[MAX_JOBLISTS];	// cyclic buffer with job lists
	unsigned int				firstJobList;			// index of the last job list the thread grabbed
	unsigned int				lastJobList;			// index where the next job list to work on will be added
	idSysMutex					addJobMutex;			// serializes concurrent AddJobList calls

	unsigned int				threadNum;				// index into the per-thread stat arrays

	virtual int					Run() override;
};

/*
========================
idJobThread::idJobThread

Starts with an empty job list queue; the actual OS thread is created
later in Start().
========================
*/
idJobThread::idJobThread() :
		firstJobList( 0 ),
		lastJobList( 0 ),
		threadNum( 0 ) {
}

/*
========================
idJobThread::~idJobThread
========================
*/
idJobThread::~idJobThread() {
}

/*
========================
idJobThread::Start

Creates the worker thread "JobListProcessor_<threadNum>" pinned to the
given core (CORE_ANY allows migration) with normal priority.
========================
*/
void idJobThread::Start( core_t core, unsigned int threadNum ) {
	this->threadNum = threadNum;
	StartWorkerThread( va( "JobListProcessor_%d", threadNum ), core, THREAD_NORMAL, JOB_THREAD_STACK_SIZE );
}

/*
========================
idJobThread::AddJobList

Queues a job list (with its current version) for this worker thread.
NOTE: deliberately spins while holding addJobMutex when the cyclic
buffer is full — this keeps competing producers ordered, but blocks
them for as long as the queue stays full.
========================
*/
void idJobThread::AddJobList( idParallelJobList_Threads * jobList ) {
	// must lock because multiple threads may try to add new job lists at the same time
	addJobMutex.Lock();
	// wait until there is space available because in rare cases multiple versions of the same job lists may still be queued 
	while( lastJobList - firstJobList >= MAX_JOBLISTS ) {
		Sys_Yield();
	}
	assert( lastJobList - firstJobList < MAX_JOBLISTS );
	// indices grow monotonically; masking maps them into the cyclic buffer
	jobLists[lastJobList & ( MAX_JOBLISTS - 1 )].jobList = jobList;
	jobLists[lastJobList & ( MAX_JOBLISTS - 1 )].version = jobList->GetVersion();
	lastJobList++;
	addJobMutex.Unlock();
}

/*
========================
idJobThread::Run

Worker thread main loop: drains the queued job lists, always working on
the highest-priority list that is not blocked on another list. When a
list stalls, tries to hide the stall with work from an equal-or-higher
priority list. Returns (so the idSysThread worker presumably goes back
to sleep until signalled again — confirm against idSysThread) once all
local lists are done.
========================
*/
int idJobThread::Run() {
	threadJobListState_t threadJobListState[MAX_JOBLISTS];
	int numJobLists = 0;
	int lastStalledJobList = -1;

	while ( !IsTerminating() ) {

		// fetch any new job lists and add them to the local list
		// (one per iteration; firstJobList/lastJobList index the cyclic queue)
		if ( numJobLists < MAX_JOBLISTS && firstJobList < lastJobList ) {
			threadJobListState[numJobLists].jobList = jobLists[firstJobList & ( MAX_JOBLISTS - 1 )].jobList;
			threadJobListState[numJobLists].version = jobLists[firstJobList & ( MAX_JOBLISTS - 1 )].version;
			threadJobListState[numJobLists].signalIndex = 0;
			threadJobListState[numJobLists].lastJobIndex = 0;
			threadJobListState[numJobLists].nextJobIndex = -1;
			numJobLists++;
			firstJobList++;
		}
		if ( numJobLists == 0 ) {
			break;
		}

		int currentJobList = 0;
		jobListPriority_t priority = JOBLIST_PRIORITY_NONE;
		if ( lastStalledJobList < 0 ) {
			// find the job list with the highest priority
			for ( int i = 0; i < numJobLists; i++ ) {
				if ( threadJobListState[i].jobList->GetPriority() > priority && !threadJobListState[i].jobList->WaitForOtherJobList() ) {
					priority = threadJobListState[i].jobList->GetPriority();
					currentJobList = i;
				}
			}
		} else {
			// try to hide the stall with a job from a list that has equal or higher priority
			currentJobList = lastStalledJobList;
			priority = threadJobListState[lastStalledJobList].jobList->GetPriority();
			for ( int i = 0; i < numJobLists; i++ ) {
				if ( i != lastStalledJobList && threadJobListState[i].jobList->GetPriority() >= priority && !threadJobListState[i].jobList->WaitForOtherJobList() ) {
					priority = threadJobListState[i].jobList->GetPriority();
					currentJobList = i;
				}
			}
		}

		// if the priority is high then try to run through the whole list to reduce the overhead
		// otherwise run a single job and re-evaluate priorities for the next job
		bool singleJob = ( priority == JOBLIST_PRIORITY_HIGH ) ? false : jobs_prioritize.GetBool();

		// try running one or more jobs from the current job list
		int result = threadJobListState[currentJobList].jobList->RunJobs( threadNum, threadJobListState[currentJobList], singleJob );

		if ( ( result & idParallelJobList_Threads::RUN_DONE ) != 0 ) {
			// done with this job list so remove it from the local list
			for ( int i = currentJobList; i < numJobLists - 1; i++ ) {
				threadJobListState[i] = threadJobListState[i + 1];
			}
			numJobLists--;
			lastStalledJobList = -1;
		} else if ( ( result & idParallelJobList_Threads::RUN_STALLED ) != 0 ) {
			// yield when stalled on the same job list again without making any progress
			if ( currentJobList == lastStalledJobList ) {
				if ( ( result & idParallelJobList_Threads::RUN_PROGRESS ) == 0 ) {
					Sys_Yield();
				}
			}
			lastStalledJobList = currentJobList;
		} else {
			lastStalledJobList = -1;
		}
	}
	return 0;
}

/*
================================================================================================

idParallelJobManagerLocal

================================================================================================
*/

extern void Sys_CPUCount( int & logicalNum, int & coreNum, int & packageNum );


// WINDOWS LOGICAL PROCESSOR LIMITS:
//
// http://download.microsoft.com/download/5/7/7/577a5684-8a83-43ae-9272-ff260a9c20e2/Hyper-thread_Windows.doc
//
//											Physical	Logical (Cores + HT)
// Windows XP Home Edition					1			2 
// Windows XP Professional					2			4 
// Windows Server 2003, Standard Edition	4			8 
// Windows Server 2003, Enterprise Edition	8			16 
// Windows Server 2003, Datacenter Edition	32			32
//
// Windows Vista							?			?
//
// Windows 7 Starter						1			32/64
// Windows 7 Home Basic						1			32/64
// Windows 7 Professional					2			32/64
//
//
// Hyperthreading is not dead yet.  Intel's Core i7 Processor is quad-core with HT for 8 logicals.

#define MAX_JOB_THREADS		32
#define JOB_THREAD_CORES	{	CORE_ANY, CORE_ANY, CORE_ANY, CORE_ANY,	\
								CORE_ANY, CORE_ANY, CORE_ANY, CORE_ANY,	\
								CORE_ANY, CORE_ANY, CORE_ANY, CORE_ANY,	\
								CORE_ANY, CORE_ANY, CORE_ANY, CORE_ANY,	\
								CORE_ANY, CORE_ANY, CORE_ANY, CORE_ANY,	\
								CORE_ANY, CORE_ANY, CORE_ANY, CORE_ANY,	\
								CORE_ANY, CORE_ANY, CORE_ANY, CORE_ANY,	\
								CORE_ANY, CORE_ANY, CORE_ANY, CORE_ANY }


// ---- CVars controlling how many worker threads the job system may use ----
// All of these are read by idParallelJobManagerLocal::GetNumProcessingUnits.

// Thread count for REALTIME joblists (gameplay); -1 = derive automatically
// from the CPU topology detected at startup.
idCVar jobs_numThreadsRealtime(
	"jobs_numThreadsRealtime", "-1", CVAR_INTEGER | CVAR_NOCHEAT | CVAR_ARCHIVE,
	"Number of threads used by job system during gameplay (REALTIME joblists). "
	"-1 means it is determined automatically.",
-1, MAX_JOB_THREADS );
// Signed adjustment applied on top of the automatically computed REALTIME
// thread count (ignored when jobs_numThreadsRealtime is set explicitly).
idCVar jobs_offsetThreadsRealtime(
	"jobs_offsetThreadsRealtime", "0", CVAR_INTEGER | CVAR_NOCHEAT | CVAR_ARCHIVE,
	"Added to automatically computed number of threads for jobs system during gameplay. "
	"Only takes effect if jobs_numThreadsRealtime is set to auto. "
	"For instance: set -1 if you want to leave one free thread for graphics driver.",
-MAX_JOB_THREADS, MAX_JOB_THREADS );
// Thread count for NONINTERACTIVE joblists (loading etc.); -1 = automatic.
idCVar jobs_numThreadsNonInteractive(
	"jobs_numThreadsNonInteractive", "-1", CVAR_INTEGER | CVAR_NOCHEAT | CVAR_ARCHIVE,
	"Number of threads used by job system for non-interactive tasks (NONINTERACTIVE joblists). "
	"-1 means it is determined automatically.",
-1, MAX_JOB_THREADS );
// Cap on disk-reading job threads when the game is installed on an HDD
// (applied only when HDD was detected at startup).
idCVar jobs_maxHddThreads(
	"jobs_maxHddThreads", "1", CVAR_INTEGER | CVAR_NOCHEAT | CVAR_ARCHIVE,
	"Maximum number of threads used for jobs that read from disk if game is on Hard Disk Drive. "
	"HDDs are slow-seeking devices, parallel reads to them makes things slower, not faster.",
0, MAX_JOB_THREADS );

// Concrete implementation of the job-manager interface: owns the pool of
// OS worker threads and the list of all allocated job lists.
class idParallelJobManagerLocal : public idParallelJobManager {
public:
	virtual						~idParallelJobManagerLocal() override {}

	// spawns worker threads and queries CPU/disk hardware info
	virtual void				Init() override;
	// stops all worker threads
	virtual void				Shutdown() override;

	virtual idParallelJobList *	AllocJobList( jobListId_t id, jobListPriority_t priority, unsigned int maxJobs, unsigned int maxSyncs, const idColor * color ) override;
	virtual void				FreeJobList( idParallelJobList * jobList ) override;

	virtual int					GetNumJobLists() const override;
	virtual int					GetNumFreeJobLists() const override;
	virtual idParallelJobList *	GetJobList( int index ) override;

	// how many worker threads a joblist of the given parallelism class may use
	virtual int					GetNumProcessingUnits( int parallelism = JOBLIST_PARALLELISM_REALTIME ) override;

	virtual void				WaitForAllJobLists() override;

	// hands a job list to the worker threads (called via SubmitJobList)
	void						Submit( idParallelJobList_Threads * jobList, int parallelism );

private:
	idJobThread						threads[MAX_JOB_THREADS];
	int								maxThreads;					// how many OS threads are spawned currently
	idStaticList< idParallelJobList *, MAX_JOBLISTS >	jobLists;

	// information about hardware:
	int								numPhysicalCpuCores;
	int								numLogicalCpuCores;
	int								numCpuPackages;
	bool							isRunningOnHdd;				// true => loading threads get limited (see jobs_maxHddThreads)

	void							ChangePhysicalThreadsCount(int wantPhysicalThreads);
};

// the single global job-manager instance, exposed through the abstract interface pointer
idParallelJobManagerLocal parallelJobManagerLocal;
idParallelJobManager * parallelJobManager = &parallelJobManagerLocal;

/*
========================
SubmitJobList
========================
*/
// Free-function entry point used by the job-list code; simply forwards
// to the global manager instance.
void SubmitJobList( idParallelJobList_Threads * jobList, int parallelism ) {
	parallelJobManagerLocal.Submit( jobList, parallelism );
}

/*
========================
idParallelJobManagerLocal::ChangePhysicalThreadsCount

Grows or shrinks the pool of spawned OS worker threads until exactly
wantPhysicalThreads (clamped to [0, MAX_JOB_THREADS]) of them exist.
========================
*/
void idParallelJobManagerLocal::ChangePhysicalThreadsCount(int wantPhysicalThreads) {
	// on consoles this will have specific cores for the threads, but on PC they will all be CORE_ANY
	core_t cores[] = JOB_THREAD_CORES;

	// never exceed the statically sized thread array
	const int target = idMath::ClampInt( 0, MAX_JOB_THREADS, wantPhysicalThreads );

	// spawn additional worker threads until the target count is reached
	for ( ; maxThreads < target; maxThreads++ ) {
		threads[maxThreads].Start( cores[maxThreads], maxThreads );
	}
	// stop surplus worker threads, highest index first
	while ( maxThreads > target ) {
		maxThreads--;
		threads[maxThreads].StopThread( false );
	}
}

/*
========================
idParallelJobManagerLocal::Init
========================
*/
void idParallelJobManagerLocal::Init() {
	// query hardware topology once at startup
	Sys_CPUCount( numLogicalCpuCores, numPhysicalCpuCores, numCpuPackages );
	idLib::Printf("CPU core count: %d physical, %d logical\n", numPhysicalCpuCores, numLogicalCpuCores);

	// detect whether the game is installed on a slow-seeking drive;
	// if so, the number of disk-reading job threads is limited later
	// (see GetNumProcessingUnits and jobs_maxHddThreads)
	isRunningOnHdd = Sys_IsFileOnHdd( Sys_EXEPath() );
	idLib::Printf( isRunningOnHdd
		? "HDD detected: number of loading threads limited\n"
		: "HDD not detected: loading threads unlimited\n" );

	assert(numLogicalCpuCores >= 0);
	maxThreads = 0;
	// note: some of these threads can run idle, not consuming CPU resources
	// idParallelJobManagerLocal::Submit runs joblist onto first K threads, where K is configured
	ChangePhysicalThreadsCount(numLogicalCpuCores);
}

/*
========================
idParallelJobManagerLocal::Shutdown
========================
*/
void idParallelJobManagerLocal::Shutdown() {
	for ( int i = 0; i < maxThreads; i++ ) {
		threads[i].StopThread();
	}
	maxThreads = 0;
}

/*
========================
idParallelJobManagerLocal::AllocJobList
========================
*/
// Allocates a new job list with the given id/priority and registers it with
// the manager. Returns the new list (caller releases it via FreeJobList).
// color is passed through to the job list for profiling/visualization.
idParallelJobList * idParallelJobManagerLocal::AllocJobList( jobListId_t id, jobListPriority_t priority, unsigned int maxJobs, unsigned int maxSyncs, const idColor * color ) {
	for ( int i = 0; i < jobLists.Num(); i++ ) {
		if ( jobLists[i]->GetId() == id ) {
			// idStudio may cause job lists to be allocated multiple times
			assert( false );
			// stgatilov: renderer joblists are persistent
			// utility joblists are not, but we should finish old one beforehand
		}
	}
	// guard against overflowing the fixed-capacity list: idStaticList::Append
	// fails silently when full, which would leak the new job list and leave it
	// untracked by the manager
	assert( jobLists.Num() < jobLists.Max() );
	idParallelJobList * jobList = new idParallelJobList( id, priority, maxJobs, maxSyncs, color );
	jobLists.Append( jobList );
	return jobList;
}

/*
========================
idParallelJobManagerLocal::FreeJobList
========================
*/
void idParallelJobManagerLocal::FreeJobList( idParallelJobList * jobList ) {
	if ( jobList == NULL ) {
		return;
	}
	// job list deletion is not thread safe: make sure no worker thread is still running
	for ( int threadNum = 0; threadNum < maxThreads; threadNum++ ) {
		threads[threadNum].WaitForThread();
	}
	const int index = jobLists.FindIndex( jobList );
	assert( index >= 0 && jobLists[index] == jobList );
	// drain any remaining jobs before destroying the list
	jobLists[index]->Wait();
	delete jobLists[index];
	jobLists.RemoveIndexFast( index );
}

/*
========================
idParallelJobManagerLocal::GetNumJobLists
========================
*/
int idParallelJobManagerLocal::GetNumJobLists() const {
	// number of currently allocated job lists
	return jobLists.Num();
}

/*
========================
idParallelJobManagerLocal::GetNumFreeJobLists
========================
*/
int idParallelJobManagerLocal::GetNumFreeJobLists() const {
	// remaining capacity of the fixed-size job list array
	return MAX_JOBLISTS - jobLists.Num();
}

/*
========================
idParallelJobManagerLocal::GetJobList
========================
*/
idParallelJobList * idParallelJobManagerLocal::GetJobList( int index ) {
	// note: no bounds checking here — index must be in [0, GetNumJobLists())
	return jobLists[index];
}

/*
========================
idParallelJobManagerLocal::WaitForAllJobLists
========================
*/
void idParallelJobManagerLocal::WaitForAllJobLists() {
	// block until every allocated job list has drained completely
	for ( int listNum = 0; listNum < jobLists.Num(); listNum++ ) {
		jobLists[listNum]->Wait();
	}
}

/*
========================
idParallelJobManagerLocal::GetNumProcessingUnits
========================
*/
int idParallelJobManagerLocal::GetNumProcessingUnits( int parallelism ) {
	// the disk flag is OR-ed into the parallelism mode; strip it before interpreting the rest
	const bool wantsDisk = ( parallelism & JOBLIST_PARALLELISM_FLAG_DISK ) != 0;
	parallelism &= ~JOBLIST_PARALLELISM_FLAG_DISK;

	// determine the number of threads to use
	int numThreads;
	switch ( parallelism ) {
		case JOBLIST_PARALLELISM_REALTIME: {
			// usually we have frontend and backend on dedicated threads (depending on com_smp)
			const int dedicatedOccupiedThreads = 1 + session->IsFrontendThreadUsed();
			// we assume that calling thread will wait immediately and free its core for one of the worker threads
			numThreads = numPhysicalCpuCores - dedicatedOccupiedThreads + 1;
			// leave a few cores for OS background tasks, and maybe for OpenGL driver since they often do work in separate thread
			numThreads += jobs_offsetThreadsRealtime.GetInteger();
			// an explicit cvar setting overrides the heuristic
			if ( jobs_numThreadsRealtime.GetInteger() >= 0 ) {
				numThreads = jobs_numThreadsRealtime.GetInteger();
			}
			break;
		}
		case JOBLIST_PARALLELISM_NONINTERACTIVE: {
			numThreads = numLogicalCpuCores;
			if ( jobs_numThreadsNonInteractive.GetInteger() >= 0 ) {
				numThreads = jobs_numThreadsNonInteractive.GetInteger();
			}
			break;
		}
		case JOBLIST_PARALLELISM_NONE: {
			numThreads = 0;
			break;
		}
		default: {
			idLib::Warning("Explicit number of threads %d used for testing", parallelism);
			numThreads = parallelism;
			break;
		}
	}

	// can't run more threads than we physically have
	// note: we should have spawned enough threads at game start
	numThreads = idMath::Imin(numThreads, maxThreads);

	if ( wantsDisk && isRunningOnHdd ) {
		// HDDs are slow at seeking: cap the number of concurrently reading threads
		numThreads = idMath::Imin(numThreads, jobs_maxHddThreads.GetInteger());
	}

	return numThreads;
}

/*
========================
idParallelJobManagerLocal::Submit
========================
*/
void idParallelJobManagerLocal::Submit( idParallelJobList_Threads * jobList, int parallelism ) {
	/*if ( jobs_numThreads.IsModified() ) {
		ChangePhysicalThreadsCount();
		jobs_numThreads.ClearModified();
	}*/

	const int numThreads = GetNumProcessingUnits( parallelism );

	if ( numThreads <= 0 ) {
		// no worker threads available/allowed: execute the whole list
		// synchronously on the calling thread instead
		threadJobListState_t state( jobList->GetVersion() );
		jobList->RunJobs( 0, state, false );
		return;
	}

	// hand the list to the first numThreads workers and wake them up
	for ( int threadNum = 0; threadNum < numThreads; threadNum++ ) {
		threads[threadNum].AddJobList( jobList );
		threads[threadNum].SignalWork();
	}
}