Added MontaCarlo project

4 years ago · 9cb7f2abd4
8 changed files with 13638 additions and 0 deletions
--- a/MonteCarlo/Common.h
+++ b/MonteCarlo/Common.h
@ -0,0 +1,149 @@
+#pragma once
+
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "cl.hpp"
+
+#pragma warning( disable : 4996 )
+
+void WriteTGA_RGB(const char* filename, unsigned char* data, unsigned int width, unsigned int height)
+{
+	FILE *f = fopen(filename, "wb");
+	if (!f) {
+		fprintf(stderr, "Unable to create output TGA image `%s'\n", filename);
+		exit(EXIT_FAILURE);
+	}
+
+	fputc(0x00, f); /* ID Length, 0 => No ID        */
+	fputc(0x00, f); /* Color Map Type, 0 => No color map included   */
+	fputc(0x02, f); /* Image Type, 2 => Uncompressed, True-color Image */
+	fputc(0x00, f); /* Next five bytes are about the color map entries */
+	fputc(0x00, f); /* 2 bytes Index, 2 bytes length, 1 byte size */
+	fputc(0x00, f);
+	fputc(0x00, f);
+	fputc(0x00, f);
+	fputc(0x00, f); /* X-origin of Image    */
+	fputc(0x00, f);
+	fputc(0x00, f); /* Y-origin of Image    */
+	fputc(0x00, f);
+	fputc(width & 0xff, f); /* Image Width      */
+	fputc((width >> 8) & 0xff, f);
+	fputc(height & 0xff, f); /* Image Height     */
+	fputc((height >> 8) & 0xff, f);
+	fputc(0x18, f); /* Pixel Depth, 0x18 => 24 Bits */
+	fputc(0x20, f); /* Image Descriptor     */
+
+	for (int y = height - 1; y >= 0; y--) {
+		for (size_t x = 0; x < width; x++) {
+			const size_t i = (y * width + x) * 3;
+			fputc(data[i + 2], f); /* write blue */
+			fputc(data[i + 1], f); /* write green */
+			fputc(data[i], f); /* write red */
+		}
+	}
+}
+
+std::string FileToString(const std::string& path) {
+	std::ifstream file(path, std::ios::in | std::ios::binary);
+	if (file)
+	{
+		std::ostringstream contents;
+		contents << file.rdbuf();
+		file.close();
+		return(contents.str());
+	}
+	return std::string();
+	
+}
+
+const char *getErrorString(cl_int error)
+{
+	switch (error) {
+		// run-time and JIT compiler errors
+	case 0: return "CL_SUCCESS";
+	case -1: return "CL_DEVICE_NOT_FOUND";
+	case -2: return "CL_DEVICE_NOT_AVAILABLE";
+	case -3: return "CL_COMPILER_NOT_AVAILABLE";
+	case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+	case -5: return "CL_OUT_OF_RESOURCES";
+	case -6: return "CL_OUT_OF_HOST_MEMORY";
+	case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
+	case -8: return "CL_MEM_COPY_OVERLAP";
+	case -9: return "CL_IMAGE_FORMAT_MISMATCH";
+	case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
+	case -11: return "CL_BUILD_PROGRAM_FAILURE";
+	case -12: return "CL_MAP_FAILURE";
+	case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
+	case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
+	case -15: return "CL_COMPILE_PROGRAM_FAILURE";
+	case -16: return "CL_LINKER_NOT_AVAILABLE";
+	case -17: return "CL_LINK_PROGRAM_FAILURE";
+	case -18: return "CL_DEVICE_PARTITION_FAILED";
+	case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
+
+		// compile-time errors
+	case -30: return "CL_INVALID_VALUE";
+	case -31: return "CL_INVALID_DEVICE_TYPE";
+	case -32: return "CL_INVALID_PLATFORM";
+	case -33: return "CL_INVALID_DEVICE";
+	case -34: return "CL_INVALID_CONTEXT";
+	case -35: return "CL_INVALID_QUEUE_PROPERTIES";
+	case -36: return "CL_INVALID_COMMAND_QUEUE";
+	case -37: return "CL_INVALID_HOST_PTR";
+	case -38: return "CL_INVALID_MEM_OBJECT";
+	case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+	case -40: return "CL_INVALID_IMAGE_SIZE";
+	case -41: return "CL_INVALID_SAMPLER";
+	case -42: return "CL_INVALID_BINARY";
+	case -43: return "CL_INVALID_BUILD_OPTIONS";
+	case -44: return "CL_INVALID_PROGRAM";
+	case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
+	case -46: return "CL_INVALID_KERNEL_NAME";
+	case -47: return "CL_INVALID_KERNEL_DEFINITION";
+	case -48: return "CL_INVALID_KERNEL";
+	case -49: return "CL_INVALID_ARG_INDEX";
+	case -50: return "CL_INVALID_ARG_VALUE";
+	case -51: return "CL_INVALID_ARG_SIZE";
+	case -52: return "CL_INVALID_KERNEL_ARGS";
+	case -53: return "CL_INVALID_WORK_DIMENSION";
+	case -54: return "CL_INVALID_WORK_GROUP_SIZE";
+	case -55: return "CL_INVALID_WORK_ITEM_SIZE";
+	case -56: return "CL_INVALID_GLOBAL_OFFSET";
+	case -57: return "CL_INVALID_EVENT_WAIT_LIST";
+	case -58: return "CL_INVALID_EVENT";
+	case -59: return "CL_INVALID_OPERATION";
+	case -60: return "CL_INVALID_GL_OBJECT";
+	case -61: return "CL_INVALID_BUFFER_SIZE";
+	case -62: return "CL_INVALID_MIP_LEVEL";
+	case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
+	case -64: return "CL_INVALID_PROPERTY";
+	case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
+	case -66: return "CL_INVALID_COMPILER_OPTIONS";
+	case -67: return "CL_INVALID_LINKER_OPTIONS";
+	case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
+
+		// extension errors
+	case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
+	case -1001: return "CL_PLATFORM_NOT_FOUND_KHR";
+	case -1002: return "CL_INVALID_D3D10_DEVICE_KHR";
+	case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR";
+	case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
+	case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
+	default: return "Unknown OpenCL error";
+	}
+}
+
+bool CheckCLError(cl_int err)
+{
+	if(err != CL_SUCCESS)
+	{
+		std::cout << "OpenCL error: " << getErrorString(err) << std::endl;
+		return false;
+	}
+
+	return true;
+}
--- a/MonteCarlo/MonteCarlo.cpp
+++ b/MonteCarlo/MonteCarlo.cpp
@ -0,0 +1,200 @@
+// MonteCarlo.cpp : Defines the entry point for the console application.
+
+#include <string>
+#include <vector>
+#include "Common.h"
+
+// OpenCL C API
+#include <CL/opencl.h>
+
+// OpenCL C++ API
+#include "cl.hpp"
+
+const bool writeOutRandoms = true;
+const size_t randomNumbers = 1024;
+const size_t threadCount = 512;
+
+void capi()
+{
+	// Get a platform ID
+	cl_platform_id platformID;
+	clGetPlatformIDs(1, &platformID, NULL);
+	
+	// Get a device ID
+	cl_device_id deviceID;
+	clGetDeviceIDs(platformID, CL_DEVICE_TYPE_GPU, 1, &deviceID, NULL);
+	
+	// Create a context
+	cl_context context;
+	cl_context_properties contextProperties[] =
+	{ CL_CONTEXT_PLATFORM, (cl_context_properties)platformID, 0 };
+	context = clCreateContext(contextProperties, 1, &deviceID, NULL, NULL, NULL);
+	
+	// Create a command queue
+	cl_command_queue queue;
+	queue = clCreateCommandQueue(context, deviceID, CL_QUEUE_PROFILING_ENABLE, NULL);
+	
+	// Create an OpenCL program
+	std::string source = FileToString("../kernels/programs.cl");
+	const char* csource = source.c_str();
+	cl_program program = clCreateProgramWithSource(context, 1, &csource, NULL, NULL);
+	cl_int err = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);
+	if (err != CL_SUCCESS)
+	{
+		cl_uint logLength;
+		clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength);
+		char* log = new char[logLength];
+		clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, logLength, log, 0);
+		std::cout << log << std::endl;
+		delete[] log;
+		exit(-1);
+	}
+
+	// Get the kernel handle
+	cl_kernel kernel = clCreateKernel(program, "randomLCG", &err);
+	if(!CheckCLError(err)) exit(-1);
+
+	// Allocate memory for random numbers and random seeds
+	// Every thread receives a private seed, stored in seedBuffer/clSeedBuffer
+	// Every thread is supposed to generate randomNumbers random numbers
+	std::vector<float> randomBuffer(threadCount * randomNumbers);
+	std::vector<float> seedBuffer(threadCount);
+	for (int i = 0; i < threadCount; ++i)
+	{
+		seedBuffer[i] = rand();
+	}
+
+	cl_mem randomBufferDev;
+	randomBufferDev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)* threadCount * randomNumbers, NULL, &err);
+	if (!CheckCLError(err)) exit(-1);
+	cl_mem seedBufferDev;
+	seedBufferDev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * threadCount, NULL, &err);
+	if (!CheckCLError(err)) exit(-1);
+
+	clEnqueueWriteBuffer(queue, seedBufferDev, CL_TRUE, 0, sizeof(float) * threadCount, seedBuffer.data(), 0, NULL, NULL);
+
+	// Set the kernel paramateres
+	clSetKernelArg(kernel, 0, sizeof(int), &randomNumbers);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), &seedBufferDev);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), &randomBufferDev);
+	// Enqueue the kernel: threadCount threads in total, each generating randomNumbers random numbers in [0,1]
+	clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &threadCount, NULL, 0, NULL, NULL);
+	// Copy the result back to the host
+	clEnqueueReadBuffer(queue, randomBufferDev, CL_TRUE, 0, sizeof(float) * threadCount * randomNumbers, randomBuffer.data(), 0, NULL, NULL);
+
+	// Write out the output
+	if(writeOutRandoms == true)
+	{
+		std::ofstream ofs("randoms.txt", std::ofstream::out);
+		for (int i = 0; i < threadCount; ++i) {
+			for (int j = 0; j < randomNumbers; ++j) {
+				ofs << j << " " << randomBuffer[i + j * threadCount] << std::endl;
+			}
+			ofs << std::endl;
+		}
+		ofs.close();
+	}
+
+	std::cout << "Finished" << std::endl;
+}
+
+void cppapi()
+{
+	cl_int err = CL_SUCCESS;
+
+	// Get a platform ID
+	std::vector<cl::Platform> platforms;
+	cl::Platform::get(&platforms);
+	if (platforms.size() == 0)
+	{
+		std::cout << "Unable to find suitable platform." << std::endl;
+		exit(-1);
+	}
+
+	// Create a context
+	cl_context_properties properties[] =
+	{ CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0 };
+	cl::Context context(CL_DEVICE_TYPE_GPU, properties);
+
+	// Enumerate the devices
+	std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
+
+	// Create the command queue
+	cl::Event event;
+	cl::CommandQueue queue(context, devices[0], 0, &err);
+
+	// Create the OpenCL program
+	std::string programSource = FileToString("../kernels/programs.cl");
+	cl::Program program = cl::Program(context, programSource);
+	err = program.build(devices);
+	if (!CheckCLError(err))
+	{
+		for (size_t devID = 0; devID < devices.size(); ++devID)
+		{
+			std::cout << "Device: " << devID << std::endl;
+			std::cout << "Build Status: " << program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(devices[devID]) << std::endl;
+			std::cout << "Build Options:\t" << program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(devices[devID]) << std::endl;
+			std::cout << "Build Log:\t " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[devID]) << std::endl;
+			std::cout << "--------------------------------------------------" << std::endl;
+		}
+	}
+	
+
+	// Get the kernel handle
+	cl::Kernel kernel(program, "randomLCG", &err);
+	CheckCLError(err);
+
+	// Allocate memory for random numbers and random seeds
+	// Every thread receives a private seed, stored in seedBuffer/clSeedBuffer
+	// Every thread is supposed to generate randomNumbers random numbers
+	std::vector<float> randomBuffer(threadCount * randomNumbers);
+	std::vector<float> seedBuffer(threadCount);
+	for (int i = 0; i < threadCount; ++i)
+	{
+		seedBuffer[i] = rand();
+	}
+	
+	cl::Buffer clSeedBuffer = cl::Buffer(context, CL_MEM_READ_ONLY, sizeof(float) * threadCount, NULL, &err);
+	queue.enqueueWriteBuffer(clSeedBuffer, true, 0, sizeof(float) * threadCount, seedBuffer.data());
+
+	cl::Buffer clRandomBuffer = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * threadCount * randomNumbers, NULL, &err);
+
+	// Set the kernel parameters	
+	kernel.setArg(0, randomNumbers);
+	kernel.setArg(1, clSeedBuffer);
+	kernel.setArg(2, clRandomBuffer);
+
+	// Enqueue the kernel: threadCount threads in total, each generating random numbers in [0,1] randomNumbers times
+	queue.enqueueNDRangeKernel(kernel,
+		cl::NullRange,
+		cl::NDRange(threadCount, 1),
+		cl::NullRange,
+		NULL,
+		&event);
+	event.wait();
+
+	// Copy result back to host
+	queue.enqueueReadBuffer(clRandomBuffer, true, 0, sizeof(float) * threadCount * randomNumbers, randomBuffer.data());
+
+	// Write out the output to file as "index value" rows
+	if(writeOutRandoms == true)
+	{
+		std::ofstream ofs("randoms2.txt", std::ofstream::out);
+		for (int i = 0; i < threadCount; ++i) {
+			for (int j = 0; j < randomNumbers; ++j) {
+				ofs << j << " " << randomBuffer[i + j * threadCount] << std::endl;
+			}
+		}
+		ofs.close();
+	}
+	
+	std::cout << "Finished" << std::endl;
+}
+
+int main()
+{
+	capi();
+	cppapi();
+	return 0;
+}
+
--- a/MonteCarlo/MonteCarlo.vcxproj
+++ b/MonteCarlo/MonteCarlo.vcxproj
@ -0,0 +1,87 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{4B53AC9F-C842-4E17-93EB-1362CFD08104}</ProjectGuid>
+    <RootNamespace>MonteCarlo</RootNamespace>
+    <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v143</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v100</PlatformToolset>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <OutDir>$(SolutionDir)bin\</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>$(CUDA_PATH)\include</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32\</AdditionalLibraryDirectories>
+      <AdditionalDependencies>OpenCL.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <AdditionalIncludeDirectories>$(CUDA_PATH)\include</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32\</AdditionalLibraryDirectories>
+      <AdditionalDependencies>OpenCL.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClInclude Include="cl.hpp" />
+    <ClInclude Include="Common.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="MonteCarlo.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\kernels\mersenneTwister.cl" />
+    <None Include="..\kernels\montecarlo.cl" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/MonteCarlo/MonteCarlo.vcxproj.filters
+++ b/MonteCarlo/MonteCarlo.vcxproj.filters
@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+      <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+    <Filter Include="Kernels">
+      <UniqueIdentifier>{fd13ccc5-a98b-4e30-9eba-12f62c7dd566}</UniqueIdentifier>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="cl.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="Common.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="MonteCarlo.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\kernels\montecarlo.cl">
+      <Filter>Kernels</Filter>
+    </None>
+    <None Include="..\kernels\mersenneTwister.cl">
+      <Filter>Kernels</Filter>
+    </None>
+  </ItemGroup>
+</Project>
--- a/MonteCarlo/cl.hpp
+++ b/MonteCarlo/cl.hpp
--- a/kernels/mersenneTwister.cl
+++ b/kernels/mersenneTwister.cl
@ -0,0 +1,75 @@
+/* -*- mode: c++ -*- */
+
+/*
+
+  Some parts are derived from NVIDIA's MersenneTwister SDK example.
+
+ */
+
+typedef struct{
+  unsigned int matrix_a;
+  unsigned int mask_b;
+  unsigned int mask_c;
+  unsigned int seed;
+} mt_struct_stripped;
+
+#define   MT_RNG_COUNT 4096
+#define          MT_MM 9
+#define          MT_NN 19
+#define       MT_WMASK 0xFFFFFFFFU
+#define       MT_UMASK 0xFFFFFFFEU
+#define       MT_LMASK 0x1U
+#define      MT_SHIFT0 12
+#define      MT_SHIFTB 7
+#define      MT_SHIFTC 15
+#define      MT_SHIFT1 18
+#define PI 3.14159265358979f
+
+__kernel void MersenneTwister(__global float* d_Rand,
+                              __global mt_struct_stripped* d_MT,
+                              int nPerRng)
+{
+    int globalID = get_global_id(0);
+
+    int iState, iState1, iStateM, iOut;
+    unsigned int mti, mti1, mtiM, x;
+    unsigned int mt[MT_NN], matrix_a, mask_b, mask_c;
+
+    //Load bit-vector Mersenne Twister parameters
+    matrix_a = d_MT[globalID].matrix_a;
+    mask_b   = d_MT[globalID].mask_b;
+    mask_c   = d_MT[globalID].mask_c;
+
+    //Initialize current state
+    mt[0] = d_MT[globalID].seed;
+    for (iState = 1; iState < MT_NN; iState++)
+        mt[iState] = (1812433253U * (mt[iState - 1] ^ (mt[iState - 1] >> 30)) + iState) & MT_WMASK;
+
+    iState = 0;
+    mti1 = mt[0];
+    for (iOut = 0; iOut < nPerRng; iOut++) {
+        iState1 = iState + 1;
+        iStateM = iState + MT_MM;
+        if(iState1 >= MT_NN) iState1 -= MT_NN;
+        if(iStateM >= MT_NN) iStateM -= MT_NN;
+        mti  = mti1;
+        mti1 = mt[iState1];
+        mtiM = mt[iStateM];
+
+            // MT recurrence
+        x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
+            x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);
+
+        mt[iState] = x;
+        iState = iState1;
+
+        //Tempering transformation
+        x ^= (x >> MT_SHIFT0);
+        x ^= (x << MT_SHIFTB) & mask_b;
+        x ^= (x << MT_SHIFTC) & mask_c;
+        x ^= (x >> MT_SHIFT1);
+
+        //Convert to (0, 1] float and write to global memory
+        d_Rand[globalID + iOut * MT_RNG_COUNT] = ((float)x + 1.0f) / 4294967296.0f;
+    }
+}
--- a/kernels/mersenneTwister.dat
+++ b/kernels/mersenneTwister.dat
--- a/kernels/montecarlo.cl
+++ b/kernels/montecarlo.cl
@ -0,0 +1,150 @@
+/* -*- mode: c++ -*- */
+
+// Linear Congruential Generator
+uint stepLCG(uint *z, uint A, uint C){
+  return (*z) = (A * (*z) + C);
+}
+
+__kernel
+void randomLCG(const int randomNumbers, __global float* randomsSeed, __global float* randomGPU){
+  int id = get_global_id(0);
+  int maxID = get_global_size(0);
+
+  uint rng = randomsSeed[id];
+  for(int i=0; i < randomNumbers; ++i){
+    randomGPU[id + i * maxID] =  (float)stepLCG(&rng, 1664525, 1013904223UL) / 0xffffffff;
+  }
+}
+
+uint stepLFG(uint *z, __global uint *znmk, uint A, uint C){
+  return (*znmk) = (*z) = (A * (*z) + C) + (*znmk);
+}
+
+// Lagged Fibonacci Generator
+__kernel
+void randomLFG(const int randomNumbers, __global float* randomsSeed, const int randomStateSize, __global uint* randomState, __global float* randomGPU){
+  int id = get_global_id(0);
+  int maxID = get_global_size(0);
+
+  // bootstrap
+  uint rng = randomsSeed[id];
+  for(int i=0; i < randomStateSize; ++i){
+    randomState[id + i * maxID] =  stepLCG(&rng, 1664525, 1013904223UL);
+  }
+
+  // Lagged Fibonacci Generator
+  int nmkIndex = 0;
+  for(int i=0; i < randomNumbers; ++i){
+    randomGPU[id + i * maxID] = (float)stepLFG(&rng, &randomState[nmkIndex], 1664525, 1013904223UL) / 0xffffffff;
+    nmkIndex = (nmkIndex + 1) % randomStateSize;
+  }
+
+}
+
+// Combined Tausworthe Generator
+uint stepCTG(uint *z, uint S1, uint S2, uint S3, uint M){
+  uint b=((((*z)<<S1)^(*z))>>S2);
+  return (*z) = ((((*z)&M)<<S3)^b);
+}
+
+__kernel
+void randomCTG(const int randomNumbers, __global float2* randomsSeed, __global float* randomGPU){
+  int id = get_global_id(0);
+  int maxID = get_global_size(0);
+
+  uint rng1 = randomsSeed[id].x;
+  uint rng2 = randomsSeed[id].y;
+  for(int i=0; i < randomNumbers; ++i){
+    randomGPU[id + i * maxID] =  (float)(stepCTG(&rng1, 13, 19, 12, 4294967294UL) ^
+					 stepCTG(&rng2, 2, 25, 4, 4294967288UL)) / 0xffffffff;
+  }
+}
+
+// Hybrid RNG
+float stepHybrid(uint* rng1, uint* rng2, uint* rng3, uint* rng4){
+  return 2.3283064365387e-10 * (
+				stepCTG(rng1, 13, 19, 12, 4294967294UL) ^
+				stepCTG(rng2, 2, 25, 4, 4294967288UL) ^
+				stepCTG(rng3, 3, 11, 17, 4294967280UL) ^
+				stepLCG(rng4,1664525,1013904223UL)
+				);
+}
+
+__kernel
+void hybridRNG(const int randomNumbers, __global float* randomsSeed, __global float* randomGPU){
+  int id = get_global_id(0);
+  int maxID = get_global_size(0);
+
+  uint rng1 = randomsSeed[id * 4 + 0];
+  uint rng2 = randomsSeed[id * 4 + 1];
+  uint rng3 = randomsSeed[id * 4 + 2];
+  uint rng4 = randomsSeed[id * 4 + 3];
+
+  for(int i = 0; i < randomNumbers; ++i){
+    randomGPU[id + i * maxID] =  (float)stepHybrid(&rng1, &rng2, &rng3, &rng4);
+  }
+}
+
+// Halton sequence
+float stepHalton(float *value, float inv_base){
+  float r = 1.0 - (*value) - 0.0000000001;
+  if(inv_base < r) {
+    (*value) += inv_base;
+  } else {
+    float h = inv_base, hh;
+    do{
+      hh = h;
+      h *= inv_base;
+    } while (h >= r);
+    (*value) += hh + h - 1.0;
+  }
+  return (*value);
+}
+
+void seedHalton(ulong i, int base, float* inv_base, float* value){
+  float f = (*inv_base) = 1.0/base;
+  (*value) = 0.0;
+  while( i > 0){
+    (*value) += f * (float)(i % base);
+    i /= base;
+    f *= (*inv_base);
+  }
+}
+
+__kernel
+void haltonSequence(const int randomNumbers, const int base, __global float* randomGPU){
+  int id = get_global_id(0);
+  int maxID = get_global_size(0);
+
+  float inv_base = 0.0;
+  float rng = 0.0;
+  seedHalton(id * randomNumbers, base, &inv_base, &rng);
+
+  for(int i=0; i < randomNumbers; ++i){
+    randomGPU[id + i * maxID] =  stepHalton(&rng, inv_base);
+  }
+}
+
+// 1D uniformity test
+// TODO
+// Generate a quantized histogram
+// randomNums = number of randoms per thread
+// randoms    = array of random numbers
+// bucketNum  = number of histogram buckets
+// buckets    = array of histogram buckets
+__kernel
+void testUniform1D(const int randomNums, __global float* randoms, const int bucketNum, __global int* buckets){
+  
+}
+
+// 1D Monte-Carlo integral
+// TODO
+// Implement a Monte Carlo integrator: sin(x) ; x := [0:PI/2]
+// sampleNumber = number of samples per thread
+// seed         = float4 seed array for the random number generator
+// integral     = partial integral
+#define M_PIP2 1.57796327f
+__kernel
+void mcInt1D(const int sampleNumber, __global float4* seed, __global float* integral){
+  
+}