HPXCL: build_kernel_from

Example for for building a kernel from a file
// Copyright (c)       2015 Patrick Diehl
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include <hpx/hpx_main.hpp>
#include <hpx/include/iostreams.hpp>
#include <hpx/lcos/future.hpp>

#include <hpxcl/cuda.hpp>

#include <unistd.h>

#define SIZE 100000

using namespace hpx::cuda;

// hpx_main, is the actual main called by hpx
int main(int argc, char* argv[]) {

        //Vector for all futures for the data management
        std::vector<hpx::lcos::future<void>> data_futures;

        // Get list of available Cuda Devices.
        std::vector<device> devices = get_all_devices(2, 0).get();

        // Check whether there are any devices
        if (devices.size() < 1) {
                hpx::cerr << "No CUDA devices found!" << hpx::endl;
                return hpx::finalize();
        }

        // Generate Input data
        unsigned int* inputData;
        cudaMallocHost((void**)&inputData, sizeof(unsigned int)*SIZE);

        // Create a device component from the first device found
        device cudaDevice = devices[0];

        for (unsigned int i = 0; i < SIZE; i++)
        inputData[i] = 1;

        // Create a buffer
        buffer outbuffer = cudaDevice.create_buffer_sync(SIZE * sizeof(unsigned int));

        // Copy input data to the buffer
        data_futures.push_back(outbuffer.enqueue_write(0, SIZE * sizeof(unsigned int), inputData));

        // Create the hello_world device program
        program prog = cudaDevice.create_program_with_file("kernel.cu").get();

        // Add compiler flags for compiling the kernel

        std::vector<std::string> flags;
        std::string mode = "--gpu-architecture=compute_";
        mode.append(
        std::to_string(cudaDevice.get_device_architecture_major().get()));

        mode.append(
        std::to_string(cudaDevice.get_device_architecture_minor().get()));

        flags.push_back(mode);

        // Compile the program
        prog.build_sync(flags,"sum");

        // Create the buffer for the result
        unsigned int* result;
        cudaMallocHost((void**)&result,sizeof(unsigned int));
        result[0] = 0;
        buffer resbuffer = cudaDevice.create_buffer_sync(sizeof(unsigned int));
        data_futures.push_back(resbuffer.enqueue_write(0,sizeof(unsigned int), result));

        //Create the buffer for the length of the array
        unsigned int* n;
        cudaMallocHost((void**)&n,sizeof(unsigned int));
        result[0] = SIZE;
        buffer lengthbuffer = cudaDevice.create_buffer_sync(sizeof(unsigned int));
        data_futures.push_back(lengthbuffer.enqueue_write(0,sizeof(unsigned int), n));

        //Generate the grid and block dim
        hpx::cuda::server::program::Dim3 grid;
        hpx::cuda::server::program::Dim3 block;

        //Set the values for the grid dimension
        grid.x = 1;
        grid.y = 1;
        grid.z = 1;

        //Set the values for the block dimension
        block.x = 32;
        block.y = 1;
        block.z = 1;

        //Set the parameter for the kernel, have to be the same order as in the definition
        std::vector<hpx::cuda::buffer>args;
        args.push_back(outbuffer);
        args.push_back(resbuffer);
        args.push_back(lengthbuffer);

        hpx::wait_all(data_futures);

        //Run the kernel at the default stream
        auto kernel_future = prog.run(args,"sum",grid,block);

        hpx::wait_all(kernel_future);

        //Copy the result back
        unsigned int* res = resbuffer.enqueue_read_sync<unsigned int>(0,sizeof(unsigned int));

        hpx::cout << "Result is " << res[0] << " and is ";

        //Check if result is correct

        if (res[0] != SIZE)
                hpx::cout << "wrong" << hpx::endl;
        else
                hpx::cout << "correct" << hpx::endl;

        return EXIT_SUCCESS;
}