From 809f924db2823b9e1eaf3efb9370380edc1f76ed Mon Sep 17 00:00:00 2001 From: Joseph Redmon <pjreddie@gmail.com> Date: Thu, 22 Jan 2015 16:38:24 -0800 Subject: [PATCH] CUDA so fast --- Makefile | 59 +-- src/activation_kernels.cu | 75 ++++ src/activations.c | 62 --- src/activations.cl | 62 --- src/activations.h | 7 +- src/axpy.c | 134 ------- src/axpy.cl | 24 -- src/blas.c | 28 ++ src/blas.h | 23 ++ src/blas_kernels.cu | 62 +++ src/cnn.c | 49 ++- src/col2im.c | 66 ---- src/col2im.h | 13 + src/{col2im.cl => col2im_kernels.cu} | 29 +- src/connected_layer.c | 70 ++-- src/connected_layer.h | 17 +- src/convolutional_kernels.cu | 164 ++++++++ src/convolutional_layer.c | 200 +--------- src/convolutional_layer.cl | 31 -- src/convolutional_layer.h | 27 +- src/cost_layer.c | 49 +-- src/cost_layer.h | 7 +- src/crop_layer.c | 45 +-- src/crop_layer.cl | 16 - src/crop_layer.h | 5 +- src/crop_layer_kernels.cu | 41 ++ src/cuda.c | 99 +++++ src/cuda.h | 21 + src/data.c | 1 + src/dropout_layer.c | 63 +-- src/dropout_layer.cl | 5 - src/dropout_layer.h | 9 +- src/dropout_layer_kernels.cu | 33 ++ src/gemm.c | 360 ++++-------------- src/gemm.cl | 217 ----------- src/gemm.h | 29 ++ src/gemm_fast.cl | 76 ---- src/im2col.c | 103 +---- src/im2col.h | 15 + src/{im2col.cl => im2col_kernels.cu} | 45 ++- src/maxpool_layer.c | 78 +--- src/maxpool_layer.h | 12 +- ...pool_layer.cl => maxpool_layer_kernels.cu} | 37 +- src/mini_blas.c | 67 ---- src/mini_blas.h | 70 ---- src/network.c | 14 +- src/network.h | 8 +- src/{network_gpu.c => network_kernels.cu} | 82 ++-- src/opencl.c | 222 ----------- src/opencl.h | 34 -- src/parser.c | 34 +- src/softmax_layer.c | 75 +--- src/softmax_layer.cl | 21 - src/softmax_layer.h | 10 +- src/softmax_layer_kernels.cu | 72 ++++ src/utils.c | 16 +- src/utils.h | 2 +- 57 files changed, 1115 insertions(+), 2180 deletions(-) create mode 100644 src/activation_kernels.cu delete mode 100644 src/activations.cl delete mode 100644 src/axpy.c delete mode 100644 src/axpy.cl create mode 100644 src/blas.c create mode 100644 src/blas.h create mode 100644 src/blas_kernels.cu create mode 100644 src/col2im.h rename src/{col2im.cl => col2im_kernels.cu} (61%) create mode 100644 src/convolutional_kernels.cu delete mode 100644 src/convolutional_layer.cl delete mode 100644 src/crop_layer.cl create mode 100644 src/crop_layer_kernels.cu create mode 100644 src/cuda.c create mode 100644 src/cuda.h delete mode 100644 src/dropout_layer.cl create mode 100644 src/dropout_layer_kernels.cu delete mode 100644 src/gemm.cl create mode 100644 src/gemm.h delete mode 100644 src/gemm_fast.cl create mode 100644 src/im2col.h rename src/{im2col.cl => im2col_kernels.cu} (57%) rename src/{maxpool_layer.cl => maxpool_layer_kernels.cu} (57%) delete mode 100644 src/mini_blas.c delete mode 100644 src/mini_blas.h rename src/{network_gpu.c => network_kernels.cu} (78%) delete mode 100644 src/opencl.c delete mode 100644 src/opencl.h delete mode 100644 src/softmax_layer.cl create mode 100644 src/softmax_layer_kernels.cu diff --git a/Makefile b/Makefile index 3bf04a3..e48e142 100644 --- a/Makefile +++ b/Makefile @@ -1,48 +1,49 @@ GPU=1 -CLBLAS=0 +DEBUG=0 + +VPATH=./src/ +EXEC=cnn +OBJDIR=./obj/ CC=gcc -COMMON=-Wall -Wfatal-errors `pkg-config --cflags opencv` -I/usr/local/cuda/include/ -ifeq ($(GPU), 1) -COMMON+=-DGPU +NVCC=nvcc +OPTS=-O3 +LINKER=$(CC) +LDFLAGS=`pkg-config --libs opencv` -lm -pthread +COMMON=`pkg-config --cflags opencv` -I/usr/local/cuda/include/ +CFLAGS=-Wall -Wfatal-errors +CFLAGS+=$(OPTS) + +ifeq ($(DEBUG), 1) +COMMON+=-O0 -g +CFLAGS+=-O0 -g endif -ifeq ($(CLBLAS), 1) -COMMON+=-DCLBLAS -LDFLAGS=-lclBLAS +ifeq ($(GPU), 1) +LINKER=$(NVCC) +COMMON+=-DGPU +CFLAGS+=-DGPU +LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas endif -UNAME = $(shell uname) -#OPTS=-Ofast -flto -OPTS=-O3 -flto -ifeq ($(UNAME), Darwin) -COMMON+= -isystem /usr/local/Cellar/opencv/2.4.6.1/include/opencv -isystem /usr/local/Cellar/opencv/2.4.6.1/include -ifeq ($(GPU), 1) -LDFLAGS= -framework OpenCL -endif -else -OPTS+= -march=native -ifeq ($(GPU), 1) -LDFLAGS+= -lOpenCL -endif +#OBJ=network.o network_gpu.o image.o cnn.o connected_layer.o maxpool_layer.o activations.o list.o option_list.o parser.o utils.o data.o matrix.o softmax_layer.o convolutional_layer.o gemm.o normalization_layer.o opencl.o im2col.o col2im.o axpy.o dropout_layer.o crop_layer.o freeweight_layer.o cost_layer.o server.o +OBJ=gemm.o utils.o cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o normalization_layer.o parser.o option_list.o cnn.o +ifeq ($(GPU), 1) +OBJ+=convolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o softmax_layer_kernels.o network_kernels.o endif -CFLAGS= $(COMMON) $(OPTS) -#CFLAGS= $(COMMON) -O0 -g -LDFLAGS+=`pkg-config --libs opencv` -lm -pthread -VPATH=./src/ -EXEC=cnn -OBJDIR=./obj/ -OBJ=network.o network_gpu.o image.o cnn.o connected_layer.o maxpool_layer.o activations.o list.o option_list.o parser.o utils.o data.o matrix.o softmax_layer.o mini_blas.o convolutional_layer.o gemm.o normalization_layer.o opencl.o im2col.o col2im.o axpy.o dropout_layer.o crop_layer.o freeweight_layer.o cost_layer.o server.o OBJS = $(addprefix $(OBJDIR), $(OBJ)) all: $(EXEC) $(EXEC): $(OBJS) - $(CC) $(CFLAGS) $(LDFLAGS) $^ -o $@ + $(CC) $(COMMON) $(CFLAGS) $(LDFLAGS) $^ -o $@ $(OBJDIR)%.o: %.c - $(CC) $(CFLAGS) -c $< -o $@ + $(CC) $(COMMON) $(CFLAGS) -c $< -o $@ + +$(OBJDIR)%.o: %.cu + $(NVCC) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@ .PHONY: clean diff --git a/src/activation_kernels.cu b/src/activation_kernels.cu new file mode 100644 index 0000000..7703d43 --- /dev/null +++ b/src/activation_kernels.cu @@ -0,0 +1,75 @@ +extern "C" { +#include "activations.h" +#include "cuda.h" +} + +__device__ float linear_activate_kernel(float x){return x;} +__device__ float sigmoid_activate_kernel(float x){return 1./(1. + exp(-x));} +__device__ float relu_activate_kernel(float x){return x*(x>0);} +__device__ float ramp_activate_kernel(float x){return x*(x>0)+.1*x;} +//__device__ float ramp_activate_kernel(float x){return 0;} +__device__ float tanh_activate_kernel(float x){return (exp(2*x)-1)/(exp(2*x)+1);} + +__device__ float linear_gradient_kernel(float x){return 1;} +__device__ float sigmoid_gradient_kernel(float x){return (1-x)*x;} +__device__ float relu_gradient_kernel(float x){return (x>0);} +__device__ float ramp_gradient_kernel(float x){return (x>0)+.1;} +__device__ float tanh_gradient_kernel(float x){return 1-x*x;} + +__device__ float activate_kernel(float x, ACTIVATION a) +{ + switch(a){ + case LINEAR: + return linear_activate_kernel(x); + case SIGMOID: + return sigmoid_activate_kernel(x); + case RELU: + return relu_activate_kernel(x); + case RAMP: + return ramp_activate_kernel(x); + case TANH: + return tanh_activate_kernel(x); + } + return 0; +} + +__device__ float gradient_kernel(float x, ACTIVATION a) +{ + switch(a){ + case LINEAR: + return linear_gradient_kernel(x); + case SIGMOID: + return sigmoid_gradient_kernel(x); + case RELU: + return relu_gradient_kernel(x); + case RAMP: + return ramp_gradient_kernel(x); + case TANH: + return tanh_gradient_kernel(x); + } + return 0; +} + +__global__ void activate_array_kernel(float *x, int n, ACTIVATION a) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(i < n) x[i] = activate_kernel(x[i], a); +} + +__global__ void gradient_array_kernel(float *x, int n, ACTIVATION a, float *delta) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(i < n) delta[i] *= gradient_kernel(x[i], a); +} + +extern "C" void activate_array_ongpu(float *x, int n, ACTIVATION a) +{ + activate_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a); + check_error(cudaPeekAtLastError()); +} + +extern "C" void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta) +{ + gradient_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a, delta); + check_error(cudaPeekAtLastError()); +} diff --git a/src/activations.c b/src/activations.c index 4232efa..48dce87 100644 --- a/src/activations.c +++ b/src/activations.c @@ -98,65 +98,3 @@ void gradient_array(const float *x, const int n, const ACTIVATION a, float *delt } } -#ifdef GPU - -#include "opencl.h" -#include <math.h> - -cl_kernel get_activation_kernel() -{ - static int init = 0; - static cl_kernel kernel; - if(!init){ - kernel = get_kernel("src/activations.cl", "activate_array", 0); - init = 1; - } - return kernel; -} - -void activate_array_ongpu(cl_mem x, int n, ACTIVATION a) -{ - cl_kernel kernel = get_activation_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(kernel, i++, sizeof(x), (void*) &x); - cl.error = clSetKernelArg(kernel, i++, sizeof(n), (void*) &n); - cl.error = clSetKernelArg(kernel, i++, sizeof(a), (void*) &a); - check_error(cl); - - size_t gsize = n; - - cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, &gsize, 0, 0, 0, 0); - check_error(cl); -} - -cl_kernel get_gradient_kernel() -{ - static int init = 0; - static cl_kernel kernel; - if(!init){ - kernel = get_kernel("src/activations.cl", "gradient_array", 0); - init = 1; - } - return kernel; -} - -void gradient_array_ongpu(cl_mem x, int n, ACTIVATION a, cl_mem delta) -{ - cl_kernel kernel = get_gradient_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(kernel, i++, sizeof(x), (void*) &x); - cl.error = clSetKernelArg(kernel, i++, sizeof(n), (void*) &n); - cl.error = clSetKernelArg(kernel, i++, sizeof(a), (void*) &a); - cl.error = clSetKernelArg(kernel, i++, sizeof(delta), (void*) &delta); - check_error(cl); - - size_t gsize = n; - - cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, &gsize, 0, 0, 0, 0); - check_error(cl); -} -#endif diff --git a/src/activations.cl b/src/activations.cl deleted file mode 100644 index da06e8a..0000000 --- a/src/activations.cl +++ /dev/null @@ -1,62 +0,0 @@ -typedef enum{ - SIGMOID, RELU, LINEAR, RAMP, TANH -}ACTIVATION; - -float linear_activate(float x){return x;} -float sigmoid_activate(float x){return 1./(1. + exp(-x));} -float relu_activate(float x){return x*(x>0);} -float ramp_activate(float x){return x*(x>0)+.1*x;} -float tanh_activate(float x){return (exp(2*x)-1)/(exp(2*x)+1);} - -float linear_gradient(float x){return 1;} -float sigmoid_gradient(float x){return (1-x)*x;} -float relu_gradient(float x){return (x>0);} -float ramp_gradient(float x){return (x>0)+.1;} -float tanh_gradient(float x){return 1-x*x;} - -float activate(float x, ACTIVATION a) -{ - switch(a){ - case LINEAR: - return linear_activate(x); - case SIGMOID: - return sigmoid_activate(x); - case RELU: - return relu_activate(x); - case RAMP: - return ramp_activate(x); - case TANH: - return tanh_activate(x); - } - return 0; -} - -float gradient(float x, ACTIVATION a) -{ - switch(a){ - case LINEAR: - return linear_gradient(x); - case SIGMOID: - return sigmoid_gradient(x); - case RELU: - return relu_gradient(x); - case RAMP: - return ramp_gradient(x); - case TANH: - return tanh_gradient(x); - } - return 0; -} - -__kernel void activate_array(__global float *x, int n, ACTIVATION a) -{ - int i = get_global_id(0); - x[i] = activate(x[i], a); -} - -__kernel void gradient_array(__global float *x, int n, ACTIVATION a, __global float *delta) -{ - int i = get_global_id(0); - delta[i] *= gradient(x[i], a); -} - diff --git a/src/activations.h b/src/activations.h index c406c18..d9cf662 100644 --- a/src/activations.h +++ b/src/activations.h @@ -1,4 +1,4 @@ -#include "opencl.h" +#include "cuda.h" #ifndef ACTIVATIONS_H #define ACTIVATIONS_H @@ -14,9 +14,8 @@ float gradient(float x, ACTIVATION a); void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta); void activate_array(float *x, const int n, const ACTIVATION a); #ifdef GPU -cl_kernel get_activation_kernel(); -void activate_array_ongpu(cl_mem x, int n, ACTIVATION a); -void gradient_array_ongpu(cl_mem x, int n, ACTIVATION a, cl_mem delta); +void activate_array_ongpu(float *x, int n, ACTIVATION a); +void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta); #endif #endif diff --git a/src/axpy.c b/src/axpy.c deleted file mode 100644 index 857579c..0000000 --- a/src/axpy.c +++ /dev/null @@ -1,134 +0,0 @@ -#include "mini_blas.h" - -void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY) -{ - int i; - for(i = 0; i < N; ++i) Y[i*INCY] += ALPHA*X[i*INCX]; -} - -void scal_cpu(int N, float ALPHA, float *X, int INCX) -{ - int i; - for(i = 0; i < N; ++i) X[i*INCX] *= ALPHA; -} - -void copy_cpu(int N, float *X, int INCX, float *Y, int INCY) -{ - int i; - for(i = 0; i < N; ++i) Y[i*INCY] = X[i*INCX]; -} - -float dot_cpu(int N, float *X, int INCX, float *Y, int INCY) -{ - int i; - float dot = 0; - for(i = 0; i < N; ++i) dot += X[i*INCX] * Y[i*INCY]; - return dot; -} - -#ifdef GPU -#include "opencl.h" - -cl_kernel get_axpy_kernel() -{ - static int init = 0; - static cl_kernel kernel; - if(!init){ - kernel = get_kernel("src/axpy.cl", "axpy", 0); - init = 1; - } - return kernel; -} - -cl_kernel get_copy_kernel() -{ - static int init = 0; - static cl_kernel kernel; - if(!init){ - kernel = get_kernel("src/axpy.cl", "copy", 0); - init = 1; - } - return kernel; -} - -cl_kernel get_scal_kernel() -{ - static int init = 0; - static cl_kernel kernel; - if(!init){ - kernel = get_kernel("src/axpy.cl", "scal", 0); - init = 1; - } - return kernel; -} - - -void axpy_ongpu(int N, float ALPHA, cl_mem X, int INCX, cl_mem Y, int INCY) -{ - axpy_ongpu_offset(N,ALPHA,X,0,INCX,Y,0,INCY); -} - -void axpy_ongpu_offset(int N, float ALPHA, cl_mem X, int OFFX, int INCX, cl_mem Y, int OFFY, int INCY) -{ - cl_kernel kernel = get_axpy_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(kernel, i++, sizeof(N), (void*) &N); - cl.error = clSetKernelArg(kernel, i++, sizeof(ALPHA), (void*) &ALPHA); - cl.error = clSetKernelArg(kernel, i++, sizeof(X), (void*) &X); - cl.error = clSetKernelArg(kernel, i++, sizeof(OFFX), (void*) &OFFX); - cl.error = clSetKernelArg(kernel, i++, sizeof(INCX), (void*) &INCX); - cl.error = clSetKernelArg(kernel, i++, sizeof(Y), (void*) &Y); - cl.error = clSetKernelArg(kernel, i++, sizeof(OFFY), (void*) &OFFY); - cl.error = clSetKernelArg(kernel, i++, sizeof(INCY), (void*) &INCY); - check_error(cl); - - const size_t global_size[] = {N}; - - cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0); - check_error(cl); - -} -void copy_ongpu(int N, cl_mem X, int INCX, cl_mem Y, int INCY) -{ - copy_ongpu_offset(N,X,0,INCX,Y,0,INCY); -} -void copy_ongpu_offset(int N, cl_mem X, int OFFX, int INCX, cl_mem Y, int OFFY, int INCY) -{ - cl_kernel kernel = get_copy_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(kernel, i++, sizeof(N), (void*) &N); - cl.error = clSetKernelArg(kernel, i++, sizeof(X), (void*) &X); - cl.error = clSetKernelArg(kernel, i++, sizeof(OFFX), (void*) &OFFX); - cl.error = clSetKernelArg(kernel, i++, sizeof(INCX), (void*) &INCX); - cl.error = clSetKernelArg(kernel, i++, sizeof(Y), (void*) &Y); - cl.error = clSetKernelArg(kernel, i++, sizeof(OFFY), (void*) &OFFY); - cl.error = clSetKernelArg(kernel, i++, sizeof(INCY), (void*) &INCY); - check_error(cl); - - const size_t global_size[] = {N}; - - cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0); - check_error(cl); -} -void scal_ongpu(int N, float ALPHA, cl_mem X, int INCX) -{ - cl_kernel kernel = get_scal_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(kernel, i++, sizeof(N), (void*) &N); - cl.error = clSetKernelArg(kernel, i++, sizeof(ALPHA), (void*) &ALPHA); - cl.error = clSetKernelArg(kernel, i++, sizeof(X), (void*) &X); - cl.error = clSetKernelArg(kernel, i++, sizeof(INCX), (void*) &INCX); - check_error(cl); - - const size_t global_size[] = {N}; - - cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0); - check_error(cl); -} -#endif diff --git a/src/axpy.cl b/src/axpy.cl deleted file mode 100644 index 1503e8f..0000000 --- a/src/axpy.cl +++ /dev/null @@ -1,24 +0,0 @@ -__kernel void axpy(int N, float ALPHA, __global float *X, int OFFX, int INCX, __global float *Y, int OFFY, int INCY) -{ - int i = get_global_id(0); - Y[OFFY+i*INCY] += ALPHA*X[OFFX+i*INCX]; -} - -__kernel void scal(int N, float ALPHA, __global float *X, int INCX) -{ - int i = get_global_id(0); - X[i*INCX] *= ALPHA; -} - -__kernel void mask(int n, __global float *x, __global float *mask, int mod) -{ - int i = get_global_id(0); - x[i] = (i%mod && !mask[(i/mod)*mod]) ? 0 : x[i]; -} - -__kernel void copy(int N, __global float *X, int OFFX, int INCX, __global float *Y, int OFFY, int INCY) -{ - int i = get_global_id(0); - Y[i*INCY + OFFY] = X[i*INCX + OFFX]; -} - diff --git a/src/blas.c b/src/blas.c new file mode 100644 index 0000000..0f22330 --- /dev/null +++ b/src/blas.c @@ -0,0 +1,28 @@ +#include "blas.h" + +void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY) +{ + int i; + for(i = 0; i < N; ++i) Y[i*INCY] += ALPHA*X[i*INCX]; +} + +void scal_cpu(int N, float ALPHA, float *X, int INCX) +{ + int i; + for(i = 0; i < N; ++i) X[i*INCX] *= ALPHA; +} + +void copy_cpu(int N, float *X, int INCX, float *Y, int INCY) +{ + int i; + for(i = 0; i < N; ++i) Y[i*INCY] = X[i*INCX]; +} + +float dot_cpu(int N, float *X, int INCX, float *Y, int INCY) +{ + int i; + float dot = 0; + for(i = 0; i < N; ++i) dot += X[i*INCX] * Y[i*INCY]; + return dot; +} + diff --git a/src/blas.h b/src/blas.h new file mode 100644 index 0000000..e7c976d --- /dev/null +++ b/src/blas.h @@ -0,0 +1,23 @@ +#ifndef BLAS_H +#define BLAS_H +void pm(int M, int N, float *A); +float *random_matrix(int rows, int cols); +void time_random_matrix(int TA, int TB, int m, int k, int n); + +void test_blas(); + +void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY); +void copy_cpu(int N, float *X, int INCX, float *Y, int INCY); +void scal_cpu(int N, float ALPHA, float *X, int INCX); +float dot_cpu(int N, float *X, int INCX, float *Y, int INCY); +void test_gpu_blas(); + +#ifdef GPU +void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY); +void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY); +void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY); +void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY); +void scal_ongpu(int N, float ALPHA, float * X, int INCX); +void mask_ongpu(int N, float * X, float * mask, float mod); +#endif +#endif diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu new file mode 100644 index 0000000..d6f7143 --- /dev/null +++ b/src/blas_kernels.cu @@ -0,0 +1,62 @@ +extern "C" { +#include "blas.h" +#include "cuda.h" +} + +__global__ void axpy_kernel(int N, float ALPHA, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(i < N) Y[OFFY+i*INCY] += ALPHA*X[OFFX+i*INCX]; +} + +__global__ void scal_kernel(int N, float ALPHA, float *X, int INCX) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(i < N) X[i*INCX] *= ALPHA; +} + +__global__ void mask_kernel(int n, float *x, float *mask, int mod) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(i < n) x[i] = (i%mod && !mask[(i/mod)*mod]) ? 0 : x[i]; +} + +__global__ void copy_kernel(int N, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(i < N) Y[i*INCY + OFFY] = X[i*INCX + OFFX]; +} + +extern "C" void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY) +{ + axpy_ongpu_offset(N, ALPHA, X, 0, INCX, Y, 0, INCY); +} + +extern "C" void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY) +{ + axpy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, OFFX, INCX, Y, OFFY, INCY); + check_error(cudaPeekAtLastError()); +} + +extern "C" void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY) +{ + copy_ongpu_offset(N, X, 0, INCX, Y, 0, INCY); +} + +extern "C" void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY) +{ + copy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, OFFX, INCX, Y, OFFY, INCY); + check_error(cudaPeekAtLastError()); +} + +extern "C" void mask_ongpu(int N, float * X, float * mask, float mod) +{ + mask_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, mask, mod); + check_error(cudaPeekAtLastError()); +} + +extern "C" void scal_ongpu(int N, float ALPHA, float * X, int INCX) +{ + scal_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX); + check_error(cudaPeekAtLastError()); +} diff --git a/src/cnn.c b/src/cnn.c index fed69d0..c3b7b2c 100644 --- a/src/cnn.c +++ b/src/cnn.c @@ -7,7 +7,7 @@ #include "data.h" #include "matrix.h" #include "utils.h" -#include "mini_blas.h" +#include "blas.h" #include "matrix.h" #include "server.h" @@ -165,6 +165,7 @@ void validate_detection_net(char *cfgfile) free_data(val); } } +/* void train_imagenet_distributed(char *address) { @@ -203,6 +204,7 @@ void train_imagenet_distributed(char *address) free_data(train); } } +*/ void train_imagenet(char *cfgfile) { @@ -210,10 +212,10 @@ void train_imagenet(char *cfgfile) //network net = parse_network_cfg("/home/pjreddie/imagenet_backup/alexnet_1270.cfg"); srand(time(0)); network net = parse_network_cfg(cfgfile); - set_learning_network(&net, net.learning_rate*100., net.momentum, net.decay); + set_learning_network(&net, net.learning_rate, net.momentum, net.decay); printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); - int imgs = 1024; - int i = 6600; + int imgs = 3072; + int i = net.seen/imgs; char **labels = get_labels("/home/pjreddie/data/imagenet/cls.labels.list"); list *plist = get_paths("/data/imagenet/cls.train.list"); char **paths = (char **)list_to_array(plist); @@ -224,19 +226,20 @@ void train_imagenet(char *cfgfile) data buffer; load_thread = load_data_thread(paths, imgs, plist->size, labels, 1000, 256, 256, &buffer); while(1){ - i += 1; + ++i; time=clock(); pthread_join(load_thread, 0); train = buffer; - normalize_data_rows(train); + //normalize_data_rows(train); //translate_data_rows(train, -128); //scale_data_rows(train, 1./128); load_thread = load_data_thread(paths, imgs, plist->size, labels, 1000, 256, 256, &buffer); printf("Loaded: %lf seconds\n", sec(clock()-time)); time=clock(); float loss = train_network(net, train); + net.seen += imgs; avg_loss = avg_loss*.9 + loss*.1; - printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), i*imgs); + printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), net.seen); free_data(train); if(i%100==0){ char buff[256]; @@ -272,7 +275,7 @@ void validate_imagenet(char *filename) pthread_join(load_thread, 0); val = buffer; - normalize_data_rows(val); + //normalize_data_rows(val); num = (i+1)*m/splits - i*m/splits; char **part = paths+(i*m/splits); @@ -466,6 +469,7 @@ void train_nist(char *cfgfile) save_network(net, buff); } +/* void train_nist_distributed(char *address) { srand(time(0)); @@ -487,6 +491,7 @@ void train_nist_distributed(char *address) printf("%d: Loss: %f, Time: %lf seconds\n", count, loss, (float)(end-start)/CLOCKS_PER_SEC); } } +*/ void test_ensemble() { @@ -537,10 +542,27 @@ void visualize_cat() cvWaitKey(0); } +void test_convolutional_layer() +{ + network net = parse_network_cfg("cfg/nist_conv.cfg"); + int size = get_network_input_size(net); + float *in = calloc(size, sizeof(float)); + int i; + for(i = 0; i < size; ++i) in[i] = rand_normal(); + float *in_gpu = cuda_make_array(in, size); + convolutional_layer layer = *(convolutional_layer *)net.layers[0]; + int out_size = convolutional_out_height(layer)*convolutional_out_width(layer)*layer.batch; + cuda_compare(layer.output_gpu, layer.output, out_size, "nothing"); + cuda_compare(layer.biases_gpu, layer.biases, layer.n, "biases"); + cuda_compare(layer.filters_gpu, layer.filters, layer.n*layer.size*layer.size*layer.c, "filters"); + bias_output(layer); + bias_output_gpu(layer); + cuda_compare(layer.output_gpu, layer.output, out_size, "biased output"); +} + void test_correct_nist() { network net = parse_network_cfg("cfg/nist_conv.cfg"); - test_learn_bias(*(convolutional_layer *)net.layers[0]); srand(222222); net = parse_network_cfg("cfg/nist_conv.cfg"); data train = load_categorical_data_csv("data/mnist/mnist_train.csv", 0, 10); @@ -616,6 +638,7 @@ void test_correct_alexnet() } } +/* void run_server() { srand(time(0)); @@ -636,6 +659,7 @@ void test_client() printf("3\n"); printf("Transfered: %lf seconds\n", sec(clock()-time)); } +*/ void del_arg(int argc, char **argv, int index) { @@ -669,6 +693,7 @@ int find_int_arg(int argc, char **argv, char *arg, int def) int main(int argc, char **argv) { + //test_convolutional_layer(); if(argc < 2){ fprintf(stderr, "usage: %s <function>\n", argv[0]); return 0; @@ -680,7 +705,7 @@ int main(int argc, char **argv) gpu_index = -1; #else if(gpu_index >= 0){ - cl_setup(); + cudaSetDevice(gpu_index); } #endif @@ -688,7 +713,7 @@ int main(int argc, char **argv) else if(0==strcmp(argv[1], "test_correct")) test_correct_alexnet(); else if(0==strcmp(argv[1], "test_correct_nist")) test_correct_nist(); else if(0==strcmp(argv[1], "test")) test_imagenet(); - else if(0==strcmp(argv[1], "server")) run_server(); + //else if(0==strcmp(argv[1], "server")) run_server(); #ifdef GPU else if(0==strcmp(argv[1], "test_gpu")) test_gpu_blas(); @@ -701,7 +726,7 @@ int main(int argc, char **argv) else if(0==strcmp(argv[1], "detection")) train_detection_net(argv[2]); else if(0==strcmp(argv[1], "nist")) train_nist(argv[2]); else if(0==strcmp(argv[1], "train")) train_imagenet(argv[2]); - else if(0==strcmp(argv[1], "client")) train_imagenet_distributed(argv[2]); + //else if(0==strcmp(argv[1], "client")) train_imagenet_distributed(argv[2]); else if(0==strcmp(argv[1], "detect")) test_detection(argv[2]); else if(0==strcmp(argv[1], "init")) test_init(argv[2]); else if(0==strcmp(argv[1], "visualize")) test_visualize(argv[2]); diff --git a/src/col2im.c b/src/col2im.c index f986071..0fb3db4 100644 --- a/src/col2im.c +++ b/src/col2im.c @@ -41,69 +41,3 @@ void col2im_cpu(float* data_col, } } - -#ifdef GPU - -#include "opencl.h" - -cl_kernel get_col2im_kernel() -{ - static int init = 0; - static cl_kernel im2col_kernel; - if(!init){ - im2col_kernel = get_kernel("src/col2im.cl", "col2im", 0); - init = 1; - } - return im2col_kernel; -} - -void col2im_ongpu(cl_mem data_col, int offset, - int channels, int height, int width, - int ksize, int stride, int pad, cl_mem data_im) -{ - cl_kernel kernel = get_col2im_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(kernel, i++, sizeof(data_col), (void*) &data_col); - cl.error = clSetKernelArg(kernel, i++, sizeof(offset), (void*) &offset); - cl.error = clSetKernelArg(kernel, i++, sizeof(channels), (void*) &channels); - cl.error = clSetKernelArg(kernel, i++, sizeof(height), (void*) &height); - cl.error = clSetKernelArg(kernel, i++, sizeof(width), (void*) &width); - cl.error = clSetKernelArg(kernel, i++, sizeof(ksize), (void*) &ksize); - cl.error = clSetKernelArg(kernel, i++, sizeof(stride), (void*) &stride); - cl.error = clSetKernelArg(kernel, i++, sizeof(pad), (void*) &pad); - cl.error = clSetKernelArg(kernel, i++, sizeof(data_im), (void*) &data_im); - check_error(cl); - - size_t global_size = channels*height*width; - - cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, - &global_size, 0, 0, 0, 0); - check_error(cl); -} - -/* - void col2im_gpu(float *data_col, int batch, - int channels, int height, int width, - int ksize, int stride, int pad, float *data_im) - { - int height_col = (height - ksize) / stride + 1; - int width_col = (width - ksize) / stride + 1; - int channels_col = channels * ksize * ksize; - - size_t size = height_col*width_col*channels_col*batch; - cl_mem col_gpu = cl_make_array(data_col, size); - size = channels*height*width*batch; - cl_mem im_gpu = cl_make_array(data_im, size); - - col2im_ongpu(col_gpu, batch, channels, height, width, - ksize, stride, pad, im_gpu); - - cl_read_array(im_gpu, data_im, size); - clReleaseMemObject(col_gpu); - clReleaseMemObject(im_gpu); - } - */ - -#endif diff --git a/src/col2im.h b/src/col2im.h new file mode 100644 index 0000000..2fdc427 --- /dev/null +++ b/src/col2im.h @@ -0,0 +1,13 @@ +#ifndef COL2IM_H +#define COL2IM_H + +void col2im_cpu(float* data_col, + int channels, int height, int width, + int ksize, int stride, int pad, float* data_im); + +#ifdef GPU +void col2im_ongpu(float *data_col, int batch, + int channels, int height, int width, + int ksize, int stride, int pad, float *data_im); +#endif +#endif diff --git a/src/col2im.cl b/src/col2im_kernels.cu similarity index 61% rename from src/col2im.cl rename to src/col2im_kernels.cu index 617e818..73de9b7 100644 --- a/src/col2im.cl +++ b/src/col2im_kernels.cu @@ -1,6 +1,11 @@ -__kernel void col2im(__global float *data_col, int offset, +extern "C" { +#include "col2im.h" +#include "cuda.h" +} + +__global__ void col2im_kernel(float *data_col, int offset, int channels, int height, int width, - int ksize, int stride, int pad, __global float *data_im) + int ksize, int stride, int pad, float *data_im) { int height_col = (height - ksize) / stride + 1; @@ -11,7 +16,9 @@ __kernel void col2im(__global float *data_col, int offset, pad = ksize/2; } - int id = get_global_id(0); + int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(id >= channels*height*width) return; + int index = id; int w = id%width + pad; id /= width; @@ -25,8 +32,8 @@ __kernel void col2im(__global float *data_col, int offset, int h_start = (h-ksize+stride)/stride; int h_end = h/stride + 1; - int rows = channels * ksize * ksize; - int cols = height_col*width_col; + // int rows = channels * ksize * ksize; + // int cols = height_col*width_col; int col_offset = (c*ksize*ksize + h * ksize + w)*height_col*width_col; int h_coeff = (1-stride*ksize*height_col)*width_col; int w_coeff = 1-stride*height_col*width_col; @@ -41,3 +48,15 @@ __kernel void col2im(__global float *data_col, int offset, } data_im[index+offset] = val; } + + +extern "C" void col2im_ongpu(float *data_col, int offset, + int channels, int height, int width, + int ksize, int stride, int pad, float *data_im) +{ + + size_t n = channels*height*width; + + col2im_kernel<<<cuda_gridsize(n), BLOCK>>>(data_col, offset, channels, height, width, ksize, stride, pad, data_im); + check_error(cudaPeekAtLastError()); +} diff --git a/src/connected_layer.c b/src/connected_layer.c index e29df77..254d39e 100644 --- a/src/connected_layer.c +++ b/src/connected_layer.c @@ -1,6 +1,8 @@ #include "connected_layer.h" #include "utils.h" -#include "mini_blas.h" +#include "cuda.h" +#include "blas.h" +#include "gemm.h" #include <math.h> #include <stdio.h> @@ -44,14 +46,14 @@ connected_layer *make_connected_layer(int batch, int inputs, int outputs, ACTIVA } #ifdef GPU - layer->weights_cl = cl_make_array(layer->weights, inputs*outputs); - layer->biases_cl = cl_make_array(layer->biases, outputs); + layer->weights_gpu = cuda_make_array(layer->weights, inputs*outputs); + layer->biases_gpu = cuda_make_array(layer->biases, outputs); - layer->weight_updates_cl = cl_make_array(layer->weight_updates, inputs*outputs); - layer->bias_updates_cl = cl_make_array(layer->bias_updates, outputs); + layer->weight_updates_gpu = cuda_make_array(layer->weight_updates, inputs*outputs); + layer->bias_updates_gpu = cuda_make_array(layer->bias_updates, outputs); - layer->output_cl = cl_make_array(layer->output, outputs*batch); - layer->delta_cl = cl_make_array(layer->delta, outputs*batch); + layer->output_gpu = cuda_make_array(layer->output, outputs*batch); + layer->delta_gpu = cuda_make_array(layer->delta, outputs*batch); #endif layer->activation = activation; fprintf(stderr, "Connected Layer: %d inputs, %d outputs\n", inputs, outputs); @@ -140,68 +142,68 @@ void backward_connected_layer(connected_layer layer, float *input, float *delta) void pull_connected_layer(connected_layer layer) { - cl_read_array(layer.weights_cl, layer.weights, layer.inputs*layer.outputs); - cl_read_array(layer.biases_cl, layer.biases, layer.outputs); - cl_read_array(layer.weight_updates_cl, layer.weight_updates, layer.inputs*layer.outputs); - cl_read_array(layer.bias_updates_cl, layer.bias_updates, layer.outputs); + cuda_pull_array(layer.weights_gpu, layer.weights, layer.inputs*layer.outputs); + cuda_pull_array(layer.biases_gpu, layer.biases, layer.outputs); + cuda_pull_array(layer.weight_updates_gpu, layer.weight_updates, layer.inputs*layer.outputs); + cuda_pull_array(layer.bias_updates_gpu, layer.bias_updates, layer.outputs); } void push_connected_layer(connected_layer layer) { - cl_write_array(layer.weights_cl, layer.weights, layer.inputs*layer.outputs); - cl_write_array(layer.biases_cl, layer.biases, layer.outputs); - cl_write_array(layer.weight_updates_cl, layer.weight_updates, layer.inputs*layer.outputs); - cl_write_array(layer.bias_updates_cl, layer.bias_updates, layer.outputs); + cuda_push_array(layer.weights_gpu, layer.weights, layer.inputs*layer.outputs); + cuda_push_array(layer.biases_gpu, layer.biases, layer.outputs); + cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.inputs*layer.outputs); + cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.outputs); } void update_connected_layer_gpu(connected_layer layer) { - axpy_ongpu(layer.outputs, layer.learning_rate, layer.bias_updates_cl, 1, layer.biases_cl, 1); - scal_ongpu(layer.outputs, layer.momentum, layer.bias_updates_cl, 1); + axpy_ongpu(layer.outputs, layer.learning_rate, layer.bias_updates_gpu, 1, layer.biases_gpu, 1); + scal_ongpu(layer.outputs, layer.momentum, layer.bias_updates_gpu, 1); - axpy_ongpu(layer.inputs*layer.outputs, -layer.decay, layer.weights_cl, 1, layer.weight_updates_cl, 1); - axpy_ongpu(layer.inputs*layer.outputs, layer.learning_rate, layer.weight_updates_cl, 1, layer.weights_cl, 1); - scal_ongpu(layer.inputs*layer.outputs, layer.momentum, layer.weight_updates_cl, 1); + axpy_ongpu(layer.inputs*layer.outputs, -layer.decay, layer.weights_gpu, 1, layer.weight_updates_gpu, 1); + axpy_ongpu(layer.inputs*layer.outputs, layer.learning_rate, layer.weight_updates_gpu, 1, layer.weights_gpu, 1); + scal_ongpu(layer.inputs*layer.outputs, layer.momentum, layer.weight_updates_gpu, 1); //pull_connected_layer(layer); } -void forward_connected_layer_gpu(connected_layer layer, cl_mem input) +void forward_connected_layer_gpu(connected_layer layer, float * input) { int i; for(i = 0; i < layer.batch; ++i){ - copy_ongpu_offset(layer.outputs, layer.biases_cl, 0, 1, layer.output_cl, i*layer.outputs, 1); + copy_ongpu_offset(layer.outputs, layer.biases_gpu, 0, 1, layer.output_gpu, i*layer.outputs, 1); } int m = layer.batch; int k = layer.inputs; int n = layer.outputs; - cl_mem a = input; - cl_mem b = layer.weights_cl; - cl_mem c = layer.output_cl; + float * a = input; + float * b = layer.weights_gpu; + float * c = layer.output_gpu; gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n); - activate_array_ongpu(layer.output_cl, layer.outputs*layer.batch, layer.activation); + activate_array_ongpu(layer.output_gpu, layer.outputs*layer.batch, layer.activation); } -void backward_connected_layer_gpu(connected_layer layer, cl_mem input, cl_mem delta) +void backward_connected_layer_gpu(connected_layer layer, float * input, float * delta) { int i; - gradient_array_ongpu(layer.output_cl, layer.outputs*layer.batch, layer.activation, layer.delta_cl); + gradient_array_ongpu(layer.output_gpu, layer.outputs*layer.batch, layer.activation, layer.delta_gpu); for(i = 0; i < layer.batch; ++i){ - axpy_ongpu_offset(layer.outputs, 1, layer.delta_cl, i*layer.outputs, 1, layer.bias_updates_cl, 0, 1); + axpy_ongpu_offset(layer.outputs, 1, layer.delta_gpu, i*layer.outputs, 1, layer.bias_updates_gpu, 0, 1); } int m = layer.inputs; int k = layer.batch; int n = layer.outputs; - cl_mem a = input; - cl_mem b = layer.delta_cl; - cl_mem c = layer.weight_updates_cl; + float * a = input; + float * b = layer.delta_gpu; + float * c = layer.weight_updates_gpu; gemm_ongpu(1,0,m,n,k,1,a,m,b,n,1,c,n); m = layer.batch; k = layer.outputs; n = layer.inputs; - a = layer.delta_cl; - b = layer.weights_cl; + a = layer.delta_gpu; + b = layer.weights_gpu; c = delta; if(c) gemm_ongpu(0,1,m,n,k,1,a,k,b,k,0,c,n); diff --git a/src/connected_layer.h b/src/connected_layer.h index c3b8cb1..921f06f 100644 --- a/src/connected_layer.h +++ b/src/connected_layer.h @@ -2,7 +2,6 @@ #define CONNECTED_LAYER_H #include "activations.h" -#include "opencl.h" typedef struct{ float learning_rate; @@ -25,14 +24,14 @@ typedef struct{ float *delta; #ifdef GPU - cl_mem weights_cl; - cl_mem biases_cl; + float * weights_gpu; + float * biases_gpu; - cl_mem weight_updates_cl; - cl_mem bias_updates_cl; + float * weight_updates_gpu; + float * bias_updates_gpu; - cl_mem output_cl; - cl_mem delta_cl; + float * output_gpu; + float * delta_gpu; #endif ACTIVATION activation; @@ -46,8 +45,8 @@ void backward_connected_layer(connected_layer layer, float *input, float *delta) void update_connected_layer(connected_layer layer); #ifdef GPU -void forward_connected_layer_gpu(connected_layer layer, cl_mem input); -void backward_connected_layer_gpu(connected_layer layer, cl_mem input, cl_mem delta); +void forward_connected_layer_gpu(connected_layer layer, float * input); +void backward_connected_layer_gpu(connected_layer layer, float * input, float * delta); void update_connected_layer_gpu(connected_layer layer); void push_connected_layer(connected_layer layer); void pull_connected_layer(connected_layer layer); diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu new file mode 100644 index 0000000..6461aff --- /dev/null +++ b/src/convolutional_kernels.cu @@ -0,0 +1,164 @@ +extern "C" { +#include "convolutional_layer.h" +#include "gemm.h" +#include "blas.h" +#include "im2col.h" +#include "col2im.h" +#include "utils.h" +#include "cuda.h" +} + +__global__ void bias(int n, int size, float *biases, float *output) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x; + int filter = blockIdx.y; + int batch = blockIdx.z; + + if(offset < size) output[(batch*n+filter)*size + offset] = biases[filter]; +} + +extern "C" void bias_output_gpu(const convolutional_layer layer) +{ + int size = convolutional_out_height(layer)*convolutional_out_width(layer); + + dim3 dimBlock(BLOCK, 1, 1); + dim3 dimGrid((size-1)/BLOCK + 1, layer.n, layer.batch); + + bias<<<dimGrid, dimBlock>>>(layer.n, size, layer.biases_gpu, layer.output_gpu); + check_error(cudaPeekAtLastError()); +} + +__global__ void learn_bias(int batch, int n, int size, float *delta, float *bias_updates) +{ + __shared__ float part[BLOCK]; + int i,b; + int filter = (blockIdx.x + blockIdx.y*gridDim.x); + int p = threadIdx.x; + float sum = 0; + for(b = 0; b < batch; ++b){ + for(i = 0; i < size; i += BLOCK){ + int index = p + i + size*(filter + n*b); + sum += (p+i < size) ? delta[index] : 0; + } + } + part[p] = sum; + __syncthreads(); + if(p == 0){ + for(i = 0; i < BLOCK; ++i) bias_updates[filter] += part[i]; + } +} + +extern "C" void learn_bias_convolutional_layer_ongpu(convolutional_layer layer) +{ + int size = convolutional_out_height(layer)*convolutional_out_width(layer); + + + learn_bias<<<cuda_gridsize(layer.n), BLOCK>>>(layer.batch, layer.n, size, layer.delta_gpu, layer.bias_updates_gpu); + check_error(cudaPeekAtLastError()); +} + +extern "C" void test_learn_bias(convolutional_layer l) +{ + int i; + int size = convolutional_out_height(l) * convolutional_out_width(l); + for(i = 0; i < size*l.batch*l.n; ++i){ + l.delta[i] = rand_uniform(); + } + for(i = 0; i < l.n; ++i){ + l.bias_updates[i] = rand_uniform(); + } + cuda_push_array(l.delta_gpu, l.delta, size*l.batch*l.n); + cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n); + float *gpu = (float *) calloc(l.n, sizeof(float)); + cuda_pull_array(l.bias_updates_gpu, gpu, l.n); + for(i = 0; i < l.n; ++i) printf("%.9g %.9g\n", l.bias_updates[i], gpu[i]); + learn_bias_convolutional_layer_ongpu(l); + learn_bias_convolutional_layer(l); + cuda_pull_array(l.bias_updates_gpu, gpu, l.n); + for(i = 0; i < l.n; ++i) printf("%.9g %.9g\n", l.bias_updates[i], gpu[i]); +} + +extern "C" void forward_convolutional_layer_gpu(convolutional_layer layer, float *in) +{ + int i; + int m = layer.n; + int k = layer.size*layer.size*layer.c; + int n = convolutional_out_height(layer)* + convolutional_out_width(layer); + + bias_output_gpu(layer); + + for(i = 0; i < layer.batch; ++i){ + im2col_ongpu(in, i*layer.c*layer.h*layer.w, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, layer.col_image_gpu); + float * a = layer.filters_gpu; + float * b = layer.col_image_gpu; + float * c = layer.output_gpu; + gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n); + } + activate_array_ongpu(layer.output_gpu, m*n*layer.batch, layer.activation); + cuda_pull_array(layer.output_gpu, layer.output, m*n*layer.batch); + //for(i = 0; i < m*n*layer.batch; ++i) printf("%f, ", layer.output[i]); + //printf("\n"); +} + +extern "C" void backward_convolutional_layer_gpu(convolutional_layer layer, float *in, float *delta_gpu) +{ + int i; + int m = layer.n; + int n = layer.size*layer.size*layer.c; + int k = convolutional_out_height(layer)* + convolutional_out_width(layer); + gradient_array_ongpu(layer.output_gpu, m*k*layer.batch, layer.activation, layer.delta_gpu); + learn_bias_convolutional_layer_ongpu(layer); + + if(delta_gpu) scal_ongpu(layer.batch*layer.h*layer.w*layer.c, 0, delta_gpu, 1); + + for(i = 0; i < layer.batch; ++i){ + float * a = layer.delta_gpu; + float * b = layer.col_image_gpu; + float * c = layer.filter_updates_gpu; + + im2col_ongpu(in, i*layer.c*layer.h*layer.w, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, layer.col_image_gpu); + gemm_ongpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n); + + if(delta_gpu){ + + float * a = layer.filters_gpu; + float * b = layer.delta_gpu; + float * c = layer.col_image_gpu; + + gemm_ongpu(1,0,n,k,m,1,a,n,b + i*k*m,k,0,c,k); + + col2im_ongpu(layer.col_image_gpu, i*layer.c*layer.h*layer.w, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta_gpu); + } + } +} + +extern "C" void pull_convolutional_layer(convolutional_layer layer) +{ + cuda_pull_array(layer.filters_gpu, layer.filters, layer.c*layer.n*layer.size*layer.size); + cuda_pull_array(layer.biases_gpu, layer.biases, layer.n); + cuda_pull_array(layer.filter_updates_gpu, layer.filter_updates, layer.c*layer.n*layer.size*layer.size); + cuda_pull_array(layer.bias_updates_gpu, layer.bias_updates, layer.n); +} + +extern "C" void push_convolutional_layer(convolutional_layer layer) +{ + cuda_push_array(layer.filters_gpu, layer.filters, layer.c*layer.n*layer.size*layer.size); + cuda_push_array(layer.biases_gpu, layer.biases, layer.n); + cuda_push_array(layer.filter_updates_gpu, layer.filter_updates, layer.c*layer.n*layer.size*layer.size); + cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n); +} + +extern "C" void update_convolutional_layer_gpu(convolutional_layer layer) +{ + int size = layer.size*layer.size*layer.c*layer.n; + axpy_ongpu(layer.n, layer.learning_rate, layer.bias_updates_gpu, 1, layer.biases_gpu, 1); + scal_ongpu(layer.n,layer.momentum, layer.bias_updates_gpu, 1); + + axpy_ongpu(size, -layer.decay, layer.filters_gpu, 1, layer.filter_updates_gpu, 1); + axpy_ongpu(size, layer.learning_rate, layer.filter_updates_gpu, 1, layer.filters_gpu, 1); + scal_ongpu(size, layer.momentum, layer.filter_updates_gpu, 1); + //pull_convolutional_layer(layer); +} + diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 4e8c44b..6848511 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -1,6 +1,9 @@ #include "convolutional_layer.h" #include "utils.h" -#include "mini_blas.h" +#include "im2col.h" +#include "col2im.h" +#include "blas.h" +#include "gemm.h" #include <stdio.h> #include <time.h> @@ -77,15 +80,15 @@ convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, in layer->delta = calloc(layer->batch*out_h * out_w * n, sizeof(float)); #ifdef GPU - layer->filters_cl = cl_make_array(layer->filters, c*n*size*size); - layer->filter_updates_cl = cl_make_array(layer->filter_updates, c*n*size*size); + layer->filters_gpu = cuda_make_array(layer->filters, c*n*size*size); + layer->filter_updates_gpu = cuda_make_array(layer->filter_updates, c*n*size*size); - layer->biases_cl = cl_make_array(layer->biases, n); - layer->bias_updates_cl = cl_make_array(layer->bias_updates, n); + layer->biases_gpu = cuda_make_array(layer->biases, n); + layer->bias_updates_gpu = cuda_make_array(layer->bias_updates, n); - layer->col_image_cl = cl_make_array(layer->col_image, out_h*out_w*size*size*c); - layer->delta_cl = cl_make_array(layer->delta, layer->batch*out_h*out_w*n); - layer->output_cl = cl_make_array(layer->output, layer->batch*out_h*out_w*n); + layer->col_image_gpu = cuda_make_array(layer->col_image, out_h*out_w*size*size*c); + layer->delta_gpu = cuda_make_array(layer->delta, layer->batch*out_h*out_w*n); + layer->output_gpu = cuda_make_array(layer->output, layer->batch*out_h*out_w*n); #endif layer->activation = activation; @@ -140,7 +143,6 @@ void forward_convolutional_layer(const convolutional_layer layer, float *in) float *b = layer.col_image; float *c = layer.output; - for(i = 0; i < layer.batch; ++i){ im2col_cpu(in, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, b); @@ -265,183 +267,3 @@ image *visualize_convolutional_layer(convolutional_layer layer, char *window, im return single_filters; } -#ifdef GPU -#define BLOCK 32 - -#define STR_HELPER(x) #x -#define STR(x) STR_HELPER(x) - - -cl_kernel get_convolutional_learn_bias_kernel() -{ - static int init = 0; - static cl_kernel kernel; - if(!init){ - kernel = get_kernel("src/convolutional_layer.cl", "learn_bias", "-D BLOCK=" STR(BLOCK)); - init = 1; - } - return kernel; -} - -void learn_bias_convolutional_layer_ongpu(convolutional_layer layer) -{ - int size = convolutional_out_height(layer) * convolutional_out_width(layer); - - cl_kernel kernel = get_convolutional_learn_bias_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.batch), (void*) &layer.batch); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.n), (void*) &layer.n); - cl.error = clSetKernelArg(kernel, i++, sizeof(size), (void*) &size); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.delta_cl), (void*) &layer.delta_cl); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.bias_updates_cl), (void*) &layer.bias_updates_cl); - check_error(cl); - - const size_t global_size[] = {layer.n*BLOCK}; - const size_t local_size[] = {BLOCK}; - - cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, local_size, 0, 0, 0); - check_error(cl); -} - -void test_learn_bias(convolutional_layer l) -{ - int i; - int size = convolutional_out_height(l) * convolutional_out_width(l); - for(i = 0; i < size*l.batch*l.n; ++i){ - l.delta[i] = rand_uniform(); - } - for(i = 0; i < l.n; ++i){ - l.bias_updates[i] = rand_uniform(); - } - cl_write_array(l.delta_cl, l.delta, size*l.batch*l.n); - cl_write_array(l.bias_updates_cl, l.bias_updates, l.n); - float *gpu = calloc(l.n, sizeof(float)); - cl_read_array(l.bias_updates_cl, gpu, l.n); - for(i = 0; i < l.n; ++i) printf("%.9g %.9g\n", l.bias_updates[i], gpu[i]); - learn_bias_convolutional_layer_ongpu(l); - learn_bias_convolutional_layer(l); - cl_read_array(l.bias_updates_cl, gpu, l.n); - for(i = 0; i < l.n; ++i) printf("%.9g %.9g\n", l.bias_updates[i], gpu[i]); -} - -cl_kernel get_convolutional_bias_kernel() -{ - static int init = 0; - static cl_kernel kernel; - if(!init){ - kernel = get_kernel("src/convolutional_layer.cl", "bias", "-D BLOCK=" STR(BLOCK)); - init = 1; - } - return kernel; -} - -void bias_output_gpu(const convolutional_layer layer) -{ - int out_h = convolutional_out_height(layer); - int out_w = convolutional_out_width(layer); - int size = out_h*out_w; - - cl_kernel kernel = get_convolutional_bias_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.n), (void*) &layer.n); - cl.error = clSetKernelArg(kernel, i++, sizeof(size), (void*) &size); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.biases_cl), (void*) &layer.biases_cl); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.output_cl), (void*) &layer.output_cl); - check_error(cl); - - const size_t global_size[] = {layer.n*size, layer.batch}; - - cl.error = clEnqueueNDRangeKernel(queue, kernel, 2, 0, global_size, 0, 0, 0, 0); - check_error(cl); -} - -//#define TIMEIT - -void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in) -{ - int i; - int m = layer.n; - int k = layer.size*layer.size*layer.c; - int n = convolutional_out_height(layer)* - convolutional_out_width(layer); - - bias_output_gpu(layer); - - for(i = 0; i < layer.batch; ++i){ - im2col_ongpu(in, i*layer.c*layer.h*layer.w, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, layer.col_image_cl); - cl_mem a = layer.filters_cl; - cl_mem b = layer.col_image_cl; - cl_mem c = layer.output_cl; - gemm_ongpu_offset(0,0,m,n,k,1.,a,0,k,b,0,n,1.,c,i*m*n,n); - } - activate_array_ongpu(layer.output_cl, m*n*layer.batch, layer.activation); -} - -void backward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in, cl_mem delta_cl) -{ - int i; - int m = layer.n; - int n = layer.size*layer.size*layer.c; - int k = convolutional_out_height(layer)* - convolutional_out_width(layer); - gradient_array_ongpu(layer.output_cl, m*k*layer.batch, layer.activation, layer.delta_cl); - learn_bias_convolutional_layer_ongpu(layer); - - if(delta_cl) scal_ongpu(layer.batch*layer.h*layer.w*layer.c, 0, delta_cl, 1); - - for(i = 0; i < layer.batch; ++i){ - cl_mem a = layer.delta_cl; - cl_mem b = layer.col_image_cl; - cl_mem c = layer.filter_updates_cl; - - im2col_ongpu(in, i*layer.c*layer.h*layer.w, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, layer.col_image_cl); - gemm_ongpu_offset(0,1,m,n,k,1,a,i*m*k,k,b,0,k,1,c,0,n); - - if(delta_cl){ - - cl_mem a = layer.filters_cl; - cl_mem b = layer.delta_cl; - cl_mem c = layer.col_image_cl; - - gemm_ongpu_offset(1,0,n,k,m,1,a,0,n,b,i*k*m,k,0,c,0,k); - - col2im_ongpu(layer.col_image_cl, i*layer.c*layer.h*layer.w, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta_cl); - } - } -} - -void pull_convolutional_layer(convolutional_layer layer) -{ - cl_read_array(layer.filters_cl, layer.filters, layer.c*layer.n*layer.size*layer.size); - cl_read_array(layer.biases_cl, layer.biases, layer.n); - cl_read_array(layer.filter_updates_cl, layer.filter_updates, layer.c*layer.n*layer.size*layer.size); - cl_read_array(layer.bias_updates_cl, layer.bias_updates, layer.n); -} - -void push_convolutional_layer(convolutional_layer layer) -{ - cl_write_array(layer.filters_cl, layer.filters, layer.c*layer.n*layer.size*layer.size); - cl_write_array(layer.biases_cl, layer.biases, layer.n); - cl_write_array(layer.filter_updates_cl, layer.filter_updates, layer.c*layer.n*layer.size*layer.size); - cl_write_array(layer.bias_updates_cl, layer.bias_updates, layer.n); -} - -void update_convolutional_layer_gpu(convolutional_layer layer) -{ - int size = layer.size*layer.size*layer.c*layer.n; - axpy_ongpu(layer.n, layer.learning_rate, layer.bias_updates_cl, 1, layer.biases_cl, 1); - scal_ongpu(layer.n,layer.momentum, layer.bias_updates_cl, 1); - - axpy_ongpu(size, -layer.decay, layer.filters_cl, 1, layer.filter_updates_cl, 1); - axpy_ongpu(size, layer.learning_rate, layer.filter_updates_cl, 1, layer.filters_cl, 1); - scal_ongpu(size, layer.momentum, layer.filter_updates_cl, 1); - //pull_convolutional_layer(layer); -} - - -#endif - diff --git a/src/convolutional_layer.cl b/src/convolutional_layer.cl deleted file mode 100644 index 903471b..0000000 --- a/src/convolutional_layer.cl +++ /dev/null @@ -1,31 +0,0 @@ - -__kernel void bias(int n, int size, __global float *biases, __global float *output) -{ - int id = get_global_id(0); - int batch = get_global_id(1); - int filter = id/size; - //int position = id%size; - - output[batch*n*size + id] = biases[filter]; -} - -__kernel void learn_bias(int batch, int n, int size, __global float *delta, __global float *bias_updates) -{ - __local float part[BLOCK]; - int i,b; - int filter = get_group_id(0); - int p = get_local_id(0); - float sum = 0; - for(b = 0; b < batch; ++b){ - for(i = 0; i < size; i += BLOCK){ - int index = p + i + size*(filter + n*b); - sum += (p+i < size) ? delta[index] : 0; - } - } - part[p] = sum; - barrier(CLK_LOCAL_MEM_FENCE); - if(p == 0){ - for(i = 0; i < BLOCK; ++i) bias_updates[filter] += part[i]; - } -} - diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index c805aa2..c69686f 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -1,7 +1,7 @@ #ifndef CONVOLUTIONAL_LAYER_H #define CONVOLUTIONAL_LAYER_H -#include "opencl.h" +#include "cuda.h" #include "image.h" #include "activations.h" @@ -27,26 +27,28 @@ typedef struct { float *output; #ifdef GPU - cl_mem filters_cl; - cl_mem filter_updates_cl; + float * filters_gpu; + float * filter_updates_gpu; - cl_mem biases_cl; - cl_mem bias_updates_cl; + float * biases_gpu; + float * bias_updates_gpu; - cl_mem col_image_cl; - cl_mem delta_cl; - cl_mem output_cl; + float * col_image_gpu; + float * delta_gpu; + float * output_gpu; #endif ACTIVATION activation; } convolutional_layer; #ifdef GPU -void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in); -void backward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in, cl_mem delta_cl); +void forward_convolutional_layer_gpu(convolutional_layer layer, float * in); +void backward_convolutional_layer_gpu(convolutional_layer layer, float * in, float * delta_gpu); void update_convolutional_layer_gpu(convolutional_layer layer); void push_convolutional_layer(convolutional_layer layer); void pull_convolutional_layer(convolutional_layer layer); +void learn_bias_convolutional_layer_ongpu(convolutional_layer layer); +void bias_output_gpu(const convolutional_layer layer); #endif convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, float learning_rate, float momentum, float decay); @@ -57,9 +59,14 @@ image *visualize_convolutional_layer(convolutional_layer layer, char *window, im void backward_convolutional_layer(convolutional_layer layer, float *in, float *delta); +void bias_output(const convolutional_layer layer); image get_convolutional_image(convolutional_layer layer); image get_convolutional_delta(convolutional_layer layer); image get_convolutional_filter(convolutional_layer layer, int i); +int convolutional_out_height(convolutional_layer layer); +int convolutional_out_width(convolutional_layer layer); +void learn_bias_convolutional_layer(convolutional_layer layer); + #endif diff --git a/src/cost_layer.c b/src/cost_layer.c index 6951956..a08562b 100644 --- a/src/cost_layer.c +++ b/src/cost_layer.c @@ -1,6 +1,7 @@ #include "cost_layer.h" #include "utils.h" -#include "mini_blas.h" +#include "cuda.h" +#include "blas.h" #include <math.h> #include <string.h> #include <stdlib.h> @@ -35,7 +36,7 @@ cost_layer *make_cost_layer(int batch, int inputs, COST_TYPE type) layer->delta = calloc(inputs*batch, sizeof(float)); layer->output = calloc(1, sizeof(float)); #ifdef GPU - layer->delta_cl = cl_make_array(layer->delta, inputs*batch); + layer->delta_gpu = cuda_make_array(layer->delta, inputs*batch); #endif return layer; } @@ -62,55 +63,25 @@ void backward_cost_layer(const cost_layer layer, float *input, float *delta) #ifdef GPU -cl_kernel get_mask_kernel() -{ - static int init = 0; - static cl_kernel kernel; - if(!init){ - kernel = get_kernel("src/axpy.cl", "mask", 0); - init = 1; - } - return kernel; -} - -void mask_ongpu(int n, cl_mem x, cl_mem mask, int mod) -{ - cl_kernel kernel = get_mask_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(kernel, i++, sizeof(n), (void*) &n); - cl.error = clSetKernelArg(kernel, i++, sizeof(x), (void*) &x); - cl.error = clSetKernelArg(kernel, i++, sizeof(mask), (void*) &mask); - cl.error = clSetKernelArg(kernel, i++, sizeof(mod), (void*) &mod); - check_error(cl); - - const size_t global_size[] = {n}; - - cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0); - check_error(cl); - -} - -void forward_cost_layer_gpu(cost_layer layer, cl_mem input, cl_mem truth) +void forward_cost_layer_gpu(cost_layer layer, float * input, float * truth) { if (!truth) return; - copy_ongpu(layer.batch*layer.inputs, truth, 1, layer.delta_cl, 1); - axpy_ongpu(layer.batch*layer.inputs, -1, input, 1, layer.delta_cl, 1); + copy_ongpu(layer.batch*layer.inputs, truth, 1, layer.delta_gpu, 1); + axpy_ongpu(layer.batch*layer.inputs, -1, input, 1, layer.delta_gpu, 1); if(layer.type==DETECTION){ - mask_ongpu(layer.inputs*layer.batch, layer.delta_cl, truth, 5); + mask_ongpu(layer.inputs*layer.batch, layer.delta_gpu, truth, 5); } - cl_read_array(layer.delta_cl, layer.delta, layer.batch*layer.inputs); + cuda_pull_array(layer.delta_gpu, layer.delta, layer.batch*layer.inputs); *(layer.output) = dot_cpu(layer.batch*layer.inputs, layer.delta, 1, layer.delta, 1); //printf("cost: %f\n", *layer.output); } -void backward_cost_layer_gpu(const cost_layer layer, cl_mem input, cl_mem delta) +void backward_cost_layer_gpu(const cost_layer layer, float * input, float * delta) { - copy_ongpu(layer.batch*layer.inputs, layer.delta_cl, 1, delta, 1); + copy_ongpu(layer.batch*layer.inputs, layer.delta_gpu, 1, delta, 1); } #endif diff --git a/src/cost_layer.h b/src/cost_layer.h index 2f1cd55..e58aae1 100644 --- a/src/cost_layer.h +++ b/src/cost_layer.h @@ -1,6 +1,5 @@ #ifndef COST_LAYER_H #define COST_LAYER_H -#include "opencl.h" typedef enum{ SSE, DETECTION @@ -13,7 +12,7 @@ typedef struct { float *output; COST_TYPE type; #ifdef GPU - cl_mem delta_cl; + float * delta_gpu; #endif } cost_layer; @@ -24,8 +23,8 @@ void forward_cost_layer(const cost_layer layer, float *input, float *truth); void backward_cost_layer(const cost_layer layer, float *input, float *delta); #ifdef GPU -void forward_cost_layer_gpu(cost_layer layer, cl_mem input, cl_mem truth); -void backward_cost_layer_gpu(const cost_layer layer, cl_mem input, cl_mem delta); +void forward_cost_layer_gpu(cost_layer layer, float * input, float * truth); +void backward_cost_layer_gpu(const cost_layer layer, float * input, float * delta); #endif #endif diff --git a/src/crop_layer.c b/src/crop_layer.c index 2a5007a..df6eb41 100644 --- a/src/crop_layer.c +++ b/src/crop_layer.c @@ -1,4 +1,5 @@ #include "crop_layer.h" +#include "cuda.h" #include <stdio.h> image get_crop_image(crop_layer layer) @@ -22,7 +23,7 @@ crop_layer *make_crop_layer(int batch, int h, int w, int c, int crop_height, int layer->crop_height = crop_height; layer->output = calloc(crop_width*crop_height * c*batch, sizeof(float)); #ifdef GPU - layer->output_cl = cl_make_array(layer->output, crop_width*crop_height*c*batch); + layer->output_gpu = cuda_make_array(layer->output, crop_width*crop_height*c*batch); #endif return layer; } @@ -53,45 +54,3 @@ void forward_crop_layer(const crop_layer layer, float *input) } } -#ifdef GPU -cl_kernel get_crop_kernel() -{ - static int init = 0; - static cl_kernel kernel; - if(!init){ - kernel = get_kernel("src/crop_layer.cl", "forward", 0); - init = 1; - } - return kernel; -} - -void forward_crop_layer_gpu(crop_layer layer, cl_mem input) -{ - int flip = (layer.flip && rand()%2); - int dh = rand()%(layer.h - layer.crop_height); - int dw = rand()%(layer.w - layer.crop_width); - int size = layer.batch*layer.c*layer.crop_width*layer.crop_height; - - cl_kernel kernel = get_crop_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(kernel, i++, sizeof(input), (void*) &input); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.c), (void*) &layer.c); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.h), (void*) &layer.h); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.w), (void*) &layer.w); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.crop_height), (void*) &layer.crop_height); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.crop_width), (void*) &layer.crop_width); - cl.error = clSetKernelArg(kernel, i++, sizeof(dh), (void*) &dh); - cl.error = clSetKernelArg(kernel, i++, sizeof(dw), (void*) &dw); - cl.error = clSetKernelArg(kernel, i++, sizeof(flip), (void*) &flip); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.output_cl), (void*) &layer.output_cl); - check_error(cl); - - const size_t global_size[] = {size}; - - cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0); - check_error(cl); -} - -#endif diff --git a/src/crop_layer.cl b/src/crop_layer.cl deleted file mode 100644 index a61b733..0000000 --- a/src/crop_layer.cl +++ /dev/null @@ -1,16 +0,0 @@ -__kernel void forward(__global float *input, int c, int h, int w, int crop_height, int crop_width, int dh, int dw, int flip, __global float *output) -{ - int id = get_global_id(0); - int count = id; - int j = id % crop_width; - id /= crop_width; - int i = id % crop_height; - id /= crop_height; - int k = id % c; - id /= c; - int b = id; - int col = (flip) ? w - dw - j - 1 : j + dw; - int row = i + dh; - int index = col+w*(row+h*(k + c*b)); - output[count] = input[index]; -} diff --git a/src/crop_layer.h b/src/crop_layer.h index 508487a..4b4ec87 100644 --- a/src/crop_layer.h +++ b/src/crop_layer.h @@ -1,7 +1,6 @@ #ifndef CROP_LAYER_H #define CROP_LAYER_H -#include "opencl.h" #include "image.h" typedef struct { @@ -12,7 +11,7 @@ typedef struct { int flip; float *output; #ifdef GPU - cl_mem output_cl; + float *output_gpu; #endif } crop_layer; @@ -21,7 +20,7 @@ crop_layer *make_crop_layer(int batch, int h, int w, int c, int crop_height, int void forward_crop_layer(const crop_layer layer, float *input); #ifdef GPU -void forward_crop_layer_gpu(crop_layer layer, cl_mem input); +void forward_crop_layer_gpu(crop_layer layer, float *input); #endif #endif diff --git a/src/crop_layer_kernels.cu b/src/crop_layer_kernels.cu new file mode 100644 index 0000000..00ecca5 --- /dev/null +++ b/src/crop_layer_kernels.cu @@ -0,0 +1,41 @@ +extern "C" { +#include "crop_layer.h" +#include "cuda.h" +} + +#define BLOCK 256 + +__global__ void forward_crop_layer_kernel(float *input, int size, int c, int h, int w, int crop_height, int crop_width, int dh, int dw, int flip, float *output) +{ + int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(id >= size) return; + + int count = id; + int j = id % crop_width; + id /= crop_width; + int i = id % crop_height; + id /= crop_height; + int k = id % c; + id /= c; + int b = id; + int col = (flip) ? w - dw - j - 1 : j + dw; + int row = i + dh; + int index = col+w*(row+h*(k + c*b)); + output[count] = input[index]; +} + +extern "C" void forward_crop_layer_gpu(crop_layer layer, float *input) +{ + int flip = (layer.flip && rand()%2); + int dh = rand()%(layer.h - layer.crop_height); + int dw = rand()%(layer.w - layer.crop_width); + int size = layer.batch*layer.c*layer.crop_width*layer.crop_height; + + dim3 dimBlock(BLOCK, 1, 1); + dim3 dimGrid((size-1)/BLOCK + 1, 1, 1); + + forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK>>>(input, size, layer.c, layer.h, layer.w, + layer.crop_height, layer.crop_width, dh, dw, flip, layer.output_gpu); + check_error(cudaPeekAtLastError()); +} + diff --git a/src/cuda.c b/src/cuda.c new file mode 100644 index 0000000..27153ea --- /dev/null +++ b/src/cuda.c @@ -0,0 +1,99 @@ +#include "cuda.h" +#include "utils.h" +#include "blas.h" +#include <stdlib.h> + +int gpu_index = 0; + +void check_error(cudaError_t status) +{ + if (status != cudaSuccess) + { + const char *s = cudaGetErrorString(status); + char buffer[256]; + printf("CUDA Error: %s\n", s); + snprintf(buffer, 256, "CUDA Error: %s", s); + error(buffer); + } +} + +dim3 cuda_gridsize(size_t n){ + size_t k = (n-1) / BLOCK + 1; + size_t x = k; + size_t y = 1; + if(x > 65535){ + x = ceil(sqrt(k)); + y = (n-1)/(x*BLOCK) + 1; + } + dim3 d = {x, y, 1}; + //printf("%ld %ld %ld %ld\n", n, x, y, x*y*BLOCK); + return d; +} + +cublasHandle_t blas_handle() +{ + static int init = 0; + static cublasHandle_t handle; + if(!init) { + cublasCreate(&handle); + init = 1; + } + return handle; +} + +float *cuda_make_array(float *x, int n) +{ + float *x_gpu; + size_t size = sizeof(float)*n; + cudaError_t status = cudaMalloc((void **)&x_gpu, size); + check_error(status); + if(x){ + status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice); + check_error(status); + } + return x_gpu; +} + +float cuda_compare(float *x_gpu, float *x, int n, char *s) +{ + float *tmp = calloc(n, sizeof(float)); + cuda_pull_array(x_gpu, tmp, n); + //int i; + //for(i = 0; i < n; ++i) printf("%f %f\n", tmp[i], x[i]); + axpy_cpu(n, -1, x, 1, tmp, 1); + float err = dot_cpu(n, tmp, 1, tmp, 1); + printf("Error %s: %f\n", s, sqrt(err/n)); + free(tmp); + return err; +} + +int *cuda_make_int_array(int n) +{ + int *x_gpu; + size_t size = sizeof(int)*n; + cudaError_t status = cudaMalloc((void **)&x_gpu, size); + check_error(status); + return x_gpu; +} + +void cuda_free(float *x_gpu) +{ + cudaError_t status = cudaFree(x_gpu); + check_error(status); +} + +void cuda_push_array(float *x_gpu, float *x, int n) +{ + size_t size = sizeof(float)*n; + cudaError_t status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice); + check_error(status); +} + +void cuda_pull_array(float *x_gpu, float *x, int n) +{ + size_t size = sizeof(float)*n; + cudaError_t status = cudaMemcpy(x, x_gpu, size, cudaMemcpyDeviceToHost); + check_error(status); +} + + diff --git a/src/cuda.h b/src/cuda.h new file mode 100644 index 0000000..08c0340 --- /dev/null +++ b/src/cuda.h @@ -0,0 +1,21 @@ +#ifndef CUDA_H +#define CUDA_H + +#define BLOCK 256 + +#include "cuda_runtime.h" +#include "cublas_v2.h" + +extern int gpu_index; + +void check_error(cudaError_t status); +cublasHandle_t blas_handle(); +float *cuda_make_array(float *x, int n); +int *cuda_make_int_array(int n); +void cuda_push_array(float *x_gpu, float *x, int n); +void cuda_pull_array(float *x_gpu, float *x, int n); +void cuda_free(float *x_gpu); +float cuda_compare(float *x_gpu, float *x, int n, char *s); +dim3 cuda_gridsize(size_t n); + +#endif diff --git a/src/data.c b/src/data.c index ec9f9a7..31aca3b 100644 --- a/src/data.c +++ b/src/data.c @@ -239,6 +239,7 @@ void *load_in_thread(void *ptr) { struct load_args a = *(struct load_args*)ptr; *a.d = load_data(a.paths, a.n, a.m, a.labels, a.k, a.h, a.w); + normalize_data_rows(*a.d); free(ptr); return 0; } diff --git a/src/dropout_layer.c b/src/dropout_layer.c index edcb426..3a3e4cb 100644 --- a/src/dropout_layer.c +++ b/src/dropout_layer.c @@ -1,5 +1,6 @@ #include "dropout_layer.h" #include "utils.h" +#include "cuda.h" #include <stdlib.h> #include <stdio.h> @@ -14,8 +15,8 @@ dropout_layer *make_dropout_layer(int batch, int inputs, float probability) layer->rand = calloc(inputs*batch, sizeof(float)); layer->scale = 1./(1.-probability); #ifdef GPU - layer->output_cl = cl_make_array(layer->output, inputs*batch); - layer->rand_cl = cl_make_array(layer->rand, inputs*batch); + layer->output_gpu = cuda_make_array(layer->output, inputs*batch); + layer->rand_gpu = cuda_make_array(layer->rand, inputs*batch); #endif return layer; } @@ -42,61 +43,3 @@ void backward_dropout_layer(dropout_layer layer, float *delta) } } -#ifdef GPU -cl_kernel get_dropout_kernel() -{ - static int init = 0; - static cl_kernel kernel; - if(!init){ - kernel = get_kernel("src/dropout_layer.cl", "yoloswag420blazeit360noscope", 0); - init = 1; - } - return kernel; -} - -void forward_dropout_layer_gpu(dropout_layer layer, cl_mem input) -{ - int j; - int size = layer.inputs*layer.batch; - for(j = 0; j < size; ++j) layer.rand[j] = rand_uniform(); - cl_write_array(layer.rand_cl, layer.rand, layer.inputs*layer.batch); - - cl_kernel kernel = get_dropout_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(kernel, i++, sizeof(input), (void*) &input); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.rand_cl), (void*) &layer.rand_cl); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.probability), (void*) &layer.probability); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.scale), (void*) &layer.scale); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.output_cl), (void*) &layer.output_cl); - check_error(cl); - - const size_t global_size[] = {size}; - - cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0); - check_error(cl); -} - -void backward_dropout_layer_gpu(dropout_layer layer, cl_mem delta) -{ - if(!delta) return; - int size = layer.inputs*layer.batch; - - cl_kernel kernel = get_dropout_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(kernel, i++, sizeof(delta), (void*) &delta); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.rand_cl), (void*) &layer.rand_cl); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.probability), (void*) &layer.probability); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.scale), (void*) &layer.scale); - cl.error = clSetKernelArg(kernel, i++, sizeof(delta), (void*) &delta); - check_error(cl); - - const size_t global_size[] = {size}; - - cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0); - check_error(cl); -} -#endif diff --git a/src/dropout_layer.cl b/src/dropout_layer.cl deleted file mode 100644 index 341b80f..0000000 --- a/src/dropout_layer.cl +++ /dev/null @@ -1,5 +0,0 @@ -__kernel void yoloswag420blazeit360noscope(__global float *input, __global float *rand, float prob, float scale, __global float *output) -{ - int id = get_global_id(0); - output[id] = (rand[id] < prob) ? 0 : input[id]*scale; -} diff --git a/src/dropout_layer.h b/src/dropout_layer.h index 12a819e..55b63ac 100644 --- a/src/dropout_layer.h +++ b/src/dropout_layer.h @@ -1,6 +1,5 @@ #ifndef DROPOUT_LAYER_H #define DROPOUT_LAYER_H -#include "opencl.h" typedef struct{ int batch; @@ -10,8 +9,8 @@ typedef struct{ float *rand; float *output; #ifdef GPU - cl_mem rand_cl; - cl_mem output_cl; + float * rand_gpu; + float * output_gpu; #endif } dropout_layer; @@ -21,8 +20,8 @@ void forward_dropout_layer(dropout_layer layer, float *input); void backward_dropout_layer(dropout_layer layer, float *delta); #ifdef GPU -void forward_dropout_layer_gpu(dropout_layer layer, cl_mem input); -void backward_dropout_layer_gpu(dropout_layer layer, cl_mem delta); +void forward_dropout_layer_gpu(dropout_layer layer, float * input); +void backward_dropout_layer_gpu(dropout_layer layer, float * delta); #endif #endif diff --git a/src/dropout_layer_kernels.cu b/src/dropout_layer_kernels.cu new file mode 100644 index 0000000..371f0dc --- /dev/null +++ b/src/dropout_layer_kernels.cu @@ -0,0 +1,33 @@ +extern "C" { +#include "dropout_layer.h" +#include "cuda.h" +#include "utils.h" +} + +__global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand, float prob, float scale, float *output) +{ + int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(id < size) output[id] = (rand[id] < prob) ? 0 : input[id]*scale; +} + +extern "C" void forward_dropout_layer_gpu(dropout_layer layer, float * input) +{ + int j; + int size = layer.inputs*layer.batch; + for(j = 0; j < size; ++j) layer.rand[j] = rand_uniform(); + cuda_push_array(layer.rand_gpu, layer.rand, layer.inputs*layer.batch); + + yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(input, size, layer.rand_gpu, layer.probability, + layer.scale, layer.output_gpu); + check_error(cudaPeekAtLastError()); +} + +extern "C" void backward_dropout_layer_gpu(dropout_layer layer, float *delta) +{ + if(!delta) return; + int size = layer.inputs*layer.batch; + + yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(delta, size, layer.rand_gpu, layer.probability, + layer.scale, delta); + check_error(cudaPeekAtLastError()); +} diff --git a/src/gemm.c b/src/gemm.c index 9797b85..a923beb 100644 --- a/src/gemm.c +++ b/src/gemm.c @@ -1,5 +1,44 @@ -#include "mini_blas.h" +#include "gemm.h" #include "utils.h" +#include "cuda.h" +#include <stdlib.h> +#include <stdio.h> +#include <math.h> + +float *random_matrix(int rows, int cols) +{ + int i; + float *m = calloc(rows*cols, sizeof(float)); + for(i = 0; i < rows*cols; ++i){ + m[i] = (float)rand()/RAND_MAX; + } + return m; +} + +void time_random_matrix(int TA, int TB, int m, int k, int n) +{ + float *a; + if(!TA) a = random_matrix(m,k); + else a = random_matrix(k,m); + int lda = (!TA)?k:m; + float *b; + if(!TB) b = random_matrix(k,n); + else b = random_matrix(n,k); + int ldb = (!TB)?n:k; + + float *c = random_matrix(m,n); + int i; + clock_t start = clock(), end; + for(i = 0; i<10; ++i){ + gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n); + } + end = clock(); + printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC); + free(a); + free(b); + free(c); +} + void gemm(int TA, int TB, int M, int N, int K, float ALPHA, float *A, int lda, @@ -102,176 +141,18 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA, #ifdef GPU -#include "opencl.h" #include <math.h> -#ifdef CLBLAS -#include <clBLAS.h> -#endif - -#define STR_HELPER(x) #x -#define STR(x) STR_HELPER(x) - -#ifdef __APPLE__ -#define BLOCK 1 -#else -#define BLOCK 16 -#endif - -cl_kernel get_gemm_kernel() -{ - static int init = 0; - static cl_kernel gemm_kernel; - if(!init){ - gemm_kernel = get_kernel("src/gemm.cl", "gemm", "-D BLOCK=" STR(BLOCK) ); - init = 1; - } - return gemm_kernel; -} - -cl_kernel get_gemm_nt_kernel() -{ - static int init = 0; - static cl_kernel gemm_kernel; - if(!init){ - gemm_kernel = get_kernel("src/gemm.cl", "gemm_nt", "-D BLOCK=" STR(BLOCK) ); - init = 1; - } - return gemm_kernel; -} - -cl_kernel get_gemm_tn_kernel() -{ - static int init = 0; - static cl_kernel gemm_kernel; - if(!init){ - gemm_kernel = get_kernel("src/gemm.cl", "gemm_tn", "-D BLOCK=" STR(BLOCK) ); - init = 1; - } - return gemm_kernel; -} - -cl_kernel get_gemm_nn_kernel() -{ - static int init = 0; - static cl_kernel gemm_kernel; - if(!init){ - gemm_kernel = get_kernel("src/gemm.cl", "gemm_nn", "-D BLOCK=" STR(BLOCK) ); - init = 1; - } - return gemm_kernel; -} - -#define TILE 64 -#define TILE_K 16 -#define THREADS 64 - -cl_kernel get_gemm_nn_fast_kernel() -{ - static int init = 0; - static cl_kernel gemm_kernel; - if(!init){ - gemm_kernel = get_kernel("src/gemm_fast.cl", "gemm_nn_fast", "-D TILE=" STR(TILE) - " -cl-nv-verbose " - " -D TILE_K=" STR(TILE_K) - " -D THREADS=" STR(THREADS)); - init = 1; - } - return gemm_kernel; -} - void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, - cl_mem A_gpu, int lda, - cl_mem B_gpu, int ldb, + float *A_gpu, int lda, + float *B_gpu, int ldb, float BETA, - cl_mem C_gpu, int ldc) + float *C_gpu, int ldc) { - gemm_ongpu_offset(TA, TB, M, N, K, ALPHA, A_gpu, 0, lda, B_gpu, 0, ldb, BETA, C_gpu, 0, ldc); -} - -void gemm_ongpu_fast(int TA, int TB, int M, int N, int K, float ALPHA, - cl_mem A_gpu, int lda, - cl_mem B_gpu, int ldb, - float BETA, - cl_mem C_gpu, int ldc) -{ - int a_off = 0; - int b_off = 0; - int c_off = 0; - //printf("gpu: %d %d %d %d %d\n",TA, TB, M, N, K); - cl_kernel gemm_kernel = get_gemm_nn_fast_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TA), (void*) &TA); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TB), (void*) &TB); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(M), (void*) &M); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(N), (void*) &N); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(K), (void*) &K); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ALPHA), (void*) &ALPHA); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(A_gpu), (void*) &A_gpu); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(a_off), (void*) &a_off); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(lda), (void*) &lda); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(B_gpu), (void*) &B_gpu); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(b_off), (void*) &b_off); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldb), (void*) &ldb); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(BETA), (void*) &BETA); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(C_gpu), (void*) &C_gpu); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(c_off), (void*) &c_off); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc); - check_error(cl); - - const size_t global_size[] = {THREADS*((N-1)/TILE + 1), (M-1)/TILE + 1}; - const size_t local_size[] = {THREADS, 1}; - - cl.error = clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, local_size, 0, 0, 0); - check_error(cl); -} - -void gemm_ongpu_offset(int TA, int TB, int M, int N, int K, float ALPHA, - cl_mem A_gpu, int a_off, int lda, - cl_mem B_gpu, int b_off, int ldb, - float BETA, - cl_mem C_gpu, int c_off, int ldc) -{ -#ifdef CLBLAS - cl_command_queue queue = cl.queue; - cl_event event; - cl.error = clblasSgemm(clblasRowMajor, TA?clblasTrans:clblasNoTrans, TB?clblasTrans:clblasNoTrans,M, N, K,ALPHA, A_gpu, a_off, lda,B_gpu, b_off, ldb,BETA, C_gpu, c_off, ldc,1, &queue, 0, NULL, &event); - check_error(cl); -#else - //printf("gpu: %d %d %d %d %d\n",TA, TB, M, N, K); - cl_kernel gemm_kernel = get_gemm_kernel(); - if(!TA && !TB) gemm_kernel = get_gemm_nn_kernel(); - if(!TA && TB) gemm_kernel = get_gemm_nt_kernel(); - if(TA && !TB) gemm_kernel = get_gemm_tn_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TA), (void*) &TA); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TB), (void*) &TB); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(M), (void*) &M); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(N), (void*) &N); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(K), (void*) &K); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ALPHA), (void*) &ALPHA); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(A_gpu), (void*) &A_gpu); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(a_off), (void*) &a_off); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(lda), (void*) &lda); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(B_gpu), (void*) &B_gpu); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(b_off), (void*) &b_off); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldb), (void*) &ldb); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(BETA), (void*) &BETA); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(C_gpu), (void*) &C_gpu); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(c_off), (void*) &c_off); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc); - check_error(cl); - - const size_t global_size[] = {ceil((float)N/BLOCK)*BLOCK, ceil((float)M/BLOCK)*BLOCK}; - const size_t local_size[] = {BLOCK, BLOCK}; - - cl.error = clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, local_size, 0, 0, 0); - check_error(cl); -#endif + cublasHandle_t handle = blas_handle(); + cudaError_t status = cublasSgemm(handle, (TB ? CUBLAS_OP_T : CUBLAS_OP_N), + (TA ? CUBLAS_OP_T : CUBLAS_OP_N), N, M, K, &ALPHA, B_gpu, ldb, A_gpu, lda, &BETA, C_gpu, ldc); + check_error(status); } void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, @@ -280,37 +161,16 @@ void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, float BETA, float *C, int ldc) { - cl_context context = cl.context; - cl_command_queue queue = cl.queue; - - size_t size = sizeof(float)*(TA ? lda*K:lda*M); - cl_mem A_gpu = clCreateBuffer(context, - CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, - size, A, &cl.error); - check_error(cl); - - size = sizeof(float)*(TB ? ldb*N:ldb*K); - cl_mem B_gpu = clCreateBuffer(context, - CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, - size, B, &cl.error); - check_error(cl); - - size = sizeof(float)*(ldc*M); - cl_mem C_gpu = clCreateBuffer(context, - CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, - size, C, &cl.error); - check_error(cl); - - // TODO - //gemm_ongpu(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc); - gemm_ongpu_fast(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc); - - clEnqueueReadBuffer(queue, C_gpu, CL_TRUE, 0, size, C, 0, 0, 0); - check_error(cl); - - clReleaseMemObject(A_gpu); - clReleaseMemObject(B_gpu); - clReleaseMemObject(C_gpu); + float *A_gpu = cuda_make_array(A, (TA ? lda*K:lda*M)); + float *B_gpu = cuda_make_array(B, (TB ? ldb*N : ldb*K)); + float *C_gpu = cuda_make_array(C, ldc*M); + + gemm_ongpu(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc); + + cuda_pull_array(C_gpu, C, ldc*M); + cuda_free(A_gpu); + cuda_free(B_gpu); + cuda_free(C_gpu); } #include <stdio.h> @@ -353,60 +213,29 @@ void time_ongpu(int TA, int TB, int m, int k, int n) float *c = random_matrix(m,n); - cl_mem a_cl = cl_make_array(a, m*k); - cl_mem b_cl = cl_make_array(b, k*n); - cl_mem c_cl = cl_make_array(c, m*n); + float *a_cl = cuda_make_array(a, m*k); + float *b_cl = cuda_make_array(b, k*n); + float *c_cl = cuda_make_array(c, m*n); int i; clock_t start = clock(), end; for(i = 0; i<iter; ++i){ gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n); + cudaThreadSynchronize(); } double flop = ((double)m)*n*(2.*k + 2.)*iter; double gflop = flop/pow(10., 9); end = clock(); double seconds = sec(end-start); printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",m,k,k,n, TA, TB, seconds, gflop/seconds); - clReleaseMemObject(a_cl); - clReleaseMemObject(b_cl); - clReleaseMemObject(c_cl); + cuda_free(a_cl); + cuda_free(b_cl); + cuda_free(c_cl); free(a); free(b); free(c); } -void time_ongpu_fast(int TA, int TB, int m, int k, int n) -{ - int iter = 10; - float *a = random_matrix(m,k); - float *b = random_matrix(k,n); - - int lda = (!TA)?k:m; - int ldb = (!TB)?n:k; - - float *c = random_matrix(m,n); - - cl_mem a_cl = cl_make_array(a, m*k); - cl_mem b_cl = cl_make_array(b, k*n); - cl_mem c_cl = cl_make_array(c, m*n); - - int i; - clock_t start = clock(), end; - for(i = 0; i<iter; ++i){ - gemm_ongpu_fast(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n); - } - double flop = ((double)m)*n*(2.*k + 2.)*iter; - double gflop = flop/pow(10., 9); - end = clock(); - double seconds = sec(end-start); - printf("Fast Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",m,k,k,n, TA, TB, seconds, gflop/seconds); - clReleaseMemObject(a_cl); - clReleaseMemObject(b_cl); - clReleaseMemObject(c_cl); - free(a); - free(b); - free(c); -} void test_gpu_accuracy(int TA, int TB, int m, int k, int n) { @@ -429,6 +258,7 @@ void test_gpu_accuracy(int TA, int TB, int m, int k, int n) gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c_gpu,n); //printf("GPU\n"); //pm(m, n, c_gpu); + gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n); //printf("\n\nCPU\n"); //pm(m, n, c); @@ -444,9 +274,8 @@ void test_gpu_accuracy(int TA, int TB, int m, int k, int n) free(c_gpu); } -void test_gpu_blas() +int test_gpu_blas() { - /* test_gpu_accuracy(0,0,10,576,75); test_gpu_accuracy(0,0,17,10,10); @@ -458,73 +287,20 @@ void test_gpu_blas() test_gpu_accuracy(1,0,1000,10,100); test_gpu_accuracy(0,1,1000,10,100); test_gpu_accuracy(1,1,1000,10,100); - */ - test_gpu_accuracy(0,0,128,128,128); + test_gpu_accuracy(0,0,10,10,10); time_ongpu(0,0,64,2916,363); - time_ongpu_fast(0,0,64,2916,363); time_ongpu(0,0,64,2916,363); - time_ongpu_fast(0,0,64,2916,363); time_ongpu(0,0,64,2916,363); - time_ongpu_fast(0,0,64,2916,363); time_ongpu(0,0,192,729,1600); - time_ongpu_fast(0,0,192,729,1600); time_ongpu(0,0,384,196,1728); - time_ongpu_fast(0,0,384,196,1728); time_ongpu(0,0,256,196,3456); - time_ongpu_fast(0,0,256,196,3456); time_ongpu(0,0,256,196,2304); - time_ongpu_fast(0,0,256,196,2304); time_ongpu(0,0,128,4096,12544); - time_ongpu_fast(0,0,128,4096,12544); time_ongpu(0,0,128,4096,4096); - time_ongpu_fast(0,0,128,4096,4096); -// time_ongpu(1,0,2304,196,256); -// time_ongpu_fast(1,0,2304,196,256); -// time_ongpu(0,1,256,2304,196); -// time_ongpu_fast(0,1,256,2304,196); - - time_ongpu(0,0,2048,2048,2048); - time_ongpu_fast(0,0,2048,2048,2048); - time_ongpu(0,0,2048,2048,2048); - time_ongpu_fast(0,0,2048,2048,2048); - time_ongpu(0,0,2048,2048,2048); - time_ongpu_fast(0,0,2048,2048,2048); - - /* - test_gpu_accuracy(0,0,131,4093,1199); - test_gpu_accuracy(0,1,131,4093,1199); - test_gpu_accuracy(1,0,131,4093,1199); - test_gpu_accuracy(1,1,131,4093,1199); - */ - /* - - time_ongpu(0,0,1024,1024,1024); - time_ongpu(0,1,1024,1024,1024); - time_ongpu(1,0,1024,1024,1024); - time_ongpu(1,1,1024,1024,1024); - - time_ongpu(0,0,128,4096,1200); - time_ongpu(0,1,128,4096,1200); - time_ongpu(1,0,128,4096,1200); - time_ongpu(1,1,128,4096,1200); - */ - - /* - time_gpu_random_matrix(0,0,1000,1000,100); - time_random_matrix(0,0,1000,1000,100); - - time_gpu_random_matrix(0,1,1000,1000,100); - time_random_matrix(0,1,1000,1000,100); - - time_gpu_random_matrix(1,0,1000,1000,100); - time_random_matrix(1,0,1000,1000,100); - - time_gpu_random_matrix(1,1,1000,1000,100); - time_random_matrix(1,1,1000,1000,100); - */ +return 0; } #endif diff --git a/src/gemm.cl b/src/gemm.cl deleted file mode 100644 index 63293eb..0000000 --- a/src/gemm.cl +++ /dev/null @@ -1,217 +0,0 @@ -__kernel void gemm_tn(int TA, int TB, int M, int N, int K, float ALPHA, - __global float *A, int a_off, int lda, - __global float *B, int b_off, int ldb, - float BETA, - __global float *C, int c_off, int ldc) -{ - A += a_off; - B += b_off; - C += c_off; - __local float Asub[BLOCK][BLOCK]; - __local float Bsub[BLOCK][BLOCK]; - - int col = get_global_id(0); - int row = get_global_id(1); - - int col_block = get_group_id(0); - int row_block = get_group_id(1); - - col = (col < N) ? col : N - 1; - row = (row < M) ? row : M - 1; - - int x = get_local_id(0); - int y = get_local_id(1); - - int i,j; - - float val = 0; - float orig = C[row*ldc + col]; - - for(i = 0; i < K; i += BLOCK){ - - int arow = y + i; - int acol = x + row_block*BLOCK; - - int brow = y + i; - int bcol = col; - - arow = (arow < K) ? arow : K-1; - acol = (acol < M) ? acol : M-1; - brow = (brow < K) ? brow : K-1; - - int aind = arow*lda + acol; - int bind = brow*ldb + bcol; - - Asub[x][y] = A[aind]; - Bsub[y][x] = B[bind]; - - barrier(CLK_LOCAL_MEM_FENCE); - - for(j = 0; j < BLOCK && i+j<K; ++j){ - val += Asub[y][j]*Bsub[j][x]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - C[row*ldc+col] = ALPHA*val + BETA*orig; -} - -__kernel void gemm_nt(int TA, int TB, int M, int N, int K, float ALPHA, - __global float *A, int a_off, int lda, - __global float *B, int b_off, int ldb, - float BETA, - __global float *C, int c_off, int ldc) -{ - A += a_off; - B += b_off; - C += c_off; - __local float Asub[BLOCK][BLOCK]; - __local float Bsub[BLOCK][BLOCK]; - - - int col = get_global_id(0); - int row = get_global_id(1); - - int col_block = get_group_id(0); - int row_block = get_group_id(1); - - col = (col < N) ? col : N - 1; - row = (row < M) ? row : M - 1; - - int x = get_local_id(0); - int y = get_local_id(1); - - int i,j; - - float val = 0; - float orig = C[row*ldc + col]; - - for(i = 0; i < K; i += BLOCK){ - - int arow = row; - int acol = x + i; - - int brow = col_block*BLOCK + y; - int bcol = x + i; - - brow = (brow < N) ? brow : N-1; - acol = (acol < K) ? acol : K-1; - bcol = (bcol < K) ? bcol : K-1; - - int aind = arow*lda + acol; - int bind = brow*ldb + bcol; - - Asub[y][x] = A[aind]; - Bsub[x][y] = B[bind]; - - barrier(CLK_LOCAL_MEM_FENCE); - - for(j = 0; j < BLOCK && i+j<K; ++j){ - val += Asub[y][j]*Bsub[j][x]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - C[row*ldc+col] = ALPHA*val + BETA*orig; -} - -__kernel void gemm_nn(int TA, int TB, int M, int N, int K, float ALPHA, - __global float *A, int a_off, int lda, - __global float *B, int b_off, int ldb, - float BETA, - __global float *C, int c_off, int ldc) -{ - A += a_off; - B += b_off; - C += c_off; - __local float Asub[BLOCK][BLOCK]; - __local float Bsub[BLOCK][BLOCK]; - - int col = get_global_id(0); - int row = get_global_id(1); - - col = (col < N) ? col : N - 1; - row = (row < M) ? row : M - 1; - - int x = get_local_id(0); - int y = get_local_id(1); - - int i,j; - - float orig = C[row*ldc+col]; - float val = 0; - - for(i = 0; i < K; i += BLOCK){ - - int arow = row; - int acol = x + i; - - int brow = y + i; - int bcol = col; - - acol = (acol < K) ? acol : K-1; - brow = (brow < K) ? brow : K-1; - - int aind = arow*lda + acol; - int bind = brow*ldb + bcol; - - Asub[y][x] = A[aind]; - Bsub[y][x] = B[bind]; - - barrier(CLK_LOCAL_MEM_FENCE); - - for(j = 0; j < BLOCK && i+j<K; ++j){ - val += Asub[y][j]*Bsub[j][x]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - C[row*ldc+col] = ALPHA*val + BETA*orig; -} - -__kernel void gemm(int TA, int TB, int M, int N, int K, float ALPHA, - __global float *A, int a_off, int lda, - __global float *B, int b_off, int ldb, - float BETA, - __global float *C, int c_off, int ldc) -{ - A += a_off; - B += b_off; - C += c_off; - __local float Asub[BLOCK][BLOCK]; - __local float Bsub[BLOCK][BLOCK]; - - float val = 0; - - int row_block = get_group_id(1); - int col_block = get_group_id(0); - - int sub_row = get_local_id(1); - int sub_col = get_local_id(0); - - int row = row_block*BLOCK + sub_row; - int col = col_block*BLOCK + sub_col; - - int i,j; - for(i = 0; i < K; i += BLOCK){ - int arow = row_block*BLOCK + sub_row; - int acol = i + sub_col; - - int brow = i + sub_row; - int bcol = col_block*BLOCK + sub_col; - - if(arow < M && acol < K)Asub[sub_row][sub_col] = TA ? A[arow + acol*lda] : A[arow*lda + acol]; - if(brow < K && bcol < N)Bsub[sub_row][sub_col] = TB ? B[brow + bcol*ldb] : B[brow*ldb + bcol]; - - barrier(CLK_LOCAL_MEM_FENCE); - - for(j = 0; j < BLOCK && i+j<K; ++j){ - val += Asub[sub_row][j]*Bsub[j][sub_col]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if(row < M && col < N){ - C[row*ldc+col] = ALPHA*val + BETA*C[row*ldc+col]; - } -} diff --git a/src/gemm.h b/src/gemm.h new file mode 100644 index 0000000..602919f --- /dev/null +++ b/src/gemm.h @@ -0,0 +1,29 @@ +#ifndef GEMM_H +#define GEMM_H + +void gemm(int TA, int TB, int M, int N, int K, float ALPHA, + float *A, int lda, + float *B, int ldb, + float BETA, + float *C, int ldc); + +void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA, + float *A, int lda, + float *B, int ldb, + float BETA, + float *C, int ldc); + +#ifdef GPU +void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, + float *A_gpu, int lda, + float *B_gpu, int ldb, + float BETA, + float *C_gpu, int ldc); + +void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, + float *A, int lda, + float *B, int ldb, + float BETA, + float *C, int ldc); +#endif +#endif diff --git a/src/gemm_fast.cl b/src/gemm_fast.cl deleted file mode 100644 index 2a76396..0000000 --- a/src/gemm_fast.cl +++ /dev/null @@ -1,76 +0,0 @@ - -__kernel void gemm_nn_fast(int TA, int TB, int M, int N, int K, float ALPHA, - __global float *A, int a_off, int lda, - __global float *B, int b_off, int ldb, - float BETA, - __global float *C, int c_off, int ldc) -{ - int i, j, k, x, y; - A += a_off; - B += b_off; - C += c_off; - - __local float Asub[TILE] [TILE_K]; - __local float Bsub[TILE_K][TILE]; - - int ctile = get_group_id(0); - int rtile = get_group_id(1); - - float Areg[TILE]; - float acc[TILE][TILE/THREADS]; - - A += rtile*TILE*lda; - B += ctile*TILE; - C += rtile*TILE*ldc + ctile*TILE; - - for(i = 0; i < TILE; ++i){ - for(j = 0; j < TILE/THREADS; ++j){ - acc[i][j] = 0; - } - } - - int offset = get_local_id(0); - - for(i = 0; i < K; i += TILE_K){ - for(j = 0; j < TILE*TILE_K; j += THREADS){ - int index = j+offset; - - int row = index / TILE_K; - int col = index % TILE_K; - Asub[row][col] = A[row*lda + col]; - - row = index / TILE; - col = index % TILE; - Bsub[row][col] = B[row*ldb + col]; - } - - A += TILE_K; - B += TILE_K*ldb; - - barrier(CLK_LOCAL_MEM_FENCE); - - for(k = 0; k < TILE_K; ++k){ - #pragma unroll - for(y = 0; y < TILE; ++y){ - Areg[y] = Asub[y][k]; - } - for(x = 0; x < TILE; x += THREADS){ - float Breg = Bsub[k][x+offset]; - #pragma unroll - for(y = 0; y < TILE; ++y){ - acc[y][x/THREADS] += Breg * Areg[y]; - } - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - for(i = 0; i < TILE; ++i){ - for(j = 0; j < TILE/THREADS; ++j){ - int col = j*THREADS + offset; - int row = i; - C[row*ldc+col] = ALPHA*acc[i][j] + BETA*C[row*ldc+col]; - } - } -} - diff --git a/src/im2col.c b/src/im2col.c index 2af13e9..6970c55 100644 --- a/src/im2col.c +++ b/src/im2col.c @@ -1,4 +1,4 @@ -#include "mini_blas.h" +#include "im2col.h" #include <stdio.h> inline float im2col_get_pixel(float *im, int height, int width, int channels, int row, int col, int channel, int pad) @@ -42,104 +42,3 @@ void im2col_cpu(float* data_im, } } - -#ifdef GPU - -#include "opencl.h" -#include <math.h> - -cl_kernel get_im2col_pad_kernel() -{ - static int init = 0; - static cl_kernel im2col_kernel; - if(!init){ - im2col_kernel = get_kernel("src/im2col.cl", "im2col_pad", 0); - init = 1; - } - return im2col_kernel; -} - -cl_kernel get_im2col_nopad_kernel() -{ - static int init = 0; - static cl_kernel im2col_kernel; - if(!init){ - im2col_kernel = get_kernel("src/im2col.cl", "im2col_nopad", 0); - init = 1; - } - return im2col_kernel; -} - - -void im2col_ongpu(cl_mem data_im, int offset, - int channels, int height, int width, - int ksize, int stride, int pad, cl_mem data_col) -{ - - int height_col = (height - ksize) / stride + 1; - int width_col = (width - ksize) / stride + 1; - int channels_col = channels * ksize * ksize; - cl_kernel kernel = get_im2col_nopad_kernel(); - - if (pad){ - height_col = 1 + (height-1) / stride; - width_col = 1 + (width-1) / stride; - kernel = get_im2col_pad_kernel(); - } - - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(kernel, i++, sizeof(data_im), (void*) &data_im); - cl.error = clSetKernelArg(kernel, i++, sizeof(offset), (void*) &offset); - cl.error = clSetKernelArg(kernel, i++, sizeof(channels), (void*) &channels); - cl.error = clSetKernelArg(kernel, i++, sizeof(height), (void*) &height); - cl.error = clSetKernelArg(kernel, i++, sizeof(width), (void*) &width); - cl.error = clSetKernelArg(kernel, i++, sizeof(ksize), (void*) &ksize); - cl.error = clSetKernelArg(kernel, i++, sizeof(stride), (void*) &stride); - cl.error = clSetKernelArg(kernel, i++, sizeof(data_col), (void*) &data_col); - check_error(cl); - - size_t global_size = channels_col*height_col*width_col; - - cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, - &global_size, 0, 0, 0, 0); - check_error(cl); -} - -/* - void im2col_gpu(float *data_im, - int channels, int height, int width, - int ksize, int stride, int pad, float *data_col) - { - cl_context context = cl.context; - cl_command_queue queue = cl.queue; - - size_t size = sizeof(float)*(channels*height*width*batch); - cl_mem im_gpu = clCreateBuffer(context, - CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, - size, data_im, &cl.error); - check_error(cl); - - int height_col = (height - ksize) / stride + 1; - int width_col = (width - ksize) / stride + 1; - int channels_col = channels * ksize * ksize; - - size = sizeof(float)*(height_col*width_col*channels_col*batch); - cl_mem col_gpu = clCreateBuffer(context, - CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR, - size, data_col, &cl.error); - check_error(cl); - - im2col_ongpu(im_gpu, batch, channels, height, width, - ksize, stride, pad, col_gpu); - - clEnqueueReadBuffer(queue, col_gpu, CL_TRUE, 0, size, data_col, 0, 0, 0); - check_error(cl); - - clReleaseMemObject(col_gpu); - clReleaseMemObject(im_gpu); - } - */ - -#endif diff --git a/src/im2col.h b/src/im2col.h new file mode 100644 index 0000000..b939043 --- /dev/null +++ b/src/im2col.h @@ -0,0 +1,15 @@ +#ifndef IM2COL_H +#define IM2COL_H + +void im2col_cpu(float* data_im, + int channels, int height, int width, + int ksize, int stride, int pad, float* data_col); + +#ifdef GPU + +void im2col_ongpu(float *im, int offset, + int channels, int height, int width, + int ksize, int stride, int pad,float *data_col); + +#endif +#endif diff --git a/src/im2col.cl b/src/im2col_kernels.cu similarity index 57% rename from src/im2col.cl rename to src/im2col_kernels.cu index 31f7db6..feaf44d 100644 --- a/src/im2col.cl +++ b/src/im2col_kernels.cu @@ -1,7 +1,11 @@ +extern "C" { +#include "im2col.h" +#include "cuda.h" +} -__kernel void im2col_pad(__global float *im, int offset, +__global__ void im2col_pad_kernel(float *im, int offset, int channels, int height, int width, - int ksize, int stride, __global float *data_col) + int ksize, int stride, float *data_col) { int c,h,w; int height_col = 1 + (height-1) / stride; @@ -10,7 +14,10 @@ __kernel void im2col_pad(__global float *im, int offset, int pad = ksize/2; - int id = get_global_id(0); + int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + int col_size = height_col*width_col*channels_col; + if (id >= col_size) return; + int col_index = id; w = id % width_col; id /= width_col; @@ -19,7 +26,6 @@ __kernel void im2col_pad(__global float *im, int offset, c = id % channels_col; id /= channels_col; - int col_size = height_col*width_col*channels_col; int w_offset = c % ksize; int h_offset = (c / ksize) % ksize; int im_channel = c / ksize / ksize; @@ -32,16 +38,19 @@ __kernel void im2col_pad(__global float *im, int offset, data_col[col_index] = val; } -__kernel void im2col_nopad(__global float *im, int offset, +__global__ void im2col_nopad_kernel(float *im, int offset, int channels, int height, int width, - int ksize, int stride, __global float *data_col) + int ksize, int stride, float *data_col) { int c,h,w; int height_col = (height - ksize) / stride + 1; int width_col = (width - ksize) / stride + 1; int channels_col = channels * ksize * ksize; - int id = get_global_id(0); + int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + int col_size = height_col*width_col*channels_col; + if (id >= col_size) return; + int col_index = id; w = id % width_col; id /= width_col; @@ -50,7 +59,6 @@ __kernel void im2col_nopad(__global float *im, int offset, c = id % channels_col; id /= channels_col; - int col_size = height_col*width_col*channels_col; int w_offset = c % ksize; int h_offset = (c / ksize) % ksize; int im_channel = c / ksize / ksize; @@ -62,3 +70,24 @@ __kernel void im2col_nopad(__global float *im, int offset, data_col[col_index] = val; } + +extern "C" void im2col_ongpu(float *im, int offset, + int channels, int height, int width, + int ksize, int stride, int pad, float *data_col) +{ + + int height_col = (height - ksize) / stride + 1; + int width_col = (width - ksize) / stride + 1; + int channels_col = channels * ksize * ksize; + + if (pad){ + height_col = 1 + (height-1) / stride; + width_col = 1 + (width-1) / stride; + } + + size_t n = channels_col*height_col*width_col; + + if(pad)im2col_pad_kernel<<<cuda_gridsize(n),BLOCK>>>(im, offset, channels, height, width, ksize, stride, data_col); + else im2col_nopad_kernel<<<cuda_gridsize(n),BLOCK>>>(im, offset, channels, height, width, ksize, stride, data_col); + check_error(cudaPeekAtLastError()); +} diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c index c05e939..834ebdb 100644 --- a/src/maxpool_layer.c +++ b/src/maxpool_layer.c @@ -1,4 +1,5 @@ #include "maxpool_layer.h" +#include "cuda.h" #include <stdio.h> image get_maxpool_image(maxpool_layer layer) @@ -32,9 +33,9 @@ maxpool_layer *make_maxpool_layer(int batch, int h, int w, int c, int size, int layer->output = calloc(output_size, sizeof(float)); layer->delta = calloc(output_size, sizeof(float)); #ifdef GPU - layer->indexes_cl = cl_make_int_array(layer->indexes, output_size); - layer->output_cl = cl_make_array(layer->output, output_size); - layer->delta_cl = cl_make_array(layer->delta, output_size); + layer->indexes_gpu = cuda_make_int_array(output_size); + layer->output_gpu = cuda_make_array(layer->output, output_size); + layer->delta_gpu = cuda_make_array(layer->delta, output_size); #endif return layer; } @@ -98,74 +99,3 @@ void backward_maxpool_layer(const maxpool_layer layer, float *delta) } } -#ifdef GPU -cl_kernel get_forward_kernel() -{ - static int init = 0; - static cl_kernel kernel; - if(!init){ - kernel = get_kernel("src/maxpool_layer.cl", "forward", 0); - init = 1; - } - return kernel; -} - -void forward_maxpool_layer_gpu(maxpool_layer layer, cl_mem input) -{ - int h = (layer.h-1)/layer.stride + 1; - int w = (layer.w-1)/layer.stride + 1; - int c = layer.c; - cl_kernel kernel = get_forward_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.h), (void*) &layer.h); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.w), (void*) &layer.w); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.c), (void*) &layer.c); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.stride), (void*) &layer.stride); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.size), (void*) &layer.size); - cl.error = clSetKernelArg(kernel, i++, sizeof(input), (void*) &input); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.output_cl), (void*) &layer.output_cl); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.indexes_cl), (void*) &layer.indexes_cl); - check_error(cl); - - const size_t global_size[] = {h*w*c*layer.batch}; - - cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0); - check_error(cl); -} - -cl_kernel get_backward_kernel() -{ - static int init = 0; - static cl_kernel kernel; - if(!init){ - kernel = get_kernel("src/maxpool_layer.cl", "backward", 0); - init = 1; - } - return kernel; -} - -void backward_maxpool_layer_gpu(maxpool_layer layer, cl_mem delta) -{ - cl_kernel kernel = get_backward_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.h), (void*) &layer.h); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.w), (void*) &layer.w); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.c), (void*) &layer.c); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.stride), (void*) &layer.stride); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.size), (void*) &layer.size); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.delta_cl), (void*) &layer.delta_cl); - cl.error = clSetKernelArg(kernel, i++, sizeof(delta), (void*) &delta); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.indexes_cl), (void*) &layer.indexes_cl); - check_error(cl); - - const size_t global_size[] = {layer.h*layer.w*layer.c*layer.batch}; - - cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0); - check_error(cl); -} - -#endif diff --git a/src/maxpool_layer.h b/src/maxpool_layer.h index dc45c55..516bd31 100644 --- a/src/maxpool_layer.h +++ b/src/maxpool_layer.h @@ -2,7 +2,7 @@ #define MAXPOOL_LAYER_H #include "image.h" -#include "opencl.h" +#include "cuda.h" typedef struct { int batch; @@ -13,9 +13,9 @@ typedef struct { float *delta; float *output; #ifdef GPU - cl_mem indexes_cl; - cl_mem output_cl; - cl_mem delta_cl; + int *indexes_gpu; + float *output_gpu; + float *delta_gpu; #endif } maxpool_layer; @@ -26,8 +26,8 @@ void forward_maxpool_layer(const maxpool_layer layer, float *input); void backward_maxpool_layer(const maxpool_layer layer, float *delta); #ifdef GPU -void forward_maxpool_layer_gpu(maxpool_layer layer, cl_mem input); -void backward_maxpool_layer_gpu(maxpool_layer layer, cl_mem delta); +void forward_maxpool_layer_gpu(maxpool_layer layer, float * input); +void backward_maxpool_layer_gpu(maxpool_layer layer, float * delta); #endif #endif diff --git a/src/maxpool_layer.cl b/src/maxpool_layer_kernels.cu similarity index 57% rename from src/maxpool_layer.cl rename to src/maxpool_layer_kernels.cu index fc793d0..a5c8209 100644 --- a/src/maxpool_layer.cl +++ b/src/maxpool_layer_kernels.cu @@ -1,11 +1,17 @@ +extern "C" { +#include "maxpool_layer.h" +#include "cuda.h" +} -__kernel void forward(int in_h, int in_w, int in_c, int stride, int size, __global float *input, __global float *output, __global int *indexes) +__global__ void forward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride, int size, float *input, float *output, int *indexes) { int h = (in_h-1)/stride + 1; int w = (in_w-1)/stride + 1; int c = in_c; - int id = get_global_id(0); + int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(id >= n) return; + int j = id % w; id /= w; int i = id % h; @@ -37,14 +43,16 @@ __kernel void forward(int in_h, int in_w, int in_c, int stride, int size, __glob indexes[out_index] = max_i; } -__kernel void backward(int in_h, int in_w, int in_c, int stride, int size, __global float *delta, __global float *prev_delta, __global int *indexes) +__global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride, int size, float *delta, float *prev_delta, int *indexes) { int h = (in_h-1)/stride + 1; int w = (in_w-1)/stride + 1; int c = in_c; int area = (size-1)/stride; - int id = get_global_id(0); + int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(id >= n) return; + int index = id; int j = id % in_w; id /= in_w; @@ -71,3 +79,24 @@ __kernel void backward(int in_h, int in_w, int in_c, int stride, int size, __glo } prev_delta[index] = d; } + +extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, float *input) +{ + int h = (layer.h-1)/layer.stride + 1; + int w = (layer.w-1)/layer.stride + 1; + int c = layer.c; + + size_t n = h*w*c*layer.batch; + + forward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, input, layer.output_gpu, layer.indexes_gpu); + check_error(cudaPeekAtLastError()); +} + +extern "C" void backward_maxpool_layer_gpu(maxpool_layer layer, float * delta) +{ + size_t n = layer.h*layer.w*layer.c*layer.batch; + + backward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.delta_gpu, delta, layer.indexes_gpu); + check_error(cudaPeekAtLastError()); +} + diff --git a/src/mini_blas.c b/src/mini_blas.c deleted file mode 100644 index 0c4c37b..0000000 --- a/src/mini_blas.c +++ /dev/null @@ -1,67 +0,0 @@ -#include <stdlib.h> -#include <stdio.h> -#include <math.h> -#include <time.h> -#include <string.h> -#include "mini_blas.h" - -void pm(int M, int N, float *A) -{ - int i,j; - for(i =0 ; i < M; ++i){ - for(j = 0; j < N; ++j){ - printf("%10.6f, ", A[i*N+j]); - } - printf("\n"); - } - printf("\n"); -} - -float *random_matrix(int rows, int cols) -{ - int i; - float *m = calloc(rows*cols, sizeof(float)); - for(i = 0; i < rows*cols; ++i){ - m[i] = (float)rand()/RAND_MAX; - } - return m; -} - -void time_random_matrix(int TA, int TB, int m, int k, int n) -{ - float *a; - if(!TA) a = random_matrix(m,k); - else a = random_matrix(k,m); - int lda = (!TA)?k:m; - float *b; - if(!TB) b = random_matrix(k,n); - else b = random_matrix(n,k); - int ldb = (!TB)?n:k; - - float *c = random_matrix(m,n); - int i; - clock_t start = clock(), end; - for(i = 0; i<10; ++i){ - gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n); - } - end = clock(); - printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC); - free(a); - free(b); - free(c); -} - -void test_blas() -{ - - time_random_matrix(0,0,100,100,100); - time_random_matrix(1,0,100,100,100); - time_random_matrix(0,1,100,100,100); - time_random_matrix(1,1,100,100,100); - - time_random_matrix(0,0,1000,100,100); - time_random_matrix(1,0,1000,100,100); - time_random_matrix(0,1,1000,100,100); - time_random_matrix(1,1,1000,100,100); -} - diff --git a/src/mini_blas.h b/src/mini_blas.h deleted file mode 100644 index ea73ed6..0000000 --- a/src/mini_blas.h +++ /dev/null @@ -1,70 +0,0 @@ -#include "opencl.h" - -void pm(int M, int N, float *A); -void gemm(int TA, int TB, int M, int N, int K, float ALPHA, - float *A, int lda, - float *B, int ldb, - float BETA, - float *C, int ldc); -float *random_matrix(int rows, int cols); -void time_random_matrix(int TA, int TB, int m, int k, int n); - -#ifdef GPU -void axpy_ongpu(int N, float ALPHA, cl_mem X, int INCX, cl_mem Y, int INCY); -void axpy_ongpu_offset(int N, float ALPHA, cl_mem X, int OFFX, int INCX, cl_mem Y, int OFFY, int INCY); -void copy_ongpu(int N, cl_mem X, int INCX, cl_mem Y, int INCY); -void copy_ongpu_offset(int N, cl_mem X, int OFFX, int INCX, cl_mem Y, int OFFY, int INCY); -void scal_ongpu(int N, float ALPHA, cl_mem X, int INCX); -void im2col_ongpu(cl_mem data_im, int offset, - int channels, int height, int width, - int ksize, int stride, int pad, cl_mem data_col); - -void col2im_gpu(float *data_col, int offset, - int channels, int height, int width, - int ksize, int stride, int pad, float *data_im); -void col2im_ongpu(cl_mem data_col, int batch, - int channels, int height, int width, - int ksize, int stride, int pad, cl_mem data_im); - -void im2col_gpu(float *data_im, - int channels, int height, int width, - int ksize, int stride, int pad, float *data_col); - -void gemm_ongpu_offset(int TA, int TB, int M, int N, int K, float ALPHA, - cl_mem A_gpu, int a_off, int lda, - cl_mem B_gpu, int b_off, int ldb, - float BETA, - cl_mem C_gpu, int c_off, int ldc); - -void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, - cl_mem A_gpu, int lda, - cl_mem B_gpu, int ldb, - float BETA, - cl_mem C_gpu, int ldc); -#endif - -void im2col_cpu(float* data_im, - int channels, int height, int width, - int ksize, int stride, int pad, float* data_col); - -void col2im_cpu(float* data_col, - int channels, int height, int width, - int ksize, int stride, int pad, float* data_im); - -void test_blas(); - -void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, - float *A, int lda, - float *B, int ldb, - float BETA, - float *C, int ldc); -void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA, - float *A, int lda, - float *B, int ldb, - float BETA, - float *C, int ldc); -void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY); -void copy_cpu(int N, float *X, int INCX, float *Y, int INCY); -void scal_cpu(int N, float ALPHA, float *X, int INCX); -float dot_cpu(int N, float *X, int INCX, float *Y, int INCY); -void test_gpu_blas(); diff --git a/src/network.c b/src/network.c index 641d782..eb39054 100644 --- a/src/network.c +++ b/src/network.c @@ -53,9 +53,10 @@ network make_network(int n, int batch) net.types = calloc(net.n, sizeof(LAYER_TYPE)); net.outputs = 0; net.output = 0; + net.seen = 0; #ifdef GPU - net.input_cl = calloc(1, sizeof(cl_mem)); - net.truth_cl = calloc(1, sizeof(cl_mem)); + net.input_gpu = calloc(1, sizeof(float *)); + net.truth_gpu = calloc(1, sizeof(float *)); #endif return net; } @@ -107,9 +108,12 @@ void forward_network(network net, float *input, float *truth, int train) } else if(net.types[i] == FREEWEIGHT){ if(!train) continue; - freeweight_layer layer = *(freeweight_layer *)net.layers[i]; - forward_freeweight_layer(layer, input); + //freeweight_layer layer = *(freeweight_layer *)net.layers[i]; + //forward_freeweight_layer(layer, input); } + //char buff[256]; + //sprintf(buff, "layer %d", i); + //cuda_compare(get_network_output_gpu_layer(net, i), input, get_network_output_size_layer(net, i)*net.batch, buff); } } @@ -582,7 +586,7 @@ void top_predictions(network net, int k, int *index) float *network_predict(network net, float *input) { #ifdef GPU - if(gpu_index >= 0) return network_predict_gpu(net, input); + if(gpu_index >= 0) return network_predict_gpu(net, input); #endif forward_network(net, input, 0, 0); diff --git a/src/network.h b/src/network.h index c6c7790..d1f8638 100644 --- a/src/network.h +++ b/src/network.h @@ -2,7 +2,6 @@ #ifndef NETWORK_H #define NETWORK_H -#include "opencl.h" #include "image.h" #include "data.h" @@ -21,6 +20,7 @@ typedef enum { typedef struct { int n; int batch; + int seen; float learning_rate; float momentum; float decay; @@ -30,14 +30,16 @@ typedef struct { float *output; #ifdef GPU - cl_mem *input_cl; - cl_mem *truth_cl; + float **input_gpu; + float **truth_gpu; #endif } network; #ifdef GPU float train_network_datum_gpu(network net, float *x, float *y); float *network_predict_gpu(network net, float *input); +float * get_network_output_gpu_layer(network net, int i); +float * get_network_delta_gpu_layer(network net, int i); #endif void compare_networks(network n1, network n2, data d); diff --git a/src/network_gpu.c b/src/network_kernels.cu similarity index 78% rename from src/network_gpu.c rename to src/network_kernels.cu index c958056..a009174 100644 --- a/src/network_gpu.c +++ b/src/network_kernels.cu @@ -1,3 +1,4 @@ +extern "C" { #include <stdio.h> #include <time.h> @@ -15,12 +16,12 @@ #include "freeweight_layer.h" #include "softmax_layer.h" #include "dropout_layer.h" +} -#ifdef GPU -cl_mem get_network_output_cl_layer(network net, int i); -cl_mem get_network_delta_cl_layer(network net, int i); +extern "C" float * get_network_output_gpu_layer(network net, int i); +extern "C" float * get_network_delta_gpu_layer(network net, int i); -void forward_network_gpu(network net, cl_mem input, cl_mem truth, int train) +void forward_network_gpu(network net, float * input, float * truth, int train) { int i; for(i = 0; i < net.n; ++i){ @@ -28,7 +29,7 @@ void forward_network_gpu(network net, cl_mem input, cl_mem truth, int train) if(net.types[i] == CONVOLUTIONAL){ convolutional_layer layer = *(convolutional_layer *)net.layers[i]; forward_convolutional_layer_gpu(layer, input); - input = layer.output_cl; + input = layer.output_gpu; } else if(net.types[i] == COST){ cost_layer layer = *(cost_layer *)net.layers[i]; @@ -37,47 +38,46 @@ void forward_network_gpu(network net, cl_mem input, cl_mem truth, int train) else if(net.types[i] == CONNECTED){ connected_layer layer = *(connected_layer *)net.layers[i]; forward_connected_layer_gpu(layer, input); - input = layer.output_cl; + input = layer.output_gpu; } else if(net.types[i] == MAXPOOL){ maxpool_layer layer = *(maxpool_layer *)net.layers[i]; forward_maxpool_layer_gpu(layer, input); - input = layer.output_cl; + input = layer.output_gpu; } else if(net.types[i] == SOFTMAX){ softmax_layer layer = *(softmax_layer *)net.layers[i]; forward_softmax_layer_gpu(layer, input); - input = layer.output_cl; + input = layer.output_gpu; } else if(net.types[i] == DROPOUT){ if(!train) continue; dropout_layer layer = *(dropout_layer *)net.layers[i]; forward_dropout_layer_gpu(layer, input); - input = layer.output_cl; + input = layer.output_gpu; } else if(net.types[i] == CROP){ crop_layer layer = *(crop_layer *)net.layers[i]; forward_crop_layer_gpu(layer, input); - input = layer.output_cl; + input = layer.output_gpu; } - check_error(cl); //printf("Forward %d %s %f\n", i, get_layer_string(net.types[i]), sec(clock() - time)); } } -void backward_network_gpu(network net, cl_mem input) +void backward_network_gpu(network net, float * input) { int i; - cl_mem prev_input; - cl_mem prev_delta; + float * prev_input; + float * prev_delta; for(i = net.n-1; i >= 0; --i){ //clock_t time = clock(); if(i == 0){ prev_input = input; prev_delta = 0; }else{ - prev_input = get_network_output_cl_layer(net, i-1); - prev_delta = get_network_delta_cl_layer(net, i-1); + prev_input = get_network_output_gpu_layer(net, i-1); + prev_delta = get_network_delta_gpu_layer(net, i-1); } if(net.types[i] == CONVOLUTIONAL){ convolutional_layer layer = *(convolutional_layer *)net.layers[i]; @@ -103,7 +103,6 @@ void backward_network_gpu(network net, cl_mem input) softmax_layer layer = *(softmax_layer *)net.layers[i]; backward_softmax_layer_gpu(layer, prev_delta); } - check_error(cl); //printf("Backward %d %s %f\n", i, get_layer_string(net.types[i]), sec(clock() - time)); } } @@ -123,54 +122,54 @@ void update_network_gpu(network net) } } -cl_mem get_network_output_cl_layer(network net, int i) +float * get_network_output_gpu_layer(network net, int i) { if(net.types[i] == CONVOLUTIONAL){ convolutional_layer layer = *(convolutional_layer *)net.layers[i]; - return layer.output_cl; + return layer.output_gpu; } else if(net.types[i] == CONNECTED){ connected_layer layer = *(connected_layer *)net.layers[i]; - return layer.output_cl; + return layer.output_gpu; } else if(net.types[i] == MAXPOOL){ maxpool_layer layer = *(maxpool_layer *)net.layers[i]; - return layer.output_cl; + return layer.output_gpu; } else if(net.types[i] == CROP){ crop_layer layer = *(crop_layer *)net.layers[i]; - return layer.output_cl; + return layer.output_gpu; } else if(net.types[i] == SOFTMAX){ softmax_layer layer = *(softmax_layer *)net.layers[i]; - return layer.output_cl; + return layer.output_gpu; } else if(net.types[i] == DROPOUT){ dropout_layer layer = *(dropout_layer *)net.layers[i]; - return layer.output_cl; + return layer.output_gpu; } return 0; } -cl_mem get_network_delta_cl_layer(network net, int i) +float * get_network_delta_gpu_layer(network net, int i) { if(net.types[i] == CONVOLUTIONAL){ convolutional_layer layer = *(convolutional_layer *)net.layers[i]; - return layer.delta_cl; + return layer.delta_gpu; } else if(net.types[i] == CONNECTED){ connected_layer layer = *(connected_layer *)net.layers[i]; - return layer.delta_cl; + return layer.delta_gpu; } else if(net.types[i] == MAXPOOL){ maxpool_layer layer = *(maxpool_layer *)net.layers[i]; - return layer.delta_cl; + return layer.delta_gpu; } else if(net.types[i] == SOFTMAX){ softmax_layer layer = *(softmax_layer *)net.layers[i]; - return layer.delta_cl; + return layer.delta_gpu; } else if(net.types[i] == DROPOUT){ if(i == 0) return 0; - return get_network_delta_cl_layer(net, i-1); + return get_network_delta_gpu_layer(net, i-1); } return 0; } @@ -179,15 +178,15 @@ float train_network_datum_gpu(network net, float *x, float *y) { int x_size = get_network_input_size(net)*net.batch; int y_size = get_network_output_size(net)*net.batch; - if(!*net.input_cl){ - *net.input_cl = cl_make_array(x, x_size); - *net.truth_cl = cl_make_array(y, y_size); + if(!*net.input_gpu){ + *net.input_gpu = cuda_make_array(x, x_size); + *net.truth_gpu = cuda_make_array(y, y_size); }else{ - cl_write_array(*net.input_cl, x, x_size); - cl_write_array(*net.truth_cl, y, y_size); + cuda_push_array(*net.input_gpu, x, x_size); + cuda_push_array(*net.truth_gpu, y, y_size); } - forward_network_gpu(net, *net.input_cl, *net.truth_cl, 1); - backward_network_gpu(net, *net.input_cl); + forward_network_gpu(net, *net.input_gpu, *net.truth_gpu, 1); + backward_network_gpu(net, *net.input_gpu); update_network_gpu(net); float error = get_network_cost(net); return error; @@ -201,7 +200,7 @@ float *get_network_output_layer_gpu(network net, int i) } else if(net.types[i] == CONNECTED){ connected_layer layer = *(connected_layer *)net.layers[i]; - cl_read_array(layer.output_cl, layer.output, layer.outputs*layer.batch); + cuda_pull_array(layer.output_gpu, layer.output, layer.outputs*layer.batch); return layer.output; } else if(net.types[i] == MAXPOOL){ @@ -227,11 +226,10 @@ float *network_predict_gpu(network net, float *input) { int size = get_network_input_size(net) * net.batch; - cl_mem input_cl = cl_make_array(input, size); - forward_network_gpu(net, input_cl, 0, 0); + float * input_gpu = cuda_make_array(input, size); + forward_network_gpu(net, input_gpu, 0, 0); float *out = get_network_output_gpu(net); - clReleaseMemObject(input_cl); + cuda_free(input_gpu); return out; } -#endif diff --git a/src/opencl.c b/src/opencl.c deleted file mode 100644 index 8b693db..0000000 --- a/src/opencl.c +++ /dev/null @@ -1,222 +0,0 @@ -int gpu_index; -#ifdef GPU -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <time.h> -#include <unistd.h> - -#ifdef CLBLAS -#include <clBLAS.h> -#endif - -#include "opencl.h" -#include "utils.h" -#include "activations.h" - -cl_info cl = {0}; - -void check_error(cl_info info) -{ - clFinish(cl.queue); - if (info.error != CL_SUCCESS) { - printf("\n Error number %d", info.error); - abort(); - exit(1); - } -} - -#define MAX_DEVICES 10 - -cl_info cl_init(int index) -{ - cl_info info; - info.initialized = 0; - if(index < 0) error("Won't initialize negative gpu id\n"); - cl_uint num_platforms, num_devices; - // Fetch the Platform and Device IDs; we only want one. - cl_device_id devices[MAX_DEVICES]; - info.error=clGetPlatformIDs(1, &info.platform, &num_platforms); - check_error(info); - - info.error=clGetDeviceIDs(info.platform, CL_DEVICE_TYPE_ALL, MAX_DEVICES, devices, &num_devices); - check_error(info); - - index = index%num_devices; - info.device = devices[index]; - check_error(info); - - cl_context_properties properties[]={ - CL_CONTEXT_PLATFORM, (cl_context_properties)info.platform, 0}; - - // Note that nVidia's OpenCL requires the platform property - info.context=clCreateContext(properties, 1, &info.device, 0, 0, &info.error); - check_error(info); - - info.queue = clCreateCommandQueue(info.context, info.device, 0, &info.error); - check_error(info); -#ifdef CLBLAS - info.error = clblasSetup(); -#endif - check_error(info); - info.initialized = 1; - -#ifdef VERBOSE - printf("=== %d OpenCL platform(s) found: ===\n", num_platforms); - char buffer[10240]; - clGetPlatformInfo(info.platform, CL_PLATFORM_PROFILE, 10240, buffer, NULL); - printf(" PROFILE = %s\n", buffer); - clGetPlatformInfo(info.platform, CL_PLATFORM_VERSION, 10240, buffer, NULL); - printf(" VERSION = %s\n", buffer); - clGetPlatformInfo(info.platform, CL_PLATFORM_NAME, 10240, buffer, NULL); - printf(" NAME = %s\n", buffer); - clGetPlatformInfo(info.platform, CL_PLATFORM_VENDOR, 10240, buffer, NULL); - printf(" VENDOR = %s\n", buffer); - clGetPlatformInfo(info.platform, CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL); - printf(" EXTENSIONS = %s\n", buffer); - check_error(info); - - if(num_devices > MAX_DEVICES) num_devices = MAX_DEVICES; - printf("=== %d OpenCL device(s) found on platform:\n", num_devices); - int i; - for (i=0; i<num_devices; i++){ - char buffer[10240]; - cl_uint buf_uint; - cl_ulong buf_ulong; - printf(" -- %d --\n", i); - clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL); - printf(" DEVICE_NAME = %s\n", buffer); - clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL); - printf(" DEVICE_VENDOR = %s\n", buffer); - clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL); - printf(" DEVICE_VERSION = %s\n", buffer); - clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL); - printf(" DRIVER_VERSION = %s\n", buffer); - clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL); - printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); - clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL); - printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); - clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL); - printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); - clGetDeviceInfo(devices[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(buf_ulong), &buf_ulong, NULL); - printf(" DEVICE_MAX_MEM_ALLOC_SIZE = %llu\n", (unsigned long long)buf_ulong); - clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(buf_ulong), &buf_ulong, NULL); - printf(" DEVICE_MAX_WORK_GROUP_SIZE = %llu\n", (unsigned long long)buf_ulong); - cl_uint items; - clGetDeviceInfo( devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), - &items, NULL); - printf(" DEVICE_MAX_WORK_ITEM_DIMENSIONS = %u\n", (unsigned int)items); - size_t workitem_size[10]; - clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, 10*sizeof(workitem_size), workitem_size, NULL); - printf(" DEVICE_MAX_WORK_ITEM_SIZES = %u / %u / %u \n", (unsigned int)workitem_size[0], (unsigned int)workitem_size[1], (unsigned int)workitem_size[2]); - printf("%d devices, %d index\n", num_devices, index); - - } -#endif - return info; -} - -cl_program cl_fprog(char *filename, char *options, cl_info info) -{ - size_t srcsize; - char src[64*1024]; - memset(src, 0, 64*1024); - FILE *fil=fopen(filename,"r"); - if(fil == 0) file_error(filename); - srcsize=fread(src, sizeof src, 1, fil); - fclose(fil); - const char *srcptr[]={src}; - // Submit the source code of the example kernel to OpenCL - cl_program prog=clCreateProgramWithSource(info.context,1, srcptr, &srcsize, &info.error); - check_error(info); - char build_c[1024*64]; - // and compile it (after this we could extract the compiled version) - info.error=clBuildProgram(prog, 0, 0, options, 0, 0); - //if ( info.error != CL_SUCCESS ) { - fprintf(stderr, "Error Building Program: %d\n", info.error); - clGetProgramBuildInfo( prog, info.device, CL_PROGRAM_BUILD_LOG, 1024*64, build_c, 0); - fprintf(stderr, "Build Log for %s program:\n%s\n", filename, build_c); - //} - check_error(info); - return prog; -} - -void cl_setup() -{ - if(!cl.initialized){ - fprintf(stderr, "Initializing OpenCL\n"); - cl = cl_init(gpu_index); - } -} - -cl_kernel get_kernel(char *filename, char *kernelname, char *options) -{ - cl_program prog = cl_fprog(filename, options, cl); - cl_kernel kernel=clCreateKernel(prog, kernelname, &cl.error); - check_error(cl); - return kernel; -} - -void cl_read_array(cl_mem mem, float *x, int n) -{ - if(gpu_index < 0) return; - cl.error = clEnqueueReadBuffer(cl.queue, mem, CL_TRUE, 0, sizeof(float)*n,x,0,0,0); - check_error(cl); -} - -float cl_checksum(cl_mem mem, int n) -{ - - float *x = calloc(n, sizeof(float)); - cl_read_array(mem, x, n); - float sum = sum_array(x, n); - free(x); - return sum; -} - -void cl_write_array(cl_mem mem, float *x, int n) -{ - if(gpu_index < 0) return; - cl.error = clEnqueueWriteBuffer(cl.queue, mem, CL_TRUE, 0,sizeof(float)*n,x,0,0,0); - check_error(cl); -} - -void cl_copy_array(cl_mem src, cl_mem dst, int n) -{ - cl.error = clEnqueueCopyBuffer(cl.queue, src, dst, 0, 0, sizeof(float)*n,0,0,0); - check_error(cl); -} - -cl_mem cl_sub_array(cl_mem src, int offset, int size) -{ - cl_buffer_region r; - r.origin = offset*sizeof(float); - r.size = size*sizeof(float); - cl_mem sub = clCreateSubBuffer(src, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &r, &cl.error); - check_error(cl); - return sub; -} - - -cl_mem cl_make_array(float *x, int n) -{ - if(gpu_index < 0) return 0; - cl_mem mem = clCreateBuffer(cl.context, - CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, - sizeof(float)*n, x, &cl.error); - check_error(cl); - //activate_array_ongpu(mem, n, LINEAR); - return mem; -} - -cl_mem cl_make_int_array(int *x, int n) -{ - if(gpu_index < 0) return 0; - cl_mem mem = clCreateBuffer(cl.context, - CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, - sizeof(int)*n, x, &cl.error); - check_error(cl); - return mem; -} - -#endif diff --git a/src/opencl.h b/src/opencl.h deleted file mode 100644 index ceb6e56..0000000 --- a/src/opencl.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef OPENCL_H -#define OPENCL_H -extern int gpu_index; -#ifdef GPU -#ifdef __APPLE__ -#include <OpenCL/opencl.h> -#else -#include <CL/cl.h> -#endif - - -typedef struct { - int initialized; - cl_int error; - cl_platform_id platform; - cl_device_id device; - cl_context context; - cl_command_queue queue; -}cl_info; - -extern cl_info cl; - -void cl_setup(); -void check_error(cl_info info); -cl_kernel get_kernel(char *filename, char *kernelname, char *options); -void cl_read_array(cl_mem mem, float *x, int n); -void cl_write_array(cl_mem mem, float *x, int n); -cl_mem cl_make_array(float *x, int n); -cl_mem cl_make_int_array(int *x, int n); -void cl_copy_array(cl_mem src, cl_mem dst, int n); -cl_mem cl_sub_array(cl_mem src, int offset, int size); -float cl_checksum(cl_mem mem, int n); -#endif -#endif diff --git a/src/parser.c b/src/parser.c index 768f48b..a00feec 100644 --- a/src/parser.c +++ b/src/parser.c @@ -16,7 +16,6 @@ #include "list.h" #include "option_list.h" #include "utils.h" -#include "opencl.h" typedef struct{ char *type; @@ -87,6 +86,7 @@ convolutional_layer *parse_convolutional(list *options, network *net, int count) net->learning_rate = learning_rate; net->momentum = momentum; net->decay = decay; + net->seen = option_find_int(options, "seen",0); }else{ learning_rate = option_find_float_quiet(options, "learning_rate", net->learning_rate); momentum = option_find_float_quiet(options, "momentum", net->momentum); @@ -149,6 +149,7 @@ softmax_layer *parse_softmax(list *options, network *net, int count) if(count == 0){ input = option_find_int(options, "input",1); net->batch = option_find_int(options, "batch",1); + net->seen = option_find_int(options, "seen",0); }else{ input = get_network_output_size_layer(*net, count-1); } @@ -163,6 +164,7 @@ cost_layer *parse_cost(list *options, network *net, int count) if(count == 0){ input = option_find_int(options, "input",1); net->batch = option_find_int(options, "batch",1); + net->seen = option_find_int(options, "seen",0); }else{ input = get_network_output_size_layer(*net, count-1); } @@ -191,6 +193,7 @@ crop_layer *parse_crop(list *options, network *net, int count) net->learning_rate = learning_rate; net->momentum = momentum; net->decay = decay; + net->seen = option_find_int(options, "seen",0); }else{ image m = get_network_image_layer(*net, count-1); h = m.h; @@ -213,6 +216,7 @@ maxpool_layer *parse_maxpool(list *options, network *net, int count) w = option_find_int(options, "width",1); c = option_find_int(options, "channels",1); net->batch = option_find_int(options, "batch",1); + net->seen = option_find_int(options, "seen",0); }else{ image m = get_network_image_layer(*net, count-1); h = m.h; @@ -225,6 +229,7 @@ maxpool_layer *parse_maxpool(list *options, network *net, int count) return layer; } +/* freeweight_layer *parse_freeweight(list *options, network *net, int count) { int input; @@ -238,6 +243,7 @@ freeweight_layer *parse_freeweight(list *options, network *net, int count) option_unused(options); return layer; } +*/ dropout_layer *parse_dropout(list *options, network *net, int count) { @@ -252,6 +258,7 @@ dropout_layer *parse_dropout(list *options, network *net, int count) net->learning_rate = learning_rate; net->momentum = momentum; net->decay = decay; + net->seen = option_find_int(options, "seen",0); }else{ input = get_network_output_size_layer(*net, count-1); } @@ -272,6 +279,7 @@ normalization_layer *parse_normalization(list *options, network *net, int count) w = option_find_int(options, "width",1); c = option_find_int(options, "channels",1); net->batch = option_find_int(options, "batch",1); + net->seen = option_find_int(options, "seen",0); }else{ image m = get_network_image_layer(*net, count-1); h = m.h; @@ -327,9 +335,10 @@ network parse_network_cfg(char *filename) net.types[count] = DROPOUT; net.layers[count] = layer; }else if(is_freeweight(s)){ - freeweight_layer *layer = parse_freeweight(options, &net, count); - net.types[count] = FREEWEIGHT; - net.layers[count] = layer; + //freeweight_layer *layer = parse_freeweight(options, &net, count); + //net.types[count] = FREEWEIGHT; + //net.layers[count] = layer; + fprintf(stderr, "Type not recognized: %s\n", s->type); }else{ fprintf(stderr, "Type not recognized: %s\n", s->type); } @@ -442,7 +451,7 @@ list *read_cfg(char *filename) void print_convolutional_cfg(FILE *fp, convolutional_layer *l, network net, int count) { #ifdef GPU - if(gpu_index >= 0) pull_convolutional_layer(*l); + if(gpu_index >= 0) pull_convolutional_layer(*l); #endif int i; fprintf(fp, "[convolutional]\n"); @@ -453,8 +462,9 @@ void print_convolutional_cfg(FILE *fp, convolutional_layer *l, network net, int "channels=%d\n" "learning_rate=%g\n" "momentum=%g\n" - "decay=%g\n", - l->batch,l->h, l->w, l->c, l->learning_rate, l->momentum, l->decay); + "decay=%g\n" + "seen=%d\n", + l->batch,l->h, l->w, l->c, l->learning_rate, l->momentum, l->decay, net.seen); } else { if(l->learning_rate != net.learning_rate) fprintf(fp, "learning_rate=%g\n", l->learning_rate); @@ -508,8 +518,9 @@ void print_connected_cfg(FILE *fp, connected_layer *l, network net, int count) "input=%d\n" "learning_rate=%g\n" "momentum=%g\n" - "decay=%g\n", - l->batch, l->inputs, l->learning_rate, l->momentum, l->decay); + "decay=%g\n" + "seen=%d\n", + l->batch, l->inputs, l->learning_rate, l->momentum, l->decay, net.seen); } else { if(l->learning_rate != net.learning_rate) fprintf(fp, "learning_rate=%g\n", l->learning_rate); @@ -540,8 +551,9 @@ void print_crop_cfg(FILE *fp, crop_layer *l, network net, int count) "channels=%d\n" "learning_rate=%g\n" "momentum=%g\n" - "decay=%g\n", - l->batch,l->h, l->w, l->c, net.learning_rate, net.momentum, net.decay); + "decay=%g\n" + "seen=%d\n", + l->batch,l->h, l->w, l->c, net.learning_rate, net.momentum, net.decay, net.seen); } fprintf(fp, "crop_height=%d\ncrop_width=%d\nflip=%d\n\n", l->crop_height, l->crop_width, l->flip); } diff --git a/src/softmax_layer.c b/src/softmax_layer.c index ffc028f..aa5ab06 100644 --- a/src/softmax_layer.c +++ b/src/softmax_layer.c @@ -1,5 +1,6 @@ #include "softmax_layer.h" -#include "mini_blas.h" +#include "blas.h" +#include "cuda.h" #include <float.h> #include <math.h> #include <stdlib.h> @@ -15,8 +16,8 @@ softmax_layer *make_softmax_layer(int batch, int inputs) layer->delta = calloc(inputs*batch, sizeof(float)); layer->jacobian = calloc(inputs*inputs*batch, sizeof(float)); #ifdef GPU - layer->output_cl = cl_make_array(layer->output, inputs*batch); - layer->delta_cl = cl_make_array(layer->delta, inputs*batch); + layer->output_gpu = cuda_make_array(layer->output, inputs*batch); + layer->delta_gpu = cuda_make_array(layer->delta, inputs*batch); #endif return layer; } @@ -49,71 +50,3 @@ void backward_softmax_layer(const softmax_layer layer, float *delta) } } -#ifdef GPU - -void pull_softmax_layer_output(const softmax_layer layer) -{ - cl_read_array(layer.output_cl, layer.output, layer.inputs*layer.batch); -} - -cl_kernel get_softmax_forward_kernel() -{ - static int init = 0; - static cl_kernel kernel; - if(!init){ - kernel = get_kernel("src/softmax_layer.cl", "forward", 0); - init = 1; - } - return kernel; -} - -void forward_softmax_layer_gpu(const softmax_layer layer, cl_mem input) -{ - cl_kernel kernel = get_softmax_forward_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.inputs), (void*) &layer.inputs); - cl.error = clSetKernelArg(kernel, i++, sizeof(input), (void*) &input); - cl.error = clSetKernelArg(kernel, i++, sizeof(layer.output_cl), (void*) &layer.output_cl); - check_error(cl); - - const size_t global_size[] = {layer.batch}; - - cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0); - check_error(cl); - - /* - cl_read_array(layer.output_cl, layer.output, layer.inputs*layer.batch); - int z; - for(z = 0; z < layer.inputs*layer.batch; ++z) printf("%f,",layer.output[z]); - */ -} - -void backward_softmax_layer_gpu(const softmax_layer layer, cl_mem delta) -{ - copy_ongpu(layer.batch*layer.inputs, layer.delta_cl, 1, delta, 1); -} -#endif - -/* This is if you want softmax w/o log-loss classification. You probably don't. - int i,j,b; - for(b = 0; b < layer.batch; ++b){ - for(i = 0; i < layer.inputs; ++i){ - for(j = 0; j < layer.inputs; ++j){ - int d = (i==j); - layer.jacobian[b*layer.inputs*layer.inputs + i*layer.inputs + j] = - layer.output[b*layer.inputs + i] * (d - layer.output[b*layer.inputs + j]); - } - } - } - for(b = 0; b < layer.batch; ++b){ - int M = layer.inputs; - int N = 1; - int K = layer.inputs; - float *A = layer.jacobian + b*layer.inputs*layer.inputs; - float *B = layer.delta + b*layer.inputs; - float *C = delta + b*layer.inputs; - gemm(0,0,M,N,K,1,A,K,B,N,0,C,N); - } - */ diff --git a/src/softmax_layer.cl b/src/softmax_layer.cl deleted file mode 100644 index 77da521..0000000 --- a/src/softmax_layer.cl +++ /dev/null @@ -1,21 +0,0 @@ - -__kernel void forward(int n, __global float *input, __global float *output) -{ - int b = get_global_id(0); - - int i; - float sum = 0; - float largest = -INFINITY; - for(i = 0; i < n; ++i){ - int val = input[i+b*n]; - largest = (val>largest) ? val : largest; - } - for(i = 0; i < n; ++i){ - sum += exp(input[i+b*n]-largest); - } - sum = (sum != 0) ? largest+log(sum) : largest-100; - for(i = 0; i < n; ++i){ - output[i+b*n] = exp(input[i+b*n]-sum); - } -} - diff --git a/src/softmax_layer.h b/src/softmax_layer.h index c8ebddf..0cc9d53 100644 --- a/src/softmax_layer.h +++ b/src/softmax_layer.h @@ -1,8 +1,6 @@ #ifndef SOFTMAX_LAYER_H #define SOFTMAX_LAYER_H -#include "opencl.h" - typedef struct { int inputs; int batch; @@ -10,8 +8,8 @@ typedef struct { float *output; float *jacobian; #ifdef GPU - cl_mem delta_cl; - cl_mem output_cl; + float * delta_gpu; + float * output_gpu; #endif } softmax_layer; @@ -21,8 +19,8 @@ void backward_softmax_layer(const softmax_layer layer, float *delta); #ifdef GPU void pull_softmax_layer_output(const softmax_layer layer); -void forward_softmax_layer_gpu(const softmax_layer layer, cl_mem input); -void backward_softmax_layer_gpu(const softmax_layer layer, cl_mem delta); +void forward_softmax_layer_gpu(const softmax_layer layer, float *input); +void backward_softmax_layer_gpu(const softmax_layer layer, float *delta); #endif #endif diff --git a/src/softmax_layer_kernels.cu b/src/softmax_layer_kernels.cu new file mode 100644 index 0000000..61dc607 --- /dev/null +++ b/src/softmax_layer_kernels.cu @@ -0,0 +1,72 @@ +extern "C" { +#include "softmax_layer.h" +#include "cuda.h" +#include "blas.h" +} + +#define BLOCK 256 + +__global__ void forward_softmax_layer_kernel(int n, int batch, float *input, float *output) +{ + int b = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(b >= batch) return; + + int i; + float sum = 0; + float largest = -INFINITY; + for(i = 0; i < n; ++i){ + int val = input[i+b*n]; + largest = (val>largest) ? val : largest; + } + for(i = 0; i < n; ++i){ + sum += exp(input[i+b*n]-largest); + } + sum = (sum != 0) ? largest+log(sum) : largest-100; + for(i = 0; i < n; ++i){ + output[i+b*n] = exp(input[i+b*n]-sum); + } +} + +extern "C" void pull_softmax_layer_output(const softmax_layer layer) +{ + cuda_pull_array(layer.output_gpu, layer.output, layer.inputs*layer.batch); +} + +extern "C" void forward_softmax_layer_gpu(const softmax_layer layer, float *input) +{ + forward_softmax_layer_kernel<<<cuda_gridsize(layer.batch), BLOCK>>>(layer.inputs, layer.batch, input, layer.output_gpu); + check_error(cudaPeekAtLastError()); + + /* + cl_read_array(layer.output_cl, layer.output, layer.inputs*layer.batch); + int z; + for(z = 0; z < layer.inputs*layer.batch; ++z) printf("%f,",layer.output[z]); + */ +} + +extern "C" void backward_softmax_layer_gpu(const softmax_layer layer, float *delta) +{ + copy_ongpu(layer.batch*layer.inputs, layer.delta_gpu, 1, delta, 1); +} + +/* This is if you want softmax w/o log-loss classification. You probably don't. + int i,j,b; + for(b = 0; b < layer.batch; ++b){ + for(i = 0; i < layer.inputs; ++i){ + for(j = 0; j < layer.inputs; ++j){ + int d = (i==j); + layer.jacobian[b*layer.inputs*layer.inputs + i*layer.inputs + j] = + layer.output[b*layer.inputs + i] * (d - layer.output[b*layer.inputs + j]); + } + } + } + for(b = 0; b < layer.batch; ++b){ + int M = layer.inputs; + int N = 1; + int K = layer.inputs; + float *A = layer.jacobian + b*layer.inputs*layer.inputs; + float *B = layer.delta + b*layer.inputs; + float *C = delta + b*layer.inputs; + gemm(0,0,M,N,K,1,A,K,B,N,0,C,N); + } + */ diff --git a/src/utils.c b/src/utils.c index 28be2ec..a4071e2 100644 --- a/src/utils.c +++ b/src/utils.c @@ -7,6 +7,19 @@ #include "utils.h" +void pm(int M, int N, float *A) +{ + int i,j; + for(i =0 ; i < M; ++i){ + for(j = 0; j < N; ++j){ + printf("%10.6f, ", A[i*N+j]); + } + printf("\n"); + } + printf("\n"); +} + + char *find_replace(char *str, char *orig, char *rep) { static char buffer[4096]; @@ -44,10 +57,9 @@ void top_k(float *a, int n, int k, int *index) } } -void error(char *s) +void error(const char *s) { perror(s); - //fprintf(stderr, "Error: %s\n", s); exit(0); } diff --git a/src/utils.h b/src/utils.h index 5ddc538..ee26d35 100644 --- a/src/utils.h +++ b/src/utils.h @@ -5,7 +5,7 @@ #include "list.h" char *find_replace(char *str, char *orig, char *rep); -void error(char *s); +void error(const char *s); void malloc_error(); void file_error(char *s); void strip(char *s); -- GitLab