Commit 5ef74c20 authored by Joseph Redmon

Slowly refactoring and pushing to GPU

parent 00d48369
Makefile

CC=gcc
GPU=1
-COMMON=-Wall `pkg-config --cflags opencv` -I/usr/local/cuda/include/
+COMMON=-Wall -Werror -Wfatal-errors `pkg-config --cflags opencv` -I/usr/local/cuda/include/
ifeq ($(GPU), 1)
COMMON+=-DGPU
else
endif
UNAME = $(shell uname)
-OPTS=-O3
+OPTS=-O3 -flto
ifeq ($(UNAME), Darwin)
COMMON+= -isystem /usr/local/Cellar/opencv/2.4.6.1/include/opencv -isystem /usr/local/Cellar/opencv/2.4.6.1/include
ifeq ($(GPU), 1)
LDFLAGS= -framework OpenCL
endif
else
OPTS+= -march=native
ifeq ($(GPU), 1)
LDFLAGS= -lOpenCL
endif
endif
CFLAGS= $(COMMON) $(OPTS)
-CFLAGS= $(COMMON) -O0 -g
+#CFLAGS= $(COMMON) -O0 -g
LDFLAGS+=`pkg-config --libs opencv` -lm
VPATH=./src/
EXEC=cnn
-OBJ=network.o image.o tests.o connected_layer.o maxpool_layer.o activations.o list.o option_list.o parser.o utils.o data.o matrix.o softmax_layer.o mini_blas.o convolutional_layer.o cpu_gemm.o normalization_layer.o
-ifeq ($(GPU), 1)
-OBJ+=gpu_gemm.o opencl.o
-endif
+OBJ=network.o image.o tests.o connected_layer.o maxpool_layer.o activations.o list.o option_list.o parser.o utils.o data.o matrix.o softmax_layer.o mini_blas.o convolutional_layer.o gemm.o normalization_layer.o opencl.o im2col.o col2im.o axpy.o
all: $(EXEC)
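
Note: GPU is hard-coded to 1 at the top of this Makefile, but a command-line assignment overrides a makefile assignment, so "make GPU=0" still produces a CPU-only build: COMMON then omits -DGPU and the OpenCL link flags are skipped.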

src/convolutional_layer.c @@ -100,7 +100,7 @@ void forward_convolutional_layer(const convolutional_layer layer, float *in)
float *b = layer.col_image;
float *c = layer.output;
for(i = 0; i < layer.batch; ++i){
-im2col_cpu(in+i*(n/layer.batch), layer.c, layer.h, layer.w, layer.size, layer.stride, b+i*(n/layer.batch));
+im2col_gpu(in+i*(n/layer.batch), layer.c, layer.h, layer.w, layer.size, layer.stride, b+i*(n/layer.batch));
}
gemm(0,0,m,n,k,1,a,k,b,n,0,c,n);
activate_array(layer.output, m*n, layer.activation);
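
For orientation, this hunk is the usual im2col lowering: each input image is unrolled into a column matrix so the convolution becomes a single matrix multiply. The shapes are not shown in this hunk; reconstructed from context and from im2col_cpu further down, presumably:

/* Assumed shapes (reconstructed, not shown in this diff):
   a = layer.filters    m x k, m = layer.n filters, k = layer.size*layer.size*layer.c
   b = layer.col_image  k x n, one column per output location
   c = layer.output     m x n
   gemm(0,0,m,n,k,1,a,k,b,n,0,c,n) then computes c = a*b; BETA = 0, so any
   previous contents of c are overwritten. */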
@@ -162,16 +162,13 @@ void backward_convolutional_layer(convolutional_layer layer, float *delta)
void update_convolutional_layer(convolutional_layer layer, float step, float momentum, float decay)
{
int i;
int size = layer.size*layer.size*layer.c*layer.n;
-for(i = 0; i < layer.n; ++i){
-layer.biases[i] += step*layer.bias_updates[i];
-layer.bias_updates[i] *= momentum;
-}
-for(i = 0; i < size; ++i){
-layer.filters[i] += step*(layer.filter_updates[i] - decay*layer.filters[i]);
-layer.filter_updates[i] *= momentum;
-}
+axpy_cpu(layer.n, step, layer.bias_updates, 1, layer.biases, 1);
+scal_cpu(layer.n, momentum, layer.bias_updates, 1);
+scal_cpu(size, 1.-step*decay, layer.filters, 1);
+axpy_cpu(size, step, layer.filter_updates, 1, layer.filters, 1);
+scal_cpu(size, momentum, layer.filter_updates, 1);
}
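
The removed loops and the new BLAS-style calls compute the same update: scaling the filters by (1 - step*decay) and then adding step*filter_updates is algebraically identical to filters[i] += step*(filter_updates[i] - decay*filters[i]). A minimal sketch of axpy_cpu and scal_cpu, consistent with the declarations added to mini_blas.h below (the real implementations presumably live in the new axpy.o and are not shown in this diff):

void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
{
    /* y += alpha*x, standard BLAS saxpy semantics */
    int i;
    for(i = 0; i < N; ++i) Y[i*INCY] += ALPHA*X[i*INCX];
}

void scal_cpu(int N, float ALPHA, float *X, int INCX)
{
    /* x *= alpha, standard BLAS sscal semantics */
    int i;
    for(i = 0; i < N; ++i) X[i*INCX] *= ALPHA;
}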
void test_convolutional_layer()

src/data.c @@ -123,7 +123,7 @@ data load_cifar10_data(char *filename)
{
data d;
d.shallow = 0;
-unsigned long i,j;
+long i,j;
matrix X = make_matrix(10000, 3072);
matrix y = make_matrix(10000, 10);
d.X = X;

src/gemm.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include "opencl.h"
#include "mini_blas.h"
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)
#define BLOCK 8
cl_kernel get_gemm_kernel()
{
static int init = 0;
static cl_kernel gemm_kernel;
if(!init){
gemm_kernel = get_kernel("src/gemm.cl", "gemm", "-D BLOCK=" STR(BLOCK) );
init = 1;
}
return gemm_kernel;
}
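
The STR/STR_HELPER pair is the standard two-step preprocessor stringification: STR(BLOCK) first expands BLOCK to 8 and then stringizes it, so the adjacent string literals concatenate into a single compile option:

/* "-D BLOCK=" STR(BLOCK)  ->  "-D BLOCK=" "8"  ->  "-D BLOCK=8" */

The static init flag caches the compiled kernel, so src/gemm.cl is built once per process and reused on every gpu_gemm call.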
void gpu_gemm(int TA, int TB, int M, int N, int K, float ALPHA,
float *A, int lda,
float *B, int ldb,
float BETA,
float *C, int ldc)
{
cl_setup();
cl_kernel gemm_kernel = get_gemm_kernel();
cl_context context = cl.context;
cl_command_queue queue = cl.queue;
size_t size = sizeof(float)*(TA ? lda*K:lda*M);
cl_mem A_gpu = clCreateBuffer(context,
CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
size, A, &cl.error);
check_error(cl);
size = sizeof(float)*(TB ? ldb*N:ldb*K);
cl_mem B_gpu = clCreateBuffer(context,
CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
size, B, &cl.error);
check_error(cl);
size = sizeof(float)*(ldc*M);
cl_mem C_gpu = clCreateBuffer(context,
CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR,
size, C, &cl.error);
check_error(cl);
cl_uint i = 0;
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TA), (void*) &TA);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TB), (void*) &TB);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(M), (void*) &M);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(N), (void*) &N);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(K), (void*) &K);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ALPHA), (void*) &ALPHA);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(A_gpu), (void*) &A_gpu);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(lda), (void*) &lda);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(B_gpu), (void*) &B_gpu);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldb), (void*) &ldb);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(BETA), (void*) &BETA);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(C_gpu), (void*) &C_gpu);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc);
check_error(cl);
const size_t global_size[] = {ceil((float)M/BLOCK)*BLOCK, ceil((float)N/BLOCK)*BLOCK};
const size_t local_size[] = {BLOCK, BLOCK};
//printf("%zd %zd %zd %zd\n", global_size[0], global_size[1], local_size[0], local_size[1]);
clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, local_size, 0, 0, 0);
check_error(cl);
clEnqueueReadBuffer(queue, C_gpu, CL_TRUE, 0, size, C, 0, 0, 0);
check_error(cl);
clReleaseMemObject(A_gpu);
clReleaseMemObject(B_gpu);
clReleaseMemObject(C_gpu);
}
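
Two details worth flagging in gpu_gemm. First, each clSetKernelArg call overwrites cl.error, so the single check_error(cl) only sees the last return code. Second, global_size is rounded up to a multiple of BLOCK, so up to BLOCK-1 extra work-items run per dimension; src/gemm.cl is not shown in this diff, but it must bounds-check against M and N. A hypothetical, untiled sketch of that guard (the real kernel is presumably BLOCK x BLOCK tiled, and the transpose flags are ignored here for brevity):

__kernel void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
                __global float *A, int lda,
                __global float *B, int ldb,
                float BETA,
                __global float *C, int ldc)
{
    int row = get_global_id(0);
    int col = get_global_id(1);
    if(row >= M || col >= N) return;   /* padding work-items exit early */
    float sum = 0;
    int k;
    for(k = 0; k < K; ++k) sum += A[row*lda + k]*B[k*ldb + col];
    C[row*ldc + col] = ALPHA*sum + BETA*C[row*ldc + col];
}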
void time_gpu_random_matrix(int TA, int TB, int m, int k, int n)
{
float *a;
if(!TA) a = random_matrix(m,k);
else a = random_matrix(k,m);
int lda = (!TA)?k:m;
float *b;
if(!TB) b = random_matrix(k,n);
else b = random_matrix(n,k);
int ldb = (!TB)?n:k;
float *c = random_matrix(m,n);
int i;
clock_t start = clock(), end;
for(i = 0; i<1000; ++i){
gpu_gemm(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
}
end = clock();
printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC);
free(a);
free(b);
free(c);
}
void test_gpu_accuracy(int TA, int TB, int m, int k, int n)
{
srand(0);
float *a;
if(!TA) a = random_matrix(m,k);
else a = random_matrix(k,m);
int lda = (!TA)?k:m;
float *b;
if(!TB) b = random_matrix(k,n);
else b = random_matrix(n,k);
int ldb = (!TB)?n:k;
float *c = random_matrix(m,n);
float *c_gpu = random_matrix(m,n);
memset(c, 0, m*n*sizeof(float));
memset(c_gpu, 0, m*n*sizeof(float));
int i;
//pm(m,k,b);
gpu_gemm(TA,TB,m,n,k,1,a,lda,b,ldb,1,c_gpu,n);
//pm(m, n, c_gpu);
cpu_gemm(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
//pm(m, n, c);
double sse = 0;
for(i = 0; i < m*n; ++i) {
//printf("%f %f\n", c[i], c_gpu[i]);
sse += pow(c[i]-c_gpu[i], 2);
}
printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %g MSE\n",m,k,k,n, TA, TB, sse/(m*n));
free(a);
free(b);
free(c);
}
void test_gpu_blas()
{
test_gpu_accuracy(0,0,17,10,10);
test_gpu_accuracy(1,0,17,10,10);
test_gpu_accuracy(0,1,17,10,10);
test_gpu_accuracy(1,1,17,10,10);
test_gpu_accuracy(0,0,1000,10,100);
test_gpu_accuracy(1,0,1000,10,100);
test_gpu_accuracy(0,1,1000,10,100);
test_gpu_accuracy(1,1,1000,10,100);
time_gpu_random_matrix(0,0,1000,1000,100);
time_random_matrix(0,0,1000,1000,100);
time_gpu_random_matrix(0,1,1000,1000,100);
time_random_matrix(0,1,1000,1000,100);
time_gpu_random_matrix(1,0,1000,1000,100);
time_random_matrix(1,0,1000,1000,100);
time_gpu_random_matrix(1,1,1000,1000,100);
time_random_matrix(1,1,1000,1000,100);
}
/*
cl_kernel get_gemm_kernel_slow()
{
static int init = 0;
static cl_kernel gemm_kernel;
if(!init){
gemm_kernel = get_kernel("src/gemm.cl", "gemm_slow");
init = 1;
}
return gemm_kernel;
}
void gpu_gemm_slow(int TA, int TB, int M, int N, int K, float ALPHA,
float *A, int lda,
float *B, int ldb,
float BETA,
float *C, int ldc)
{
cl_setup();
cl_kernel gemm_kernel = get_gemm_kernel_slow();
cl_context context = cl.context;
cl_command_queue queue = cl.queue;
size_t size = sizeof(float)*(TA ? lda*K:lda*M);
cl_mem A_gpu = clCreateBuffer(context,
CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
size, A, &cl.error);
check_error(cl);
size = sizeof(float)*(TB ? ldb*N:ldb*K);
cl_mem B_gpu = clCreateBuffer(context,
CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
size, B, &cl.error);
check_error(cl);
size = sizeof(float)*(ldc*M);
cl_mem C_gpu = clCreateBuffer(context,
CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
size, C, &cl.error);
check_error(cl);
cl_uint i = 0;
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TA), (void*) &TA);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TB), (void*) &TB);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(M), (void*) &M);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(N), (void*) &N);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(K), (void*) &K);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ALPHA), (void*) &ALPHA);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(A_gpu), (void*) &A_gpu);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(lda), (void*) &lda);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(B_gpu), (void*) &B_gpu);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldb), (void*) &ldb);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(BETA), (void*) &BETA);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(C_gpu), (void*) &C_gpu);
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc);
check_error(cl);
const size_t global_size[] = {M, N};
clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, 0, 0, 0, 0);
clEnqueueReadBuffer(queue, C_gpu, CL_TRUE, 0, size, C, 0, 0, 0);
clReleaseMemObject(A_gpu);
clReleaseMemObject(B_gpu);
clReleaseMemObject(C_gpu);
}
*/

src/list.c @@ -11,6 +11,7 @@ list *make_list()
return l;
}
+/*
void transfer_node(list *s, list *d, node *n)
{
node *prev, *next;
@@ -22,6 +23,7 @@ void transfer_node(list *s, list *d, node *n)
if(s->front == n) s->front = next;
if(s->back == n) s->back = prev;
}
+*/
void *list_pop(list *l){
if(!l->back) return 0;

src/mini_blas.c

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
@@ -18,77 +17,7 @@ void pm(int M, int N, float *A)
printf("\n");
}
-void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
-float *A, int lda,
-float *B, int ldb,
-float BETA,
-float *C, int ldc)
-{
-gpu_gemm( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
-}
-void im2row(float *image, int h, int w, int c, int size, int stride, float *matrix)
-{
-int i;
-int mc = c;
-int mw = (size*size);
-int mh = ((h-size)/stride+1)*((w-size)/stride+1);
-int msize = mc*mw*mh;
-for(i = 0; i < msize; ++i){
-int channel = i/(mh*mw);
-int block = (i%(mh*mw))/mw;
-int position = i%mw;
-int block_h = block/((w-size)/stride+1);
-int block_w = block%((w-size)/stride+1);
-int ph, pw, pc;
-ph = position/size+block_h;
-pw = position%size+block_w;
-pc = channel;
-matrix[i] = image[pc*h*w+ph*w+pw];
-}
-}
-void im2col(float *image, int h, int w, int c, int size, int stride, float *matrix)
-{
-int b,p;
-int blocks = ((h-size)/stride+1)*((w-size)/stride+1);
-int pixels = (size*size*c);
-for(b = 0; b < blocks; ++b){
-int block_h = b/((w-size)/stride+1);
-int block_w = b%((w-size)/stride+1);
-for(p = 0; p < pixels; ++p){
-int ph, pw, pc;
-int position = p%(size*size);
-pc = p/(size*size);
-ph = position/size+block_h;
-pw = position%size+block_w;
-matrix[b+p*blocks] = image[pc*h*w+ph*w+pw];
-}
-}
-}
-//From Berkeley Vision's Caffe!
-void im2col_cpu(float* data_im, const int channels,
-const int height, const int width, const int ksize, const int stride,
-float* data_col)
-{
-int c,h,w;
-int height_col = (height - ksize) / stride + 1;
-int width_col = (width - ksize) / stride + 1;
-int channels_col = channels * ksize * ksize;
-for ( c = 0; c < channels_col; ++c) {
-int w_offset = c % ksize;
-int h_offset = (c / ksize) % ksize;
-int c_im = c / ksize / ksize;
-for ( h = 0; h < height_col; ++h) {
-for ( w = 0; w < width_col; ++w) {
-data_col[(c * height_col + h) * width_col + w] =
-data_im[(c_im * height + h * stride + h_offset) * width
-+ w * stride + w_offset];
-}
-}
-}
-}
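
A quick worked example of the shapes im2col_cpu produces: with channels = 1, height = width = 4, ksize = 3 and stride = 1, height_col = width_col = (4-3)/1+1 = 2 and channels_col = 1*3*3 = 9, so data_col holds 9*2*2 = 36 floats: each of the 4 output positions gets one 3x3 patch, spread down the 9 rows. A hypothetical driver:

void im2col_example()
{
    float im[1*4*4] = {0};   /* c=1, h=w=4 input image */
    float col[9*2*2];        /* channels_col * height_col * width_col */
    im2col_cpu(im, 1, 4, 4, 3, 1, col);
    /* col[(c*2 + h)*2 + w] holds the input pixel at patch offset
       (h_offset, w_offset) = ((c/3)%3, c%3) for output position (h, w) */
}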
//This one might be too, can't remember.
void col2im_cpu(float* data_col, const int channels,
const int height, const int width, const int ksize, const int stride,
float* data_im)
@@ -135,7 +64,7 @@ void time_random_matrix(int TA, int TB, int m, int k, int n)
int i;
clock_t start = clock(), end;
for(i = 0; i<1000; ++i){
-cpu_gemm(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
+gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
}
end = clock();
printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC);

src/mini_blas.h @@ -6,8 +6,9 @@ void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
float *C, int ldc);
float *random_matrix(int rows, int cols);
void time_random_matrix(int TA, int TB, int m, int k, int n);
-void im2row(float *image, int h, int w, int c, int size, int stride, float *matrix);
-void im2col(float *image, int h, int w, int c, int size, int stride, float *matrix);
+void im2col_gpu(float* data_im, const int channels,
+const int height, const int width, const int ksize, const int stride,
+float* data_col);
void im2col_cpu(float* data_im, const int channels,
const int height, const int width, const int ksize, const int stride,
float* data_col);
@@ -16,14 +17,16 @@ void col2im_cpu(float* data_col, const int channels,
float* data_im);
void test_blas();
-void gpu_gemm(int TA, int TB, int M, int N, int K, float ALPHA,
+void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
float *A, int lda,
float *B, int ldb,
float BETA,
float *C, int ldc);
-void cpu_gemm(int TA, int TB, int M, int N, int K, float ALPHA,
+void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
float *A, int lda,
float *B, int ldb,
float BETA,
float *C, int ldc);
+void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
+void scal_cpu(int N, float ALPHA, float *X, int INCX);
void test_gpu_blas();
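
With cpu_gemm and gpu_gemm renamed to gemm_cpu and gemm_gpu, the unconditional wrapper removed from mini_blas.c above presumably gives way to a compile-time dispatch in the new gemm.o. A plausible sketch, assuming it keys off the same GPU define the Makefile sets (the actual dispatcher is not shown in this diff):

void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float BETA,
        float *C, int ldc)
{
#ifdef GPU
    gemm_gpu(TA, TB, M, N, K, ALPHA, A, lda, B, ldb, BETA, C, ldc);
#else
    gemm_cpu(TA, TB, M, N, K, ALPHA, A, lda, B, ldb, BETA, C, ldc);
#endif
}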

src/opencl.c

+#ifdef GPU
#include "opencl.h"
#include <stdio.h>
#include <stdlib.h>
@@ -12,6 +13,7 @@ void check_error(cl_info info)
{
if (info.error != CL_SUCCESS) {
printf("\n Error number %d", info.error);
+exit(1);
}
}
@@ -66,6 +68,7 @@ cl_program cl_fprog(char *filename, char *options, cl_info info)
clGetProgramBuildInfo( prog, info.device, CL_PROGRAM_BUILD_LOG, 4096, build_c, 0);
fprintf(stderr, "Build Log for %s program:\n%s\n", filename, build_c);
}
+check_error(info);
return prog;
}
@@ -85,4 +88,4 @@ cl_kernel get_kernel(char *filename, char *kernelname, char *options)
return kernel;
}
+#endif
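
Since check_error now calls exit(1), the first failing OpenCL call aborts the run instead of merely printing, which is why call sites like gpu_gemm can remain unguarded; likewise the new check_error(info) in cl_fprog makes a failed kernel build fatal right after its build log is printed. The #ifdef GPU / #endif wrapper means this file compiles to an empty translation unit when GPU is off, which is why opencl.o can now sit unconditionally in the OBJ list in the Makefile.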