From 40cc1046395385dd256012810866eba34904f034 Mon Sep 17 00:00:00 2001
From: Joseph Redmon <pjreddie@gmail.com>
Date: Mon, 28 Sep 2015 14:32:28 -0700
Subject: [PATCH] region layer: add forced box option, score with chosen-box IOU

---
 Makefile           |  4 +--
 cfg/yolo.cfg       | 10 +++----
 src/layer.h        |  1 +
 src/parser.c       |  1 +
 src/region_layer.c | 40 ++++++++++++++++++++-----
 src/swag.c         | 75 +++++++++++++++++++++++-----------------------
 src/yolo.c         |  6 +++-
 7 files changed, 83 insertions(+), 54 deletions(-)

diff --git a/Makefile b/Makefile
index cdf200c..22e89a1 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
-GPU=0
-OPENCV=0
+GPU=1
+OPENCV=1
 DEBUG=0
 
 ARCH= --gpu-architecture=compute_20 --gpu-code=compute_20
diff --git a/cfg/yolo.cfg b/cfg/yolo.cfg
index ab46729..140de88 100644
--- a/cfg/yolo.cfg
+++ b/cfg/yolo.cfg
@@ -1,17 +1,17 @@
 [net]
 batch=64
-subdivisions=64
+subdivisions=4
 height=448
 width=448
 channels=3
-learning_rate=0.001
+learning_rate=0.01
 momentum=0.9
 decay=0.0005
 
 policy=steps
-steps=50, 5000
-scales=10, .1
-max_batches = 8000
+steps=20000
+scales=.1
+max_batches = 35000
 
 [crop]
 crop_width=448
diff --git a/src/layer.h b/src/layer.h
index d13cdbf..808aba4 100644
--- a/src/layer.h
+++ b/src/layer.h
@@ -28,6 +28,7 @@ typedef struct {
     ACTIVATION activation;
     COST_TYPE cost_type;
     int batch;
+    int forced;
     int inputs;
     int outputs;
     int truths;
diff --git a/src/parser.c b/src/parser.c
index 7ea1b3f..6daeb13 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -187,6 +187,7 @@ region_layer parse_region(list *options, size_params params)
     layer.sqrt = option_find_int(options, "sqrt", 0);
 
     layer.coord_scale = option_find_float(options, "coord_scale", 1);
+    layer.forced = option_find_int(options, "forced", 0);
     layer.object_scale = option_find_float(options, "object_scale", 1);
     layer.noobject_scale = option_find_float(options, "noobject_scale", 1);
     layer.class_scale = option_find_float(options, "class_scale", 1);
diff --git a/src/region_layer.c b/src/region_layer.c
index 39af5ee..4d8c2a4 100644
--- a/src/region_layer.c
+++ b/src/region_layer.c
@@ -82,9 +82,12 @@ void forward_region_layer(const region_layer l, network_state state)
 
                 int best_index = -1;
                 float best_iou = 0;
-                float best_rmse = 4;
+                float best_rmse = 20;
 
-                if (!is_obj) continue;
+                if (!is_obj){
+                    //printf(".");
+                    continue;
+                }
 
                 int class_index = index + i*l.classes;
                 for(j = 0; j < l.classes; ++j) {
@@ -123,18 +126,38 @@ void forward_region_layer(const region_layer l, network_state state)
                         }
                     }
                 }
+
+                if(l.forced){
+                    if(truth.w*truth.h < .1){
+                        best_index = 1;
+                    }else{
+                        best_index = 0;
+                    }
+                }
+
+                int box_index = index + locations*(l.classes + l.n) + (i*l.n + best_index) * l.coords;
+                int tbox_index = truth_index + 1 + l.classes;
+
+                box out = float_to_box(l.output + box_index);
+                out.x /= l.side;
+                out.y /= l.side;
+                if (l.sqrt) {
+                    out.w = out.w*out.w;
+                    out.h = out.h*out.h;
+                }
+                float iou  = box_iou(out, truth);
+
+                //printf("%d", best_index);
                 int p_index = index + locations*l.classes + i*l.n + best_index;
                 *(l.cost) -= l.noobject_scale * pow(l.output[p_index], 2);
                 *(l.cost) += l.object_scale * pow(1-l.output[p_index], 2);
                 avg_obj += l.output[p_index];
-                l.delta[p_index+0] = l.object_scale * (1.-l.output[p_index]);
+                l.delta[p_index] = l.object_scale * (1.-l.output[p_index]);
 
                 if(l.rescore){
-                    l.delta[p_index+0] = l.object_scale * (best_iou - l.output[p_index]);
+                    l.delta[p_index] = l.object_scale * (iou - l.output[p_index]);
                 }
 
-                int box_index = index + locations*(l.classes + l.n) + (i*l.n + best_index) * l.coords;
-                int tbox_index = truth_index + 1 + l.classes;
                 l.delta[box_index+0] = l.coord_scale*(state.truth[tbox_index + 0] - l.output[box_index + 0]);
                 l.delta[box_index+1] = l.coord_scale*(state.truth[tbox_index + 1] - l.output[box_index + 1]);
                 l.delta[box_index+2] = l.coord_scale*(state.truth[tbox_index + 2] - l.output[box_index + 2]);
@@ -144,14 +167,15 @@ void forward_region_layer(const region_layer l, network_state state)
                     l.delta[box_index+3] = l.coord_scale*(sqrt(state.truth[tbox_index + 3]) - l.output[box_index + 3]);
                 }
 
-                *(l.cost) += pow(1-best_iou, 2);
-                avg_iou += best_iou;
+                *(l.cost) += pow(1-iou, 2);
+                avg_iou += iou;
                 ++count;
             }
             if(l.softmax){
                 gradient_array(l.output + index + locations*l.classes, locations*l.n*(1+l.coords), 
                         LOGISTIC, l.delta + index + locations*l.classes);
             }
+            //printf("\n");
         }
         printf("Region Avg IOU: %f, Pos Cat: %f, All Cat: %f, Pos Obj: %f, Any Obj: %f, count: %d\n", avg_iou/count, avg_cat/count, avg_allcat/(count*l.classes), avg_obj/count, avg_anyobj/(l.batch*locations*l.n), count);
     }
diff --git a/src/swag.c b/src/swag.c
index 7058df5..ec58f0d 100644
--- a/src/swag.c
+++ b/src/swag.c
@@ -1,4 +1,5 @@
 #include "network.h"
+#include "region_layer.h"
 #include "detection_layer.h"
 #include "cost_layer.h"
 #include "utils.h"
@@ -11,40 +12,37 @@
 
 char *voc_names[] = {"aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"};
 
-void draw_swag(image im, float *box, int side, int objectness, char *label, float thresh)
+void draw_swag(image im, float *predictions, int side, int num, char *label, float thresh)
 {
     int classes = 20;
-    int elems = 4+classes+objectness;
-    int j;
-    int r, c;
-
-    for(r = 0; r < side; ++r){
-        for(c = 0; c < side; ++c){
-            j = (r*side + c) * elems;
-            float scale = 1;
-            if(objectness) scale = 1 - box[j++];
-            int class = max_index(box+j, classes);
-            if(scale * box[j+class] > thresh){
-                int width = sqrt(scale*box[j+class])*5 + 1;
-                printf("%f %s\n", scale * box[j+class], voc_names[class]);
+    int i,n;
+
+    for(i = 0; i < side*side; ++i){
+        int row = i / side;
+        int col = i % side;
+        for(n = 0; n < num; ++n){
+            int p_index = side*side*classes + i*num + n;
+            int box_index = side*side*(classes + num) + (i*num + n)*4;
+            int class_index = i*classes;
+            float scale = predictions[p_index];
+            int class = max_index(predictions+class_index, classes);
+            float prob = scale * predictions[class_index + class];
+            if(prob > thresh){
+                int width = sqrt(prob)*5 + 1;
+                printf("%f %s\n", prob, voc_names[class]);
                 float red = get_color(0,class,classes);
                 float green = get_color(1,class,classes);
                 float blue = get_color(2,class,classes);
-
-                j += classes;
-                float x = box[j+0];
-                float y = box[j+1];
-                x = (x+c)/side;
-                y = (y+r)/side;
-                float w = box[j+2]; //*maxwidth;
-                float h = box[j+3]; //*maxheight;
-                h = h*h;
-                w = w*w;
-
-                int left  = (x-w/2)*im.w;
-                int right = (x+w/2)*im.w;
-                int top   = (y-h/2)*im.h;
-                int bot   = (y+h/2)*im.h;
+                box b = float_to_box(predictions+box_index);
+                b.x = (b.x + col)/side;
+                b.y = (b.y + row)/side;
+                b.w = b.w*b.w;
+                b.h = b.h*b.h;
+
+                int left  = (b.x-b.w/2)*im.w;
+                int right = (b.x+b.w/2)*im.w;
+                int top   = (b.y-b.h/2)*im.h;
+                int bot   = (b.y+b.h/2)*im.h;
                 draw_box_width(im, left, top, right, bot, width, red, green, blue);
             }
         }
@@ -103,13 +101,13 @@ void train_swag(char *cfgfile, char *weightfile)
 
         printf("Loaded: %lf seconds\n", sec(clock()-time));
 
-/*
-        image im = float_to_image(net.w, net.h, 3, train.X.vals[113]);
-        image copy = copy_image(im);
-        draw_swag(copy, train.y.vals[113], 7, "truth");
-        cvWaitKey(0);
-        free_image(copy);
-        */
+        /*
+           image im = float_to_image(net.w, net.h, 3, train.X.vals[113]);
+           image copy = copy_image(im);
+           draw_swag(copy, train.y.vals[113], 7, "truth");
+           cvWaitKey(0);
+           free_image(copy);
+         */
 
         time=clock();
         float loss = train_network(net, train);
@@ -270,7 +268,7 @@ void test_swag(char *cfgfile, char *weightfile, char *filename, float thresh)
     if(weightfile){
         load_weights(&net, weightfile);
     }
-    detection_layer layer = get_network_detection_layer(net);
+    region_layer layer = net.layers[net.n-1];
     set_batch_network(&net, 1);
     srand(2222222);
     clock_t time;
@@ -292,7 +290,8 @@ void test_swag(char *cfgfile, char *weightfile, char *filename, float thresh)
         time=clock();
         float *predictions = network_predict(net, X);
         printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-        draw_swag(im, predictions, 7, layer.objectness, "predictions", thresh);
+        draw_swag(im, predictions, layer.side, layer.n, "predictions", thresh);
+        show_image(sized, "resized");
         free_image(im);
         free_image(sized);
 #ifdef OPENCV
diff --git a/src/yolo.c b/src/yolo.c
index b2c89d8..4b241f3 100644
--- a/src/yolo.c
+++ b/src/yolo.c
@@ -65,7 +65,6 @@ void train_yolo(char *cfgfile, char *weightfile)
     if(weightfile){
         load_weights(&net, weightfile);
     }
-    detection_layer layer = get_network_detection_layer(net);
     int imgs = 128;
     int i = *net.seen/imgs;
 
@@ -74,11 +73,16 @@ void train_yolo(char *cfgfile, char *weightfile)
     int N = plist->size;
     paths = (char **)list_to_array(plist);
 
+    if(i*imgs > N*80){
+        net.layers[net.n-1].objectness = 0;
+        net.layers[net.n-1].joint = 1;
+    }
     if(i*imgs > N*120){
         net.layers[net.n-1].rescore = 1;
     }
     data train, buffer;
 
+    detection_layer layer = get_network_detection_layer(net);
     int classes = layer.classes;
     int background = layer.objectness;
     int side = sqrt(get_detection_layer_locations(layer));
-- 
GitLab