diff thesis/cortex.org @ 470:3401053124b0

integrating vision into thesis.
author Robert McIntyre <rlm@mit.edu>
date Fri, 28 Mar 2014 17:10:43 -0400
parents ae10f35022ba
children f14fa9e5b67f
     1.1 --- a/thesis/cortex.org	Fri Mar 28 16:34:35 2014 -0400
     1.2 +++ b/thesis/cortex.org	Fri Mar 28 17:10:43 2014 -0400
     1.3 @@ -6,22 +6,36 @@
     1.4  #+LaTeX_CLASS_OPTIONS: [nofloat]
     1.5  
     1.6  * COMMENT templates
     1.7 -  #+caption: 
     1.8 -  #+caption: 
     1.9 -  #+caption: 
    1.10 -  #+caption: 
    1.11 -  #+name: name
    1.12 -  #+begin_listing clojure
    1.13 -  #+begin_src clojure
    1.14 -  #+end_src
    1.15 -  #+end_listing
    1.16 +   #+caption: 
    1.17 +   #+caption: 
    1.18 +   #+caption: 
    1.19 +   #+caption: 
    1.20 +   #+name: name
    1.21 +   #+begin_listing clojure
    1.22 +   #+end_listing
    1.23  
    1.24 -  #+caption: 
    1.25 -  #+caption: 
    1.26 -  #+caption: 
    1.27 -  #+name: name
    1.28 -  #+ATTR_LaTeX: :width 10cm
    1.29 -  [[./images/aurellem-gray.png]]
    1.30 +   #+caption: 
    1.31 +   #+caption: 
    1.32 +   #+caption: 
    1.33 +   #+name: name
    1.34 +   #+ATTR_LaTeX: :width 10cm
    1.35 +   [[./images/aurellem-gray.png]]
    1.36 +
    1.37 +    #+caption: 
    1.38 +    #+caption: 
    1.39 +    #+caption: 
    1.40 +    #+caption: 
    1.41 +    #+name: name
    1.42 +    #+begin_listing clojure
    1.43 +    #+end_listing
    1.44 +
    1.45 +    #+caption: 
    1.46 +    #+caption: 
    1.47 +    #+caption: 
    1.48 +    #+name: name
    1.49 +    #+ATTR_LaTeX: :width 10cm
    1.50 +    [[./images/aurellem-gray.png]]
    1.51 +
    1.52  
    1.53  * COMMENT Empathy and Embodiment as problem solving strategies
    1.54    
    1.55 @@ -942,6 +956,285 @@
    1.56  
    1.57  ** Eyes reuse standard video game components
    1.58  
    1.59 +   Vision is one of the most important senses for humans, so I need to
    1.60 +   build a simulated sense of vision for my AI. I will do this with
    1.61 +   simulated eyes. Each eye can be independently moved and should see
    1.62 +   its own version of the world depending on where it is.
    1.63 +
     1.64 +   Making these simulated eyes a reality is simple because
     1.65 +   jMonkeyEngine already contains extensive support for multiple views
     1.66 +   of the same 3D simulated world. jMonkeyEngine has this support
     1.67 +   because it is necessary for games with split-screen views.
     1.68 +   Multiple views are also used to create efficient
     1.69 +   pseudo-reflections by rendering the scene from a certain
     1.70 +   perspective and then projecting it back onto a surface in the 3D
     1.71 +   world.
    1.72 +
    1.73 +   #+caption: jMonkeyEngine supports multiple views to enable 
     1.74 +   #+caption: split-screen games, like GoldenEye, which helped 
     1.75 +   #+caption: popularize split-screen multiplayer.
     1.76 +   #+name: goldeneye
    1.77 +   #+ATTR_LaTeX: :width 10cm
    1.78 +   [[./images/goldeneye-4-player.png]]
    1.79 +
    1.80 +*** A Brief Description of jMonkeyEngine's Rendering Pipeline
    1.81 +
    1.82 +    jMonkeyEngine allows you to create a =ViewPort=, which represents a
    1.83 +    view of the simulated world. You can create as many of these as you
    1.84 +    want. Every frame, the =RenderManager= iterates through each
    1.85 +    =ViewPort=, rendering the scene on the GPU. For each =ViewPort= there
    1.86 +    is a =FrameBuffer= which represents the rendered image in the GPU.
    1.87 +  
    1.88 +    #+caption: =ViewPorts= are cameras in the world. During each frame, 
    1.89 +    #+caption: the =RenderManager= records a snapshot of what each view 
    1.90 +    #+caption: is currently seeing; these snapshots are =FrameBuffer= objects.
    1.91 +    #+name: rendermanagers
    1.92 +    #+ATTR_LaTeX: :width 10cm
    1.93 +    [[./images/diagram_rendermanager2.png]]
    1.94 +
    1.95 +    Each =ViewPort= can have any number of attached =SceneProcessor=
    1.96 +    objects, which are called every time a new frame is rendered. A
    1.97 +    =SceneProcessor= receives its =ViewPort='s =FrameBuffer= and can
    1.98 +    do whatever it wants to the data. Often this consists of invoking
    1.99 +    GPU-specific operations on the rendered image. The =SceneProcessor=
   1.100 +    can also copy the GPU image data to RAM and process it with the CPU.
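          +
          +    As a concrete illustration, here is a minimal sketch, assuming
          +    jMonkeyEngine's standard =RenderManager= API, of adding an extra
          +    view of the same scene and attaching a processor to it. The
          +    names =render-manager=, =root-node=, and =my-scene-processor=
          +    stand for the running application's instances:
          +
          +    #+begin_src clojure
          +(let [cam  (Camera. 640 480)
          +      view (.createMainView render-manager "extra-view" cam)]
          +  ;; clear color, depth, and stencil before drawing this view
          +  (.setClearFlags view true true true)
          +  (.attachScene view root-node)
          +  ;; the processor sees this view's FrameBuffer every frame
          +  (.addProcessor view my-scene-processor))
          +    #+end_src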
   1.101 +
   1.102 +*** Appropriating Views for Vision
   1.103 +
   1.104 +    Each eye in the simulated creature needs its own =ViewPort= so
   1.105 +    that it can see the world from its own perspective. To this
   1.106 +    =ViewPort=, I add a =SceneProcessor= that feeds the visual data to
   1.107 +    an arbitrary continuation function for further processing. That
   1.108 +    continuation function may perform both CPU and GPU operations on
   1.109 +    the data. To make this easy for the continuation function, the
   1.110 +    =SceneProcessor= maintains appropriately sized buffers in RAM to
   1.111 +    hold the data. It does not do any copying from the GPU to the CPU
   1.112 +    itself, because that is a slow operation.
   1.113 +
   1.114 +    #+caption: Function to make the rendered scene in jMonkeyEngine 
   1.115 +    #+caption: available for further processing.
   1.116 +    #+name: pipeline-1 
   1.117 +    #+begin_listing clojure
   1.118 +    #+begin_src clojure
   1.119 +(defn vision-pipeline
   1.120 +  "Create a SceneProcessor object which wraps a vision processing
   1.121 +  continuation function. The continuation is a function that takes 
   1.122 +  [#^Renderer r #^FrameBuffer fb #^ByteBuffer b #^BufferedImage bi],
   1.123 +  each of which has already been appropriately sized."
   1.124 +  [continuation]
   1.125 +  (let [byte-buffer (atom nil)
   1.126 +        renderer    (atom nil)
   1.127 +        image       (atom nil)]
   1.128 +    (proxy [SceneProcessor] []
   1.129 +      (initialize
   1.130 +       [renderManager viewPort]
   1.131 +       (let [cam    (.getCamera viewPort)
   1.132 +             width  (.getWidth cam)
   1.133 +             height (.getHeight cam)]
   1.134 +         (reset! renderer (.getRenderer renderManager))
   1.135 +         (reset! byte-buffer
   1.136 +                 (BufferUtils/createByteBuffer
   1.137 +                  (* width height 4)))
   1.138 +         (reset! image (BufferedImage.
   1.139 +                        width height
   1.140 +                        BufferedImage/TYPE_4BYTE_ABGR))))
   1.141 +      (isInitialized [] (not (nil? @byte-buffer)))
   1.142 +      (reshape [_ _ _])
   1.143 +      (preFrame [_])
   1.144 +      (postQueue [_])
   1.145 +      (postFrame
   1.146 +       [#^FrameBuffer fb]
   1.147 +       (.clear @byte-buffer)
   1.148 +       (continuation @renderer fb @byte-buffer @image))
   1.149 +      (cleanup []))))
   1.150 +    #+end_src
   1.151 +    #+end_listing
   1.152 +
   1.153 +    The continuation function given to =vision-pipeline= above will be
   1.154 +    given a =Renderer= and three containers for image data. The
   1.155 +    =FrameBuffer= references the GPU image data, but the pixel data
   1.156 +    cannot be used directly on the CPU. The =ByteBuffer= and
   1.157 +    =BufferedImage= are initially "empty" but are sized to hold the
   1.158 +    data in the =FrameBuffer=. I call transferring the GPU image data
   1.159 +    to the CPU structures "mixing" the image data.
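          +
          +    As a sketch, a "mixing" continuation might look like the
          +    following, assuming jMonkeyEngine's =Renderer.readFrameBuffer=
          +    method and its =Screenshots= utility class; =CORTEX='s own
          +    helpers perform the equivalent steps:
          +
          +    #+begin_src clojure
          +(defn mix-image!
          +  "Copy the GPU image in fb into bb, then decode bb into the
          +   BufferedImage bi. vision-pipeline has already cleared bb."
          +  [#^Renderer r #^FrameBuffer fb #^ByteBuffer bb #^BufferedImage bi]
          +  (.readFrameBuffer r fb bb)            ;; GPU -> RAM (the slow step)
          +  (Screenshots/convertScreenShot bb bi) ;; raw bytes -> image
          +  bi)
          +    #+end_src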
   1.160 +
   1.161 +*** Optical sensor arrays are described with images and referenced with metadata
   1.162 +
   1.163 +    The vision pipeline described above handles the flow of rendered
   1.164 +    images. Now, =CORTEX= needs simulated eyes to serve as the source
   1.165 +    of these images.
   1.166 +
   1.167 +    Eyes are described in blender in the same way as joints: they
   1.168 +    are zero-dimensional empty objects with no geometry whose local
   1.169 +    coordinate system determines the orientation of the resulting eye.
   1.170 +    All eyes are children of a parent node named "eyes", just as all
   1.171 +    joints have a parent named "joints". An eye binds to the nearest
   1.172 +    physical object with =bind-sense=.
   1.173 +
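          +    Assuming =sense-nodes= builds a function that collects the
          +    children of the named parent node, as it does for the other
          +    senses, a creature's eyes can be enumerated like this:
          +
          +    #+begin_src clojure
          +(def
          +  ^{:doc "Return the children of the creature's \"eyes\" node."}
          +  eyes (sense-nodes "eyes"))
          +    #+end_src
          +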
   1.174 +    #+caption: Here, the camera is created based on metadata on the
   1.175 +    #+caption: eye-node and attached to the nearest physical object 
   1.176 +    #+caption: with =bind-sense=.
   1.177 +    #+name: add-eye
   1.178 +    #+begin_listing clojure
          +    #+begin_src clojure
   1.179 +(defn add-eye!
   1.180 +  "Create a Camera centered on the current position of 'eye which
   1.181 +   follows the closest physical node in 'creature. The camera will
   1.182 +   point in the X direction and use the Z vector as up as determined
   1.183 +   by the rotation of these vectors in blender coordinate space. Use
   1.184 +   XZY rotation for the node in blender."
   1.185 +  [#^Node creature #^Spatial eye]
   1.186 +  (let [target (closest-node creature eye)
   1.187 +        [cam-width cam-height] 
   1.188 +        ;;[640 480] ;; graphics card on laptop doesn't support
   1.189 +                    ;; arbitrary dimensions.
   1.190 +        (eye-dimensions eye)
   1.191 +        cam (Camera. cam-width cam-height)
   1.192 +        rot (.getWorldRotation eye)]
   1.193 +    (.setLocation cam (.getWorldTranslation eye))
   1.194 +    (.lookAtDirection
   1.195 +     cam                           ; this part is not a mistake and
   1.196 +     (.mult rot Vector3f/UNIT_X)   ; is consistent with using Z in
   1.197 +     (.mult rot Vector3f/UNIT_Y))  ; blender as the UP vector.
   1.198 +    (.setFrustumPerspective
   1.199 +     cam (float 45)
   1.200 +     (float (/ (.getWidth cam) (.getHeight cam)))
   1.201 +     (float 1)
   1.202 +     (float 1000))
   1.203 +    (bind-sense target cam) cam))
          +    #+end_src
   1.204 +    #+end_listing
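          +
          +    The =setFrustumPerspective= call fixes a 45 degree field of
          +    view with near and far clipping planes at 1 and 1000 world
          +    units, which mirrors jMonkeyEngine's customary camera defaults.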
   1.205 +
   1.206 +*** Simulated Retina 
   1.207 +
   1.208 +    An eye is a surface (the retina) which contains many discrete
   1.209 +    sensors to detect light. These sensors can have different
   1.210 +    light-sensing properties. In humans, each discrete sensor is
   1.211 +    sensitive to red, blue, or green light, or to gray (luminance).
   1.212 +    These sensor types can be distributed differently along the retina.
   1.213 +    In humans, there is a fovea in the center of the retina which has
   1.214 +    a very high density of color sensors, and a blind spot which has
   1.215 +    no sensors at all. Sensor density decreases with
   1.216 +    distance from the fovea.
   1.217 +
   1.218 +    I want to be able to model any retinal configuration, so my
   1.219 +    eye-nodes in blender contain metadata pointing to images that
   1.220 +    describe the precise position of the individual sensors using
   1.221 +    white pixels. The metadata also describes each sensor's precise
   1.222 +    sensitivity to light. An eye can contain any number of these
   1.223 +    images. For example, the metadata for an eye might look like
   1.224 +    this:
   1.225 +
   1.226 +    #+begin_src clojure
   1.227 +{0xFF0000 "Models/test-creature/retina-small.png"}
   1.228 +    #+end_src
   1.229 +
   1.230 +    #+caption: An example retinal profile image. White pixels are 
   1.231 +    #+caption: photo-sensitive elements. The distribution of white 
   1.232 +    #+caption: pixels is denser in the middle and falls off at the 
   1.233 +    #+caption: edges, a distribution inspired by the human retina.
   1.234 +    #+name: retina
   1.235 +    #+ATTR_LaTeX: :width 10cm
   1.236 +    [[./images/retina-small.png]]
   1.237 +
   1.238 +    Together, the number 0xFF0000 and the image above describe
   1.239 +    the placement of red-sensitive sensory elements.
   1.240 +
   1.241 +    Metadata that very crudely approximates a human eye might look
   1.242 +    like this:
   1.243 +
   1.244 +    #+begin_src clojure
   1.245 +(let [retinal-profile "Models/test-creature/retina-small.png"]
   1.246 +  {0xFF0000 retinal-profile
   1.247 +   0x00FF00 retinal-profile
   1.248 +   0x0000FF retinal-profile
   1.249 +   0xFFFFFF retinal-profile})
   1.250 +    #+end_src
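          +
          +    A more faithful model would use a different profile image for
          +    each channel, since rods and cones are distributed differently
          +    across the human retina; reusing one image keeps the example
          +    simple.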
   1.251 +
   1.252 +    The numbers that serve as keys in the map determine a sensor's
   1.253 +    relative sensitivity to the channels red, green, and blue. These
   1.254 +    sensitivity values are packed into an integer in the order
   1.255 +    =|_|R|G|B|= in 8-bit fields. The RGB values of a pixel in the
   1.256 +    image are added together with these sensitivities as linear
   1.257 +    weights. Therefore, 0xFF0000 means sensitive to red only while
   1.258 +    0xFFFFFF means sensitive to all colors equally (gray).
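          +
          +    As a sketch of this weighting scheme (the actual =pixel-sense=
          +    is defined in the =CORTEX= source; the normalization constant
          +    here is my assumption), unpack the 8-bit fields of both the
          +    sensitivity and the pixel and combine them as a weighted sum:
          +
          +    #+begin_src clojure
          +(defn pixel-sense-sketch
          +  "Weight a pixel's RGB components by the packed sensitivity,
          +   scaled so that a white pixel seen with sensitivity 0xFFFFFF
          +   yields 1.0."
          +  [sensitivity pixel]
          +  (let [field   (fn [x shift] (bit-and 0xFF (bit-shift-right x shift)))
          +        weights (map #(field sensitivity %) [16 8 0])
          +        colors  (map #(field pixel %)       [16 8 0])]
          +    (float (/ (reduce + (map * weights colors))
          +              (* 255 255 3)))))
          +    #+end_src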
   1.259 +
   1.260 +    #+caption: This is the core of vision in =CORTEX=. A given eye node 
   1.261 +    #+caption: is converted into a function that returns visual
   1.262 +    #+caption: information from the simulation.
   1.263 +    #+name: vision-kernel
   1.264 +    #+begin_listing clojure
          +    #+begin_src clojure
   1.265 +(defn vision-kernel
   1.266 +  "Returns a list of functions, each of which will return a color
   1.267 +   channel's worth of visual information when called inside a running
   1.268 +   simulation."
   1.269 +  [#^Node creature #^Spatial eye & {skip :skip :or {skip 0}}]
   1.270 +  (let [retinal-map (retina-sensor-profile eye)
   1.271 +        camera (add-eye! creature eye)
   1.272 +        vision-image
   1.273 +        (atom
   1.274 +         (BufferedImage. (.getWidth camera)
   1.275 +                         (.getHeight camera)
   1.276 +                         BufferedImage/TYPE_BYTE_BINARY))
   1.277 +        register-eye!
   1.278 +        (runonce
   1.279 +         (fn [world]
   1.280 +           (add-camera!
   1.281 +            world camera
   1.282 +            (let [counter  (atom 0)]
   1.283 +              (fn [r fb bb bi]
   1.284 +                (if (zero? (rem (swap! counter inc) (inc skip)))
   1.285 +                  (reset! vision-image
   1.286 +                          (BufferedImage! r fb bb bi))))))))]
   1.287 +     (vec
   1.288 +      (map
   1.289 +       (fn [[key image]]
   1.290 +         (let [whites (white-coordinates image)
   1.291 +               topology (vec (collapse whites))
   1.292 +               sensitivity (sensitivity-presets key key)]
   1.293 +           (attached-viewport.
   1.294 +            (fn [world]
   1.295 +              (register-eye! world)
   1.296 +              (vector
   1.297 +               topology
   1.298 +               (vec 
   1.299 +                (for [[x y] whites]
   1.300 +                  (pixel-sense 
   1.301 +                   sensitivity
   1.302 +                   (.getRGB @vision-image x y))))))
   1.303 +            register-eye!)))
   1.304 +         retinal-map))))
          +    #+end_src
   1.305 +    #+end_listing
   1.306 +
   1.307 +    Note that since all of the functions generated by
   1.308 +    =vision-kernel= share the same =register-eye!= function, the
   1.309 +    eye is registered only once, the first time any of the
   1.310 +    functions from the returned list is called. Each of the
   1.311 +    functions returned by =vision-kernel= also allows access to
   1.312 +    the =ViewPort= through which it receives images.
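          +
          +    =runonce= is assumed here to behave like the following
          +    hypothetical helper: it wraps a function so that its body runs
          +    at most once, with later calls returning the cached result:
          +
          +    #+begin_src clojure
          +(defn runonce
          +  "Return a function that invokes f on its first call and
          +   thereafter returns the result of that first call."
          +  [f]
          +  (let [result (atom ::unran)]
          +    (fn [& args]
          +      (if (= @result ::unran)
          +        (reset! result (apply f args))
          +        @result))))
          +    #+end_src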
   1.313 +
   1.314 +    All the hard work has been done; all that remains is to apply
   1.315 +    =vision-kernel= to each eye in the creature and gather the results
   1.316 +    into one list of functions.
   1.317 +
   1.318 +
   1.319 +    #+caption: With =vision!=, =CORTEX= is already a fine simulation 
   1.320 +    #+caption: environment for experimenting with different types of 
   1.321 +    #+caption: eyes.
   1.322 +    #+name: vision!
   1.323 +    #+begin_listing clojure
          +    #+begin_src clojure
   1.324 +(defn vision!
   1.325 +  "Returns a list of functions, each of which returns visual sensory
   1.326 +   data when called inside a running simulation."
   1.327 +  [#^Node creature & {skip :skip :or {skip 0}}]
   1.328 +  (reduce
   1.329 +   concat 
   1.330 +   (for [eye (eyes creature)]
   1.331 +     (vision-kernel creature eye))))
          +    #+end_src
   1.332 +    #+end_listing
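          +
          +    As a hypothetical usage sketch, where =world= and =creature=
          +    stand for a running simulation and a loaded creature model, the
          +    vision functions are gathered once and then sampled each frame:
          +
          +    #+begin_src clojure
          +(comment
          +  (def vision-fns (vision! creature))
          +  ;; each call yields [topology sensor-values] for one channel:
          +  (def percepts (map (fn [v] (v world)) vision-fns)))
          +    #+end_src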
   1.333 +
   1.334 +
   1.335 +
   1.336 +
   1.337 +
   1.338  ** Hearing is hard; =CORTEX= does it right
   1.339  
   1.340  ** Touch uses hundreds of hair-like elements