changeset 401:7ee735a836da

incorporate thesis.
author Robert McIntyre <rlm@mit.edu>
date Sun, 16 Mar 2014 23:31:16 -0400
parents 6ba908c1a0a9
children a533a0038bd7
files thesis/images/cat-drinking.jpg thesis/images/finger-UV.png thesis/org/first-chapter.html thesis/org/first-chapter.org thesis/org/roadmap.org
diffstat 5 files changed, 882 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
     1.1 Binary file thesis/images/cat-drinking.jpg has changed
     2.1 Binary file thesis/images/finger-UV.png has changed
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/thesis/org/first-chapter.html	Sun Mar 16 23:31:16 2014 -0400
     3.3 @@ -0,0 +1,455 @@
     3.4 +<?xml version="1.0" encoding="utf-8"?>
     3.5 +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     3.6 +               "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
     3.7 +<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
     3.8 +<head>
     3.9 +<title><code>CORTEX</code></title>
    3.10 +<meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
    3.11 +<meta name="title" content="<code>CORTEX</code>"/>
    3.12 +<meta name="generator" content="Org-mode"/>
    3.13 +<meta name="generated" content="2013-11-07 04:21:29 EST"/>
    3.14 +<meta name="author" content="Robert McIntyre"/>
    3.15 +<meta name="description" content="Using embodied AI to facilitate Artificial Imagination."/>
    3.16 +<meta name="keywords" content="AI, clojure, embodiment"/>
    3.17 +<style type="text/css">
    3.18 + <!--/*--><![CDATA[/*><!--*/
    3.19 +  html { font-family: Times, serif; font-size: 12pt; }
    3.20 +  .title  { text-align: center; }
    3.21 +  .todo   { color: red; }
    3.22 +  .done   { color: green; }
    3.23 +  .tag    { background-color: #add8e6; font-weight:normal }
    3.24 +  .target { }
    3.25 +  .timestamp { color: #bebebe; }
    3.26 +  .timestamp-kwd { color: #5f9ea0; }
    3.27 +  .right  {margin-left:auto; margin-right:0px;  text-align:right;}
    3.28 +  .left   {margin-left:0px;  margin-right:auto; text-align:left;}
    3.29 +  .center {margin-left:auto; margin-right:auto; text-align:center;}
    3.30 +  p.verse { margin-left: 3% }
    3.31 +  pre {
    3.32 +	border: 1pt solid #AEBDCC;
    3.33 +	background-color: #F3F5F7;
    3.34 +	padding: 5pt;
    3.35 +	font-family: courier, monospace;
    3.36 +        font-size: 90%;
    3.37 +        overflow:auto;
    3.38 +  }
    3.39 +  table { border-collapse: collapse; }
    3.40 +  td, th { vertical-align: top;  }
    3.41 +  th.right  { text-align:center;  }
    3.42 +  th.left   { text-align:center;   }
    3.43 +  th.center { text-align:center; }
    3.44 +  td.right  { text-align:right;  }
    3.45 +  td.left   { text-align:left;   }
    3.46 +  td.center { text-align:center; }
    3.47 +  dt { font-weight: bold; }
    3.48 +  div.figure { padding: 0.5em; }
    3.49 +  div.figure p { text-align: center; }
    3.50 +  div.inlinetask {
    3.51 +    padding:10px;
    3.52 +    border:2px solid gray;
    3.53 +    margin:10px;
    3.54 +    background: #ffffcc;
    3.55 +  }
    3.56 +  textarea { overflow-x: auto; }
    3.57 +  .linenr { font-size:smaller }
    3.58 +  .code-highlighted {background-color:#ffff00;}
    3.59 +  .org-info-js_info-navigation { border-style:none; }
    3.60 +  #org-info-js_console-label { font-size:10px; font-weight:bold;
    3.61 +                               white-space:nowrap; }
    3.62 +  .org-info-js_search-highlight {background-color:#ffff00; color:#000000;
    3.63 +                                 font-weight:bold; }
    3.64 +  /*]]>*/-->
    3.65 +</style>
    3.66 +<script type="text/javascript">var _gaq = _gaq || [];_gaq.push(['_setAccount', 'UA-31261312-1']);_gaq.push(['_trackPageview']);(function() {var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);})();</script><link rel="stylesheet" type="text/css" href="../../aurellem/css/argentum.css" />
    3.67 +<script type="text/javascript">
    3.68 +<!--/*--><![CDATA[/*><!--*/
    3.69 + function CodeHighlightOn(elem, id)
    3.70 + {
    3.71 +   var target = document.getElementById(id);
    3.72 +   if(null != target) {
    3.73 +     elem.cacheClassElem = elem.className;
    3.74 +     elem.cacheClassTarget = target.className;
    3.75 +     target.className = "code-highlighted";
    3.76 +     elem.className   = "code-highlighted";
    3.77 +   }
    3.78 + }
    3.79 + function CodeHighlightOff(elem, id)
    3.80 + {
    3.81 +   var target = document.getElementById(id);
    3.82 +   if(elem.cacheClassElem)
    3.83 +     elem.className = elem.cacheClassElem;
    3.84 +   if(elem.cacheClassTarget)
    3.85 +     target.className = elem.cacheClassTarget;
    3.86 + }
    3.87 +/*]]>*///-->
    3.88 +</script>
    3.89 +
    3.90 +</head>
    3.91 +<body>
    3.92 +
    3.93 +
    3.94 +<div id="content">
    3.95 +<h1 class="title"><code>CORTEX</code></h1>
    3.96 +
    3.97 +
    3.98 +<div class="header">
    3.99 +  <div class="float-right">	
   3.100 +    <!-- 
   3.101 +    <form>
   3.102 +      <input type="text"/><input type="submit" value="search the blog &raquo;"/> 
   3.103 +    </form>
   3.104 +    -->
   3.105 +  </div>
   3.106 +
   3.107 +  <h1>aurellem <em>&#x2609;</em></h1>
   3.108 +  <ul class="nav">
   3.109 +    <li><a href="/">read the blog &raquo;</a></li>
   3.110 +    <!-- li><a href="#">learn about us &raquo;</a></li-->
   3.111 +  </ul>
   3.112 +</div>
   3.113 +
   3.114 +<div class="author">Written by <author>Robert McIntyre</author></div>
   3.115 +
   3.116 +
   3.117 +
   3.118 +
   3.119 +
   3.120 +
   3.121 +
   3.122 +<div id="outline-container-1" class="outline-2">
   3.123 +<h2 id="sec-1">Artificial Imagination</h2>
   3.124 +<div class="outline-text-2" id="text-1">
   3.125 +
   3.126 +
   3.127 +<p>
   3.128 +  Imagine watching a video of someone skateboarding. When you watch
   3.129 +  the video, you can imagine yourself skateboarding, and your
   3.130 +  knowledge of the human body and its dynamics guides your
   3.131 +  interpretation of the scene. For example, even if the skateboarder
   3.132 +  is partially occluded, you can infer the positions of his arms and
   3.133 +  body from your own knowledge of how your body would be positioned if
   3.134 +  you were skateboarding. If the skateboarder suffers an accident, you
   3.135 +  wince in sympathy, imagining the pain your own body would experience
   3.136 +  if it were in the same situation. This empathy with other people
   3.137 +  guides our understanding of whatever they are doing because it is a
   3.138 +  powerful constraint on what is probable and possible. In order to
   3.139 +  make use of this powerful empathy constraint, I need a system that
   3.140 +  can generate and make sense of sensory data from the many different
    3.141 +  senses that humans possess. The two key properties of such a system
   3.142 +  are <i>embodiment</i> and <i>imagination</i>.
   3.143 +</p>
   3.144 +
   3.145 +</div>
   3.146 +
   3.147 +<div id="outline-container-1-1" class="outline-3">
   3.148 +<h3 id="sec-1-1">What is imagination?</h3>
   3.149 +<div class="outline-text-3" id="text-1-1">
   3.150 +
   3.151 +
   3.152 +<p>
   3.153 +   One kind of imagination is <i>sympathetic</i> imagination: you imagine
   3.154 +   yourself in the position of something/someone you are
   3.155 +   observing. This type of imagination comes into play when you follow
   3.156 +   along visually when watching someone perform actions, or when you
   3.157 +   sympathetically grimace when someone hurts themselves. This type of
   3.158 +   imagination uses the constraints you have learned about your own
   3.159 +   body to highly constrain the possibilities in whatever you are
    3.160 +   seeing. It uses all of your senses, including your senses of touch,
   3.161 +   proprioception, etc. Humans are flexible when it comes to "putting
   3.162 +   themselves in another's shoes," and can sympathetically understand
    3.163 +   not only other humans, but entities ranging from animals to cartoon
   3.164 +   characters to <a href="http://www.youtube.com/watch?v=0jz4HcwTQmU">single dots</a> on a screen!
   3.165 +</p>
   3.166 +<p>
   3.167 +   Another kind of imagination is <i>predictive</i> imagination: you
   3.168 +   construct scenes in your mind that are not entirely related to
   3.169 +   whatever you are observing, but instead are predictions of the
   3.170 +   future or simply flights of fancy. You use this type of imagination
   3.171 +   to plan out multi-step actions, or play out dangerous situations in
   3.172 +   your mind so as to avoid messing them up in reality.
   3.173 +</p>
   3.174 +<p>
   3.175 +   Of course, sympathetic and predictive imagination blend into each
   3.176 +   other and are not completely separate concepts. One dimension along
   3.177 +   which you can distinguish types of imagination is dependence on raw
   3.178 +   sense data. Sympathetic imagination is highly constrained by your
   3.179 +   senses, while predictive imagination can be more or less dependent
   3.180 +   on your senses depending on how far ahead you imagine. Daydreaming
   3.181 +   is an extreme form of predictive imagination that wanders through
   3.182 +   different possibilities without concern for whether they are
   3.183 +   related to whatever is happening in reality.
   3.184 +</p>
   3.185 +<p>
   3.186 +   For this thesis, I will mostly focus on sympathetic imagination and
   3.187 +   the constraint it provides for understanding sensory data.
   3.188 +</p>
   3.189 +</div>
   3.190 +
   3.191 +</div>
   3.192 +
   3.193 +<div id="outline-container-1-2" class="outline-3">
   3.194 +<h3 id="sec-1-2">What problems can imagination solve?</h3>
   3.195 +<div class="outline-text-3" id="text-1-2">
   3.196 +
   3.197 +
   3.198 +<p>
   3.199 +   Consider a video of a cat drinking some water.
   3.200 +</p>
   3.201 +
   3.202 +<div class="figure">
   3.203 +<p><img src="../images/cat-drinking.jpg"  alt="../images/cat-drinking.jpg" /></p>
   3.204 +<p>A cat drinking some water. Identifying this action is beyond the state of the art for computers.</p>
   3.205 +</div>
   3.206 +
   3.207 +<p>
   3.208 +   It is currently impossible for any computer program to reliably
    3.209 +   label such a video as "drinking". I think humans are able to label
    3.210 +   such a video as "drinking" because they imagine <i>themselves</i> as the
   3.211 +   cat, and imagine putting their face up against a stream of water
   3.212 +   and sticking out their tongue. In that imagined world, they can
   3.213 +   feel the cool water hitting their tongue, and feel the water
   3.214 +   entering their body, and are able to recognize that <i>feeling</i> as
   3.215 +   drinking. So, the label of the action is not really in the pixels
   3.216 +   of the image, but is found clearly in a simulation inspired by
   3.217 +   those pixels. An imaginative system, having been trained on
   3.218 +   drinking and non-drinking examples and learning that the most
   3.219 +   important component of drinking is the feeling of water sliding
   3.220 +   down one's throat, would analyze a video of a cat drinking in the
   3.221 +   following manner:
   3.222 +</p>
   3.223 +<ul>
   3.224 +<li>Create a physical model of the video by putting a "fuzzy" model
   3.225 +     of its own body in place of the cat. Also, create a simulation of
   3.226 +     the stream of water.
   3.227 +
   3.228 +</li>
   3.229 +<li>Play out this simulated scene and generate imagined sensory
   3.230 +     experience. This will include relevant muscle contractions, a
   3.231 +     close up view of the stream from the cat's perspective, and most
   3.232 +     importantly, the imagined feeling of water entering the mouth.
   3.233 +
   3.234 +</li>
   3.235 +<li>The action is now easily identified as drinking by the sense of
   3.236 +     taste alone. The other senses (such as the tongue moving in and
   3.237 +     out) help to give plausibility to the simulated action. Note that
   3.238 +     the sense of vision, while critical in creating the simulation,
   3.239 +     is not critical for identifying the action from the simulation.
   3.240 +</li>
   3.241 +</ul>
   3.242 +
   3.243 +
   3.244 +<p>
   3.245 +   More generally, I expect imaginative systems to be particularly
   3.246 +   good at identifying embodied actions in videos.
   3.247 +</p>
   3.248 +</div>
   3.249 +</div>
   3.250 +
   3.251 +</div>
   3.252 +
   3.253 +<div id="outline-container-2" class="outline-2">
   3.254 +<h2 id="sec-2">Cortex</h2>
   3.255 +<div class="outline-text-2" id="text-2">
   3.256 +
   3.257 +
   3.258 +<p>
   3.259 +  The previous example involves liquids, the sense of taste, and
   3.260 +  imagining oneself as a cat. For this thesis I constrain myself to
   3.261 +  simpler, more easily digitizable senses and situations.
   3.262 +</p>
   3.263 +<p>
    3.264 +  My system, <code>Cortex</code>, performs imagination in two different simplified
   3.265 +  worlds: <i>worm world</i> and <i>stick figure world</i>. In each of these
   3.266 +  worlds, entities capable of imagination recognize actions by
   3.267 +  simulating the experience from their own perspective, and then
   3.268 +  recognizing the action from a database of examples.
   3.269 +</p>
   3.270 +<p>
   3.271 +  In order to serve as a framework for experiments in imagination,
   3.272 +  <code>Cortex</code> requires simulated bodies, worlds, and senses like vision,
   3.273 +  hearing, touch, proprioception, etc.
   3.274 +</p>
   3.275 +
   3.276 +</div>
   3.277 +
   3.278 +<div id="outline-container-2-1" class="outline-3">
   3.279 +<h3 id="sec-2-1">A Video Game Engine takes care of some of the groundwork</h3>
   3.280 +<div class="outline-text-3" id="text-2-1">
   3.281 +
   3.282 +
   3.283 +<p>
   3.284 +   When it comes to simulation environments, the engines used to
   3.285 +   create the worlds in video games offer top-notch physics and
   3.286 +   graphics support. These engines also have limited support for
   3.287 +   creating cameras and rendering 3D sound, which can be repurposed
   3.288 +   for vision and hearing respectively. Physics collision detection
   3.289 +   can be expanded to create a sense of touch.
   3.290 +</p>
   3.291 +<p>   
   3.292 +   jMonkeyEngine3 is one such engine for creating video games in
    3.293 +   Java. It uses OpenGL to render to the screen and uses scene graphs
   3.294 +   to avoid drawing things that do not appear on the screen. It has an
   3.295 +   active community and several games in the pipeline. The engine was
   3.296 +   not built to serve any particular game but is instead meant to be
    3.297 +   used for any 3D game. I chose jMonkeyEngine3 because it had the
   3.298 +   most features out of all the open projects I looked at, and because
    3.299 +   I could then write my code in Clojure, a dialect of Lisp
   3.300 +   that runs on the JVM.
   3.301 +</p>
   3.302 +</div>
   3.303 +
   3.304 +</div>
   3.305 +
   3.306 +<div id="outline-container-2-2" class="outline-3">
   3.307 +<h3 id="sec-2-2"><code>CORTEX</code> Extends jMonkeyEngine3 to implement rich senses</h3>
   3.308 +<div class="outline-text-3" id="text-2-2">
   3.309 +
   3.310 +
   3.311 +<p>
   3.312 +   Using the game-making primitives provided by jMonkeyEngine3, I have
   3.313 +   constructed every major human sense except for smell and
   3.314 +   taste. <code>Cortex</code> also provides an interface for creating creatures
   3.315 +   in Blender, a 3D modeling environment, and then "rigging" the
   3.316 +   creatures with senses using 3D annotations in Blender. A creature
   3.317 +   can have any number of senses, and there can be any number of
   3.318 +   creatures in a simulation.
   3.319 +</p>
   3.320 +<p>   
   3.321 +   The senses available in <code>Cortex</code> are:
   3.322 +</p>
   3.323 +<ul>
   3.324 +<li><a href="../../cortex/html/vision.html">Vision</a>
   3.325 +</li>
   3.326 +<li><a href="../../cortex/html/hearing.html">Hearing</a>
   3.327 +</li>
   3.328 +<li><a href="../../cortex/html/touch.html">Touch</a>
   3.329 +</li>
   3.330 +<li><a href="../../cortex/html/proprioception.html">Proprioception</a>
   3.331 +</li>
   3.332 +<li><a href="../../cortex/html/movement.html">Muscle Tension</a>
   3.333 +</li>
   3.334 +</ul>
   3.335 +
   3.336 +
   3.337 +</div>
   3.338 +</div>
   3.339 +
   3.340 +</div>
   3.341 +
   3.342 +<div id="outline-container-3" class="outline-2">
   3.343 +<h2 id="sec-3">A roadmap for <code>Cortex</code> experiments</h2>
   3.344 +<div class="outline-text-2" id="text-3">
   3.345 +
   3.346 +
   3.347 +
   3.348 +</div>
   3.349 +
   3.350 +<div id="outline-container-3-1" class="outline-3">
   3.351 +<h3 id="sec-3-1">Worm World</h3>
   3.352 +<div class="outline-text-3" id="text-3-1">
   3.353 +
   3.354 +
   3.355 +<p>
   3.356 +   Worms in <code>Cortex</code> are segmented creatures which vary in length and
   3.357 +   number of segments, and have the senses of vision, proprioception,
   3.358 +   touch, and muscle tension.
   3.359 +</p>
   3.360 +
   3.361 +<div class="figure">
   3.362 +<p><img src="../images/finger-UV.png" width=755 alt="../images/finger-UV.png" /></p>
   3.363 +<p>This is the tactile-sensor-profile for the upper segment of a worm. It defines regions of high touch sensitivity (where there are many white pixels) and regions of low sensitivity (where white pixels are sparse).</p>
   3.364 +</div>
   3.365 +
   3.366 +
   3.367 +
   3.368 +
   3.369 +<div class="figure">
   3.370 +  <center>
   3.371 +    <video controls="controls" width="550">
   3.372 +      <source src="../video/worm-touch.ogg" type="video/ogg"
   3.373 +              preload="none" />
   3.374 +    </video>
   3.375 +    <br> <a href="http://youtu.be/RHx2wqzNVcU"> YouTube </a>
   3.376 +  </center>
   3.377 +  <p>The worm responds to touch.</p>
   3.378 +</div>
   3.379 +
   3.380 +<div class="figure">
   3.381 +  <center>
   3.382 +    <video controls="controls" width="550">
   3.383 +      <source src="../video/test-proprioception.ogg" type="video/ogg"
   3.384 +              preload="none" />
   3.385 +    </video>
   3.386 +    <br> <a href="http://youtu.be/JjdDmyM8b0w"> YouTube </a>
   3.387 +  </center>
   3.388 +  <p>Proprioception in a worm. The proprioceptive readout is
   3.389 +    in the upper left corner of the screen.</p>
   3.390 +</div>
   3.391 +
   3.392 +<p>
   3.393 +   A worm is trained in various actions such as sinusoidal movement,
   3.394 +   curling, flailing, and spinning by directly playing motor
   3.395 +   contractions while the worm "feels" the experience. These actions
   3.396 +   are recorded both as vectors of muscle tension, touch, and
    3.397 +   proprioceptive data, and in higher-level forms such as
   3.398 +   frequencies of the various contractions and a symbolic name for the
   3.399 +   action.
   3.400 +</p>
   3.401 +<p>
   3.402 +   Then, the worm watches a video of another worm performing one of
   3.403 +   the actions, and must judge which action was performed. Normally
   3.404 +   this would be an extremely difficult problem, but the worm is able
   3.405 +   to greatly diminish the search space through sympathetic
   3.406 +   imagination. First, it creates an imagined copy of its body which
   3.407 +   it observes from a third person point of view. Then for each frame
   3.408 +   of the video, it maneuvers its simulated body to be in registration
   3.409 +   with the worm depicted in the video. The physical constraints
   3.410 +   imposed by the physics simulation greatly decrease the number of
   3.411 +   poses that have to be tried, making the search feasible. As the
   3.412 +   imaginary worm moves, it generates imaginary muscle tension and
   3.413 +   proprioceptive sensations. The worm determines the action not by
   3.414 +   vision, but by matching the imagined proprioceptive data with
   3.415 +   previous examples.
   3.416 +</p>
   3.417 +<p>
   3.418 +   By using non-visual sensory data such as touch, the worms can also
    3.419 +   answer body-related questions such as "did your head touch your
   3.420 +   tail?" and "did worm A touch worm B?"
   3.421 +</p>
   3.422 +<p>
   3.423 +   The proprioceptive information used for action identification is
   3.424 +   body-centric, so only the registration step is dependent on point
   3.425 +   of view, not the identification step. Registration is not specific
   3.426 +   to any particular action. Thus, action identification can be
    3.427 +   divided into a point-of-view-dependent generic registration step
    3.428 +   and an action-specific step that is body-centered and invariant to
   3.429 +   point of view.
   3.430 +</p>
   3.431 +</div>
   3.432 +
   3.433 +</div>
   3.434 +
   3.435 +<div id="outline-container-3-2" class="outline-3">
   3.436 +<h3 id="sec-3-2">Stick Figure World</h3>
   3.437 +<div class="outline-text-3" id="text-3-2">
   3.438 +
   3.439 +
   3.440 +<p>
   3.441 +   This environment is similar to Worm World, except the creatures are
   3.442 +   more complicated and the actions and questions more varied. It is
   3.443 +   an experiment to see how far imagination can go in interpreting
   3.444 +   actions.  
   3.445 +</p></div>
   3.446 +</div>
   3.447 +</div>
   3.448 +</div>
   3.449 +
   3.450 +<div id="postamble">
   3.451 +<p class="date">Date: 2013-11-07 04:21:29 EST</p>
   3.452 +<p class="author">Author: Robert McIntyre</p>
   3.453 +<p class="creator">Org version 7.7 with Emacs version 24</p>
   3.454 +<a href="http://validator.w3.org/check?uri=referer">Validate XHTML 1.0</a>
   3.455 +
   3.456 +</div>
   3.457 +</body>
   3.458 +</html>
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/thesis/org/first-chapter.org	Sun Mar 16 23:31:16 2014 -0400
     4.3 @@ -0,0 +1,238 @@
     4.4 +#+title: =CORTEX=
     4.5 +#+author: Robert McIntyre
     4.6 +#+email: rlm@mit.edu
     4.7 +#+description: Using embodied AI to facilitate Artificial Imagination.
     4.8 +#+keywords: AI, clojure, embodiment
     4.9 +#+SETUPFILE: ../../aurellem/org/setup.org
    4.10 +#+INCLUDE: ../../aurellem/org/level-0.org
    4.11 +#+babel: :mkdirp yes :noweb yes :exports both
    4.12 +#+OPTIONS: toc:nil, num:nil
    4.13 +
    4.14 +* Artificial Imagination
    4.15 +
    4.16 +  Imagine watching a video of someone skateboarding. When you watch
    4.17 +  the video, you can imagine yourself skateboarding, and your
    4.18 +  knowledge of the human body and its dynamics guides your
    4.19 +  interpretation of the scene. For example, even if the skateboarder
    4.20 +  is partially occluded, you can infer the positions of his arms and
    4.21 +  body from your own knowledge of how your body would be positioned if
    4.22 +  you were skateboarding. If the skateboarder suffers an accident, you
    4.23 +  wince in sympathy, imagining the pain your own body would experience
    4.24 +  if it were in the same situation. This empathy with other people
    4.25 +  guides our understanding of whatever they are doing because it is a
    4.26 +  powerful constraint on what is probable and possible. In order to
    4.27 +  make use of this powerful empathy constraint, I need a system that
    4.28 +  can generate and make sense of sensory data from the many different
     4.29 +  senses that humans possess. The two key properties of such a system
    4.30 +  are /embodiment/ and /imagination/.
    4.31 +
    4.32 +** What is imagination?
    4.33 +
    4.34 +   One kind of imagination is /sympathetic/ imagination: you imagine
    4.35 +   yourself in the position of something/someone you are
    4.36 +   observing. This type of imagination comes into play when you follow
    4.37 +   along visually when watching someone perform actions, or when you
    4.38 +   sympathetically grimace when someone hurts themselves. This type of
    4.39 +   imagination uses the constraints you have learned about your own
    4.40 +   body to highly constrain the possibilities in whatever you are
     4.41 +   seeing. It uses all of your senses, including your senses of touch,
    4.42 +   proprioception, etc. Humans are flexible when it comes to "putting
    4.43 +   themselves in another's shoes," and can sympathetically understand
    4.44 +   not only other humans, but entities ranging from animals to cartoon
    4.45 +   characters to [[http://www.youtube.com/watch?v=0jz4HcwTQmU][single dots]] on a screen!
    4.46 +
    4.47 +   Another kind of imagination is /predictive/ imagination: you
    4.48 +   construct scenes in your mind that are not entirely related to
    4.49 +   whatever you are observing, but instead are predictions of the
    4.50 +   future or simply flights of fancy. You use this type of imagination
    4.51 +   to plan out multi-step actions, or play out dangerous situations in
    4.52 +   your mind so as to avoid messing them up in reality.
    4.53 +
    4.54 +   Of course, sympathetic and predictive imagination blend into each
    4.55 +   other and are not completely separate concepts. One dimension along
    4.56 +   which you can distinguish types of imagination is dependence on raw
    4.57 +   sense data. Sympathetic imagination is highly constrained by your
    4.58 +   senses, while predictive imagination can be more or less dependent
    4.59 +   on your senses depending on how far ahead you imagine. Daydreaming
    4.60 +   is an extreme form of predictive imagination that wanders through
    4.61 +   different possibilities without concern for whether they are
    4.62 +   related to whatever is happening in reality.
    4.63 +
    4.64 +   For this thesis, I will mostly focus on sympathetic imagination and
    4.65 +   the constraint it provides for understanding sensory data.
    4.66 +   
    4.67 +** What problems can imagination solve?
    4.68 +
    4.69 +   Consider a video of a cat drinking some water.
    4.70 +
    4.71 +   #+caption: A cat drinking some water. Identifying this action is beyond the state of the art for computers.
    4.72 +   #+ATTR_LaTeX: width=5cm
    4.73 +   [[../images/cat-drinking.jpg]]
    4.74 +
    4.75 +   It is currently impossible for any computer program to reliably
     4.76 +   label such a video as "drinking". I think humans are able to label
     4.77 +   such a video as "drinking" because they imagine /themselves/ as the
    4.78 +   cat, and imagine putting their face up against a stream of water
    4.79 +   and sticking out their tongue. In that imagined world, they can
    4.80 +   feel the cool water hitting their tongue, and feel the water
    4.81 +   entering their body, and are able to recognize that /feeling/ as
    4.82 +   drinking. So, the label of the action is not really in the pixels
    4.83 +   of the image, but is found clearly in a simulation inspired by
    4.84 +   those pixels. An imaginative system, having been trained on
    4.85 +   drinking and non-drinking examples and learning that the most
    4.86 +   important component of drinking is the feeling of water sliding
    4.87 +   down one's throat, would analyze a video of a cat drinking in the
    4.88 +   following manner:
    4.89 +   
    4.90 +   - Create a physical model of the video by putting a "fuzzy" model
    4.91 +     of its own body in place of the cat. Also, create a simulation of
    4.92 +     the stream of water.
    4.93 +
    4.94 +   - Play out this simulated scene and generate imagined sensory
    4.95 +     experience. This will include relevant muscle contractions, a
    4.96 +     close up view of the stream from the cat's perspective, and most
    4.97 +     importantly, the imagined feeling of water entering the mouth.
    4.98 +
    4.99 +   - The action is now easily identified as drinking by the sense of
   4.100 +     taste alone. The other senses (such as the tongue moving in and
   4.101 +     out) help to give plausibility to the simulated action. Note that
   4.102 +     the sense of vision, while critical in creating the simulation,
   4.103 +     is not critical for identifying the action from the simulation.
   4.104 +
   4.105 +   More generally, I expect imaginative systems to be particularly
   4.106 +   good at identifying embodied actions in videos.
   4.107 +
   4.108 +* Cortex
   4.109 +
   4.110 +  The previous example involves liquids, the sense of taste, and
   4.111 +  imagining oneself as a cat. For this thesis I constrain myself to
   4.112 +  simpler, more easily digitizable senses and situations.
   4.113 +
    4.114 +  My system, =CORTEX=, performs imagination in two different simplified
   4.115 +  worlds: /worm world/ and /stick-figure world/. In each of these
   4.116 +  worlds, entities capable of imagination recognize actions by
   4.117 +  simulating the experience from their own perspective, and then
   4.118 +  recognizing the action from a database of examples.
   4.119 +
   4.120 +  In order to serve as a framework for experiments in imagination,
   4.121 +  =CORTEX= requires simulated bodies, worlds, and senses like vision,
   4.122 +  hearing, touch, proprioception, etc.
   4.123 +
   4.124 +** A Video Game Engine takes care of some of the groundwork
   4.125 +
   4.126 +   When it comes to simulation environments, the engines used to
   4.127 +   create the worlds in video games offer top-notch physics and
   4.128 +   graphics support. These engines also have limited support for
   4.129 +   creating cameras and rendering 3D sound, which can be repurposed
   4.130 +   for vision and hearing respectively. Physics collision detection
   4.131 +   can be expanded to create a sense of touch.
   4.132 +   
   4.133 +   jMonkeyEngine3 is one such engine for creating video games in
    4.134 +   Java. It uses OpenGL to render to the screen and uses scene graphs
   4.135 +   to avoid drawing things that do not appear on the screen. It has an
   4.136 +   active community and several games in the pipeline. The engine was
   4.137 +   not built to serve any particular game but is instead meant to be
    4.138 +   used for any 3D game. I chose jMonkeyEngine3 because it had the
   4.139 +   most features out of all the open projects I looked at, and because
    4.140 +   I could then write my code in Clojure, a dialect of Lisp
   4.141 +   that runs on the JVM.
   4.142 +
   4.143 +** =CORTEX= Extends jMonkeyEngine3 to implement rich senses
   4.144 +
   4.145 +   Using the game-making primitives provided by jMonkeyEngine3, I have
   4.146 +   constructed every major human sense except for smell and
   4.147 +   taste. =CORTEX= also provides an interface for creating creatures
   4.148 +   in Blender, a 3D modeling environment, and then "rigging" the
   4.149 +   creatures with senses using 3D annotations in Blender. A creature
   4.150 +   can have any number of senses, and there can be any number of
   4.151 +   creatures in a simulation.
   4.152 +   
   4.153 +   The senses available in =CORTEX= are:
   4.154 +
   4.155 +   - [[../../cortex/html/vision.html][Vision]]
   4.156 +   - [[../../cortex/html/hearing.html][Hearing]]
   4.157 +   - [[../../cortex/html/touch.html][Touch]]
   4.158 +   - [[../../cortex/html/proprioception.html][Proprioception]]
   4.159 +   - [[../../cortex/html/movement.html][Muscle Tension]]
   4.160 +
   4.161 +* A roadmap for =CORTEX= experiments
   4.162 +
   4.163 +** Worm World
   4.164 +
   4.165 +   Worms in =CORTEX= are segmented creatures which vary in length and
   4.166 +   number of segments, and have the senses of vision, proprioception,
   4.167 +   touch, and muscle tension.
   4.168 +
   4.169 +#+attr_html: width=755
   4.170 +#+caption: This is the tactile-sensor-profile for the upper segment of a worm. It defines regions of high touch sensitivity (where there are many white pixels) and regions of low sensitivity (where white pixels are sparse).
   4.171 +[[../images/finger-UV.png]]
   4.172 +
   4.173 +
   4.174 +#+begin_html
   4.175 +<div class="figure">
   4.176 +  <center>
   4.177 +    <video controls="controls" width="550">
   4.178 +      <source src="../video/worm-touch.ogg" type="video/ogg"
   4.179 +	      preload="none" />
   4.180 +    </video>
   4.181 +    <br> <a href="http://youtu.be/RHx2wqzNVcU"> YouTube </a>
   4.182 +  </center>
   4.183 +  <p>The worm responds to touch.</p>
   4.184 +</div>
   4.185 +#+end_html
   4.186 +
   4.187 +#+begin_html
   4.188 +<div class="figure">
   4.189 +  <center>
   4.190 +    <video controls="controls" width="550">
   4.191 +      <source src="../video/test-proprioception.ogg" type="video/ogg"
   4.192 +	      preload="none" />
   4.193 +    </video>
   4.194 +    <br> <a href="http://youtu.be/JjdDmyM8b0w"> YouTube </a>
   4.195 +  </center>
   4.196 +  <p>Proprioception in a worm. The proprioceptive readout is
   4.197 +    in the upper left corner of the screen.</p>
   4.198 +</div>
   4.199 +#+end_html
   4.200 +
   4.201 +   A worm is trained in various actions such as sinusoidal movement,
   4.202 +   curling, flailing, and spinning by directly playing motor
   4.203 +   contractions while the worm "feels" the experience. These actions
   4.204 +   are recorded both as vectors of muscle tension, touch, and
    4.205 +   proprioceptive data, and in higher-level forms such as
   4.206 +   frequencies of the various contractions and a symbolic name for the
   4.207 +   action.
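
   A minimal sketch of how one such recording might be represented,
   assuming hypothetical field names and invented values rather than the
   actual =CORTEX= data format:

   #+begin_src clojure
   ;; One hypothetical training example: a symbolic label for the action
   ;; together with the sensory traces recorded while it was played out
   ;; (one vector per frame). All names and numbers are illustrative.
   (def example-action
     {:label          :curl
      :proprioception [[0.0 0.1 0.3] [0.1 0.2 0.5] [0.2 0.4 0.8]]
      :muscle         [[0.2 0.0 0.9] [0.3 0.1 0.8] [0.5 0.2 0.6]]
      :touch          [[0 0 1 1 0]   [0 1 1 1 0]   [1 1 1 0 0]]
      :frequencies    [0.5 1.0 0.25]}) ; contraction frequencies, one per muscle
   #+end_src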
   4.208 +
   4.209 +   Then, the worm watches a video of another worm performing one of
   4.210 +   the actions, and must judge which action was performed. Normally
   4.211 +   this would be an extremely difficult problem, but the worm is able
   4.212 +   to greatly diminish the search space through sympathetic
   4.213 +   imagination. First, it creates an imagined copy of its body which
   4.214 +   it observes from a third person point of view. Then for each frame
   4.215 +   of the video, it maneuvers its simulated body to be in registration
   4.216 +   with the worm depicted in the video. The physical constraints
   4.217 +   imposed by the physics simulation greatly decrease the number of
   4.218 +   poses that have to be tried, making the search feasible. As the
   4.219 +   imaginary worm moves, it generates imaginary muscle tension and
   4.220 +   proprioceptive sensations. The worm determines the action not by
   4.221 +   vision, but by matching the imagined proprioceptive data with
   4.222 +   previous examples.
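
   A minimal sketch of this matching step, assuming each stored example
   has already been reduced to one representative proprioceptive frame (a
   flat vector of joint angles) and using a normalized dot product as the
   similarity measure; this is an illustration, not the actual =CORTEX=
   matching code:

   #+begin_src clojure
   (defn dot [a b] (reduce + (map * a b)))

   (defn cosine-similarity [a b]
     (/ (dot a b)
        (max 1e-9 (* (Math/sqrt (dot a a)) (Math/sqrt (dot b b))))))

   (defn best-match
     "Return the label of the stored example whose representative
      proprioceptive frame is most similar to the imagined frame."
     [examples imagined-frame]
     (:label (apply max-key
                    #(cosine-similarity (:proprioception %) imagined-frame)
                    examples)))

   ;; Example with invented numbers:
   (best-match [{:label :curl   :proprioception [0.9 0.8 0.7]}
                {:label :wiggle :proprioception [0.1 -0.2 0.1]}]
               [0.8 0.9 0.6])
   ;; => :curl
   #+end_src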
   4.223 +
   4.224 +   By using non-visual sensory data such as touch, the worms can also
    4.225 +   answer body-related questions such as "did your head touch your
   4.226 +   tail?" and "did worm A touch worm B?"
   4.227 +
   4.228 +   The proprioceptive information used for action identification is
   4.229 +   body-centric, so only the registration step is dependent on point
   4.230 +   of view, not the identification step. Registration is not specific
   4.231 +   to any particular action. Thus, action identification can be
    4.232 +   divided into a point-of-view-dependent generic registration step
    4.233 +   and an action-specific step that is body-centered and invariant to
   4.234 +   point of view.
   4.235 +
   4.236 +** Stick Figure World
   4.237 +
   4.238 +   This environment is similar to Worm World, except the creatures are
   4.239 +   more complicated and the actions and questions more varied. It is
   4.240 +   an experiment to see how far imagination can go in interpreting
   4.241 +   actions.  
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/thesis/org/roadmap.org	Sun Mar 16 23:31:16 2014 -0400
     5.3 @@ -0,0 +1,189 @@
     5.4 +In order for this to be a reasonable thesis that I can be proud of,
      5.5 +what is the /minimum/ number of things I need to get done?
     5.6 +
     5.7 +
     5.8 +* worm OR hand registration
     5.9 +  - training from a few examples (2 to start out)
    5.10 +  - aligning the body with the scene
    5.11 +  - generating sensory data
     5.12 +  - matching previously labeled examples using dot-products or some
    5.13 +    other basic thing
    5.14 +  - showing that it works with different views
    5.15 +
    5.16 +* first draft
    5.17 +  - draft of thesis without bibliography or formatting
    5.18 +  - should have basic experiment and have full description of
    5.19 +    framework with code
    5.20 +  - review with Winston
    5.21 +  
    5.22 +* final draft
    5.23 +  - implement stretch goals from Winston if possible
    5.24 +  - complete final formatting and submit
    5.25 +
    5.26 +
    5.27 +
    5.28 +
    5.29 +* CORTEX
    5.30 +  DEADLINE: <2014-05-09 Fri>
    5.31 +  SHIT THAT'S IN 67 DAYS!!!
    5.32 +
    5.33 +** TODO program simple feature matching code for the worm's segments
    5.34 +   DEADLINE: <2014-03-11 Tue>
    5.35 +Subgoals:
    5.36 +*** DONE Get cortex working again, run tests, no jmonkeyengine updates
    5.37 +    CLOSED: [2014-03-03 Mon 22:07] SCHEDULED: <2014-03-03 Mon>
    5.38 +*** DONE get blender working again
    5.39 +    CLOSED: [2014-03-03 Mon 22:43] SCHEDULED: <2014-03-03 Mon>
     5.40 +*** DONE make sparse touch worm segment in blender
    5.41 +    CLOSED: [2014-03-03 Mon 23:16] SCHEDULED: <2014-03-03 Mon>
    5.42 +    CLOCK: [2014-03-03 Mon 22:44]--[2014-03-03 Mon 23:16] =>  0:32
    5.43 +*** DONE make multi-segment touch worm with touch sensors and display
    5.44 +    CLOSED: [2014-03-03 Mon 23:54] SCHEDULED: <2014-03-03 Mon>
    5.45 +    CLOCK: [2014-03-03 Mon 23:17]--[2014-03-03 Mon 23:54] =>  0:37
    5.46 +    
    5.47 +
    5.48 +*** DONE Make a worm wiggle and curl
    5.49 +    CLOSED: [2014-03-04 Tue 23:03] SCHEDULED: <2014-03-04 Tue>
    5.50 +*** TODO work on alignment for the worm (can "cheat")
    5.51 +    SCHEDULED: <2014-03-05 Wed>
    5.52 +
    5.53 +** First draft
    5.54 +   DEADLINE: <2014-03-14 Fri>
    5.55 +Subgoals:
    5.56 +*** Writeup new worm experiments.
    5.57 +*** Triage implementation code and get it into chapter form.
    5.58 +
    5.59 +
    5.60 +
    5.61 + 
    5.62 +
    5.63 +** for today
    5.64 +
    5.65 +- guided worm :: control the worm with the keyboard. Useful for
    5.66 +                 testing the body-centered recog scripts, and for
    5.67 +                 preparing a cool demo video.
    5.68 +
    5.69 +- body-centered recognition :: detect actions using hard coded
    5.70 +     body-centered scripts. 
    5.71 +
    5.72 +- cool demo video of the worm being moved and recognizing things ::
    5.73 +     will be a neat part of the thesis.
    5.74 +
    5.75 +- thesis export :: refactoring and organization of code so that it
    5.76 +                   spits out a thesis in addition to the web page.
    5.77 +
    5.78 +- video alignment :: analyze the frames of a video in order to align
    5.79 +     the worm. Requires body-centered recognition. Can "cheat".
    5.80 +
    5.81 +- smoother actions :: use debugging controls to directly influence the
     5.82 +     demo actions, and to generate recognition procedures.
    5.83 +
    5.84 +- degenerate video demonstration :: show the system recognizing a
    5.85 +     curled worm from dead on. Crowning achievement of thesis.
    5.86 +
    5.87 +** Ordered from easiest to hardest
    5.88 +
    5.89 +Just report the positions of everything. I don't think that this
     5.90 +necessarily shows anything useful.
    5.91 +
    5.92 +Worm-segment vision -- you initialize a view of the worm, but instead
    5.93 +of pixels you use labels via ray tracing. Has the advantage of still
    5.94 +allowing for visual occlusion, but reliably identifies the objects,
    5.95 +even without rainbow coloring. You can code this as an image. 
    5.96 +
    5.97 +Same as above, except just with worm/non-worm labels.
    5.98 +
     5.99 +Color-code each worm segment and then recognize them using blob
   5.100 +detectors. Then you solve for the perspective and the action
   5.101 +simultaneously.
   5.102 +
    5.103 +The entire worm can be colored the same high-contrast color against a
   5.104 +nearly black background.
   5.105 +
   5.106 +"Rooted" vision. You give the exact coordinates of ONE piece of the
   5.107 +worm, but the algorithm figures out the rest.
   5.108 +
    5.109 +More rooted vision -- start off the entire worm with one position.
   5.110 +
   5.111 +The right way to do alignment is to use motion over multiple frames to
    5.112 +snap individual pieces of the model into place, sharing and
    5.113 +propagating the individual alignments over the whole model. We also
   5.114 +want to limit the alignment search to just those actions we are
   5.115 +prepared to identify. This might mean that I need some small "micro
   5.116 +actions" such as the individual movements of the worm pieces.
   5.117 +
   5.118 +Get just the centers of each segment projected onto the imaging
   5.119 +plane. (best so far).
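
A minimal sketch of projecting segment centers onto the imaging plane,
assuming a simple pinhole camera at the origin looking down +z with
focal length f (the simulator's actual camera model may differ):

#+begin_src clojure
;; Pinhole projection of worm-segment centers onto the imaging plane.
(defn project-point [f [x y z]]
  [(/ (* f x) z) (/ (* f y) z)])

(defn project-segment-centers [f centers]
  (mapv (partial project-point f) centers))

;; Example with invented coordinates:
(project-segment-centers 1.0 [[0.5 0.2 2.0] [1.2 0.4 4.0]])
;; => [[0.25 0.1] [0.3 0.1]]
#+end_src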
   5.120 +
   5.121 +
   5.122 +Repertoire of actions  +  video frames -->
   5.123 +   directed multi-frame-search alg
   5.124 +
   5.125 +
   5.126 +
   5.127 +
   5.128 +
   5.129 +
   5.130 +!! Could also have a bounding box around the worm provided by
   5.131 +filtering the worm/non-worm render, and use bbbgs. As a bonus, I get
    5.132 +to include bbbgs in my thesis! Could finally do that recursive thing
   5.133 +where I make bounding boxes be those things that give results that
   5.134 +give good bounding boxes. If I did this I could use a disruptive
   5.135 +pattern on the worm.
   5.136 +
    5.137 +Re-imagining using default textures is very simple for this system,
   5.138 +but hard for others.
   5.139 +
   5.140 +
   5.141 +Want to demonstrate, at minimum, alignment of some model of the worm
   5.142 +to the video, and a lookup of the action by simulated perception.
   5.143 +
    5.144 +note: the purple/white point texture is very beautiful, because
    5.145 +when it moves slightly, the white dots look like they're
    5.146 +twinkling. It would look even better in a darker purple, and with
    5.147 +the dots more spread out.
   5.148 +
   5.149 +
    5.150 +Embed the assumption of one frame of view, and search by moving
    5.151 +around in the simulated world.
   5.152 +
    5.153 +We are allowed to limit the search to a hemisphere around the
    5.154 +imagined worm! This also limits scale.
   5.155 +
   5.156 +
   5.157 +
   5.158 +
   5.159 +
   5.160 +!! Limited search with worm/non-worm rendering. 
   5.161 +How much inverse kinematics do we have to do?
   5.162 +What about cached (allowed state-space) paths, derived from labeled
    5.163 +training? You have to lead from one to another.
   5.164 +
   5.165 +What about initial state? Could start the input videos at a specific
   5.166 +state, then just match that explicitly.
   5.167 +
   5.168 +!! The training doesn't have to be labeled -- you can just move around
   5.169 +for a while!!
   5.170 +
   5.171 +!! Limited search with motion based alignment.
   5.172 +
   5.173 +
   5.174 +
   5.175 +
   5.176 +"play arounds" can establish a chain of linked sensoriums. Future
   5.177 +matches must fall into one of the already experienced things, and once
   5.178 +they do, it greatly limits the things that are possible in the future.
   5.179 +
   5.180 +
    5.181 +Frame differences help to detect muscle exertion.
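
A minimal sketch of the frame-difference idea, assuming frames are flat
vectors of pixel intensities (an assumption, not the actual
representation used in the simulator):

#+begin_src clojure
;; Sum of absolute per-pixel change between consecutive frames, used as
;; a rough proxy for how much the worm is exerting itself.
(defn frame-difference [frame-a frame-b]
  (reduce + (map #(Math/abs (double (- %1 %2))) frame-a frame-b)))

(defn exertion-signal [frames]
  (mapv frame-difference frames (rest frames)))

;; Example with invented pixel values:
(exertion-signal [[0 0 0] [0 10 0] [0 10 5]])
;; => [10.0 5.0]
#+end_src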
   5.182 +
   5.183 +Can try to match on a few "representative" frames. Can also just have
   5.184 +a few "bodies" in various states which we try to match.
   5.185 +
   5.186 +
   5.187 +
   5.188 +Paths through state-space have the exact same signature as
   5.189 +simulation. BUT, these can be searched in parallel and don't interfere
   5.190 +with each other.
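
A minimal sketch of the parallel-search point: if each candidate path
can be scored independently against the observed frames, pmap
parallelizes the search for free. score-fn is a hypothetical stand-in
for whatever the real comparison is; the numbers below are invented.

#+begin_src clojure
;; Score every candidate state-space path in parallel and keep the best.
(defn best-path [score-fn observed candidate-paths]
  (let [scores (pmap #(score-fn observed %) candidate-paths)]
    (first (apply max-key second (map vector candidate-paths scores)))))

;; Toy usage: score = negative squared error between per-frame summaries.
(best-path (fn [obs path]
             (- (reduce + (map #(let [d (- %1 %2)] (* d d)) obs path))))
           [1 2 3]
           [[1 2 2] [0 0 0] [1 2 3]])
;; => [1 2 3]
#+end_src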
   5.191 +
   5.192 +