diff thesis/org/first-chapter.html @ 460:763d13f77e03

merge in laptop changes.
author Robert McIntyre <rlm@mit.edu>
date Thu, 27 Mar 2014 17:57:01 -0400
parents 5205535237fb
children
line wrap: on
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/thesis/org/first-chapter.html	Thu Mar 27 17:57:01 2014 -0400
     1.3 @@ -0,0 +1,455 @@
     1.4 +<?xml version="1.0" encoding="utf-8"?>
     1.5 +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     1.6 +               "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
     1.7 +<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
     1.8 +<head>
     1.9 +<title><code>CORTEX</code></title>
    1.10 +<meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
    1.11 +<meta name="title" content="<code>CORTEX</code>"/>
    1.12 +<meta name="generator" content="Org-mode"/>
    1.13 +<meta name="generated" content="2013-11-07 04:21:29 EST"/>
    1.14 +<meta name="author" content="Robert McIntyre"/>
    1.15 +<meta name="description" content="Using embodied AI to facilitate Artificial Imagination."/>
    1.16 +<meta name="keywords" content="AI, clojure, embodiment"/>
    1.17 +<style type="text/css">
    1.18 + <!--/*--><![CDATA[/*><!--*/
    1.19 +  html { font-family: Times, serif; font-size: 12pt; }
    1.20 +  .title  { text-align: center; }
    1.21 +  .todo   { color: red; }
    1.22 +  .done   { color: green; }
    1.23 +  .tag    { background-color: #add8e6; font-weight:normal }
    1.24 +  .target { }
    1.25 +  .timestamp { color: #bebebe; }
    1.26 +  .timestamp-kwd { color: #5f9ea0; }
    1.27 +  .right  {margin-left:auto; margin-right:0px;  text-align:right;}
    1.28 +  .left   {margin-left:0px;  margin-right:auto; text-align:left;}
    1.29 +  .center {margin-left:auto; margin-right:auto; text-align:center;}
    1.30 +  p.verse { margin-left: 3% }
    1.31 +  pre {
    1.32 +	border: 1pt solid #AEBDCC;
    1.33 +	background-color: #F3F5F7;
    1.34 +	padding: 5pt;
    1.35 +	font-family: courier, monospace;
    1.36 +        font-size: 90%;
    1.37 +        overflow:auto;
    1.38 +  }
    1.39 +  table { border-collapse: collapse; }
    1.40 +  td, th { vertical-align: top;  }
    1.41 +  th.right  { text-align:center;  }
    1.42 +  th.left   { text-align:center;   }
    1.43 +  th.center { text-align:center; }
    1.44 +  td.right  { text-align:right;  }
    1.45 +  td.left   { text-align:left;   }
    1.46 +  td.center { text-align:center; }
    1.47 +  dt { font-weight: bold; }
    1.48 +  div.figure { padding: 0.5em; }
    1.49 +  div.figure p { text-align: center; }
    1.50 +  div.inlinetask {
    1.51 +    padding:10px;
    1.52 +    border:2px solid gray;
    1.53 +    margin:10px;
    1.54 +    background: #ffffcc;
    1.55 +  }
    1.56 +  textarea { overflow-x: auto; }
    1.57 +  .linenr { font-size:smaller }
    1.58 +  .code-highlighted {background-color:#ffff00;}
    1.59 +  .org-info-js_info-navigation { border-style:none; }
    1.60 +  #org-info-js_console-label { font-size:10px; font-weight:bold;
    1.61 +                               white-space:nowrap; }
    1.62 +  .org-info-js_search-highlight {background-color:#ffff00; color:#000000;
    1.63 +                                 font-weight:bold; }
    1.64 +  /*]]>*/-->
    1.65 +</style>
    1.66 +<script type="text/javascript">var _gaq = _gaq || [];_gaq.push(['_setAccount', 'UA-31261312-1']);_gaq.push(['_trackPageview']);(function() {var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);})();</script><link rel="stylesheet" type="text/css" href="../../aurellem/css/argentum.css" />
    1.67 +<script type="text/javascript">
    1.68 +<!--/*--><![CDATA[/*><!--*/
    1.69 + function CodeHighlightOn(elem, id)
    1.70 + {
    1.71 +   var target = document.getElementById(id);
    1.72 +   if(null != target) {
    1.73 +     elem.cacheClassElem = elem.className;
    1.74 +     elem.cacheClassTarget = target.className;
    1.75 +     target.className = "code-highlighted";
    1.76 +     elem.className   = "code-highlighted";
    1.77 +   }
    1.78 + }
    1.79 + function CodeHighlightOff(elem, id)
    1.80 + {
    1.81 +   var target = document.getElementById(id);
    1.82 +   if(elem.cacheClassElem)
    1.83 +     elem.className = elem.cacheClassElem;
    1.84 +   if(elem.cacheClassTarget)
    1.85 +     target.className = elem.cacheClassTarget;
    1.86 + }
    1.87 +/*]]>*///-->
    1.88 +</script>
    1.89 +
    1.90 +</head>
    1.91 +<body>
    1.92 +
    1.93 +
    1.94 +<div id="content">
    1.95 +<h1 class="title"><code>CORTEX</code></h1>
    1.96 +
    1.97 +
    1.98 +<div class="header">
    1.99 +  <div class="float-right">	
   1.100 +    <!-- 
   1.101 +    <form>
   1.102 +      <input type="text"/><input type="submit" value="search the blog &raquo;"/> 
   1.103 +    </form>
   1.104 +    -->
   1.105 +  </div>
   1.106 +
   1.107 +  <h1>aurellem <em>&#x2609;</em></h1>
   1.108 +  <ul class="nav">
   1.109 +    <li><a href="/">read the blog &raquo;</a></li>
   1.110 +    <!-- li><a href="#">learn about us &raquo;</a></li-->
   1.111 +  </ul>
   1.112 +</div>
   1.113 +
   1.114 +<div class="author">Written by <author>Robert McIntyre</author></div>
   1.115 +
   1.116 +
   1.117 +
   1.118 +
   1.119 +
   1.120 +
   1.121 +
   1.122 +<div id="outline-container-1" class="outline-2">
   1.123 +<h2 id="sec-1">Artificial Imagination</h2>
   1.124 +<div class="outline-text-2" id="text-1">
   1.125 +
   1.126 +
   1.127 +<p>
   1.128 +  Imagine watching a video of someone skateboarding. When you watch
   1.129 +  the video, you can imagine yourself skateboarding, and your
   1.130 +  knowledge of the human body and its dynamics guides your
   1.131 +  interpretation of the scene. For example, even if the skateboarder
   1.132 +  is partially occluded, you can infer the positions of his arms and
   1.133 +  body from your own knowledge of how your body would be positioned if
   1.134 +  you were skateboarding. If the skateboarder suffers an accident, you
   1.135 +  wince in sympathy, imagining the pain your own body would experience
   1.136 +  if it were in the same situation. This empathy with other people
   1.137 +  guides our understanding of whatever they are doing because it is a
   1.138 +  powerful constraint on what is probable and possible. In order to
   1.139 +  make use of this powerful empathy constraint, I need a system that
   1.140 +  can generate and make sense of sensory data from the many different
    1.141 +  senses that humans possess. The two key properties of such a system
   1.142 +  are <i>embodiment</i> and <i>imagination</i>.
   1.143 +</p>
   1.144 +
   1.145 +</div>
   1.146 +
   1.147 +<div id="outline-container-1-1" class="outline-3">
   1.148 +<h3 id="sec-1-1">What is imagination?</h3>
   1.149 +<div class="outline-text-3" id="text-1-1">
   1.150 +
   1.151 +
   1.152 +<p>
   1.153 +   One kind of imagination is <i>sympathetic</i> imagination: you imagine
   1.154 +   yourself in the position of something/someone you are
   1.155 +   observing. This type of imagination comes into play when you follow
   1.156 +   along visually when watching someone perform actions, or when you
   1.157 +   sympathetically grimace when someone hurts themselves. This type of
   1.158 +   imagination uses the constraints you have learned about your own
   1.159 +   body to highly constrain the possibilities in whatever you are
    1.160 +   seeing. It uses all of your senses, including touch,
   1.161 +   proprioception, etc. Humans are flexible when it comes to "putting
   1.162 +   themselves in another's shoes," and can sympathetically understand
    1.163 +   not only other humans, but entities ranging from animals to cartoon
   1.164 +   characters to <a href="http://www.youtube.com/watch?v=0jz4HcwTQmU">single dots</a> on a screen!
   1.165 +</p>
   1.166 +<p>
   1.167 +   Another kind of imagination is <i>predictive</i> imagination: you
   1.168 +   construct scenes in your mind that are not entirely related to
   1.169 +   whatever you are observing, but instead are predictions of the
   1.170 +   future or simply flights of fancy. You use this type of imagination
   1.171 +   to plan out multi-step actions, or play out dangerous situations in
   1.172 +   your mind so as to avoid messing them up in reality.
   1.173 +</p>
   1.174 +<p>
   1.175 +   Of course, sympathetic and predictive imagination blend into each
   1.176 +   other and are not completely separate concepts. One dimension along
   1.177 +   which you can distinguish types of imagination is dependence on raw
   1.178 +   sense data. Sympathetic imagination is highly constrained by your
   1.179 +   senses, while predictive imagination can be more or less dependent
   1.180 +   on your senses depending on how far ahead you imagine. Daydreaming
   1.181 +   is an extreme form of predictive imagination that wanders through
   1.182 +   different possibilities without concern for whether they are
   1.183 +   related to whatever is happening in reality.
   1.184 +</p>
   1.185 +<p>
   1.186 +   For this thesis, I will mostly focus on sympathetic imagination and
   1.187 +   the constraint it provides for understanding sensory data.
   1.188 +</p>
   1.189 +</div>
   1.190 +
   1.191 +</div>
   1.192 +
   1.193 +<div id="outline-container-1-2" class="outline-3">
   1.194 +<h3 id="sec-1-2">What problems can imagination solve?</h3>
   1.195 +<div class="outline-text-3" id="text-1-2">
   1.196 +
   1.197 +
   1.198 +<p>
   1.199 +   Consider a video of a cat drinking some water.
   1.200 +</p>
   1.201 +
   1.202 +<div class="figure">
    1.203 +<p><img src="../images/cat-drinking.jpg" alt="A cat drinking some water" /></p>
   1.204 +<p>A cat drinking some water. Identifying this action is beyond the state of the art for computers.</p>
   1.205 +</div>
   1.206 +
   1.207 +<p>
   1.208 +   It is currently impossible for any computer program to reliably
    1.209 +   label such a video as "drinking". I think humans are able to label
   1.210 +   such video as "drinking" because they imagine <i>themselves</i> as the
   1.211 +   cat, and imagine putting their face up against a stream of water
   1.212 +   and sticking out their tongue. In that imagined world, they can
   1.213 +   feel the cool water hitting their tongue, and feel the water
   1.214 +   entering their body, and are able to recognize that <i>feeling</i> as
   1.215 +   drinking. So, the label of the action is not really in the pixels
   1.216 +   of the image, but is found clearly in a simulation inspired by
   1.217 +   those pixels. An imaginative system, having been trained on
   1.218 +   drinking and non-drinking examples and learning that the most
   1.219 +   important component of drinking is the feeling of water sliding
   1.220 +   down one's throat, would analyze a video of a cat drinking in the
   1.221 +   following manner:
   1.222 +</p>
   1.223 +<ul>
   1.224 +<li>Create a physical model of the video by putting a "fuzzy" model
   1.225 +     of its own body in place of the cat. Also, create a simulation of
   1.226 +     the stream of water.
   1.227 +
   1.228 +</li>
   1.229 +<li>Play out this simulated scene and generate imagined sensory
   1.230 +     experience. This will include relevant muscle contractions, a
   1.231 +     close up view of the stream from the cat's perspective, and most
   1.232 +     importantly, the imagined feeling of water entering the mouth.
   1.233 +
   1.234 +</li>
   1.235 +<li>The action is now easily identified as drinking by the sense of
   1.236 +     taste alone. The other senses (such as the tongue moving in and
   1.237 +     out) help to give plausibility to the simulated action. Note that
   1.238 +     the sense of vision, while critical in creating the simulation,
   1.239 +     is not critical for identifying the action from the simulation.
   1.240 +</li>
   1.241 +</ul>
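          +
          +<p>
          +   As a rough illustration only (this is not <code>Cortex</code> code,
          +   and all of the names below are hypothetical), the flow of the steps
          +   above could be sketched in Clojure, with the imagined sensory
          +   experience reduced to a map from senses to intensities:
          +</p>
          +
          +<pre class="example">
          +;; Illustrative sketch only -- none of these names come from Cortex.
          +;; The imagined experience is reduced to a map from senses to
          +;; intensities, and "drinking" is recognized by its taste component
          +;; alone, as in the final step above.
          +(defn imagined-experience
          +  "Stand-in for playing out the simulated scene in a physics engine."
          +  [scene]
          +  {:taste 0.85, :touch 0.20, :vision 0.70})
          +
          +(defn drinking?
          +  "The label comes from the imagined feeling, not from the pixels."
          +  [experience]
          +  (> (:taste experience) 0.5))
          +
          +;; (drinking? (imagined-experience :cat-video))  ;=> true
          +</pre>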
   1.242 +
   1.243 +
   1.244 +<p>
   1.245 +   More generally, I expect imaginative systems to be particularly
   1.246 +   good at identifying embodied actions in videos.
   1.247 +</p>
   1.248 +</div>
   1.249 +</div>
   1.250 +
   1.251 +</div>
   1.252 +
   1.253 +<div id="outline-container-2" class="outline-2">
   1.254 +<h2 id="sec-2">Cortex</h2>
   1.255 +<div class="outline-text-2" id="text-2">
   1.256 +
   1.257 +
   1.258 +<p>
   1.259 +  The previous example involves liquids, the sense of taste, and
   1.260 +  imagining oneself as a cat. For this thesis I constrain myself to
   1.261 +  simpler, more easily digitizable senses and situations.
   1.262 +</p>
   1.263 +<p>
    1.264 +  My system, <code>Cortex</code>, performs imagination in two different simplified
   1.265 +  worlds: <i>worm world</i> and <i>stick figure world</i>. In each of these
   1.266 +  worlds, entities capable of imagination recognize actions by
   1.267 +  simulating the experience from their own perspective, and then
   1.268 +  recognizing the action from a database of examples.
   1.269 +</p>
   1.270 +<p>
   1.271 +  In order to serve as a framework for experiments in imagination,
   1.272 +  <code>Cortex</code> requires simulated bodies, worlds, and senses like vision,
   1.273 +  hearing, touch, proprioception, etc.
   1.274 +</p>
   1.275 +
   1.276 +</div>
   1.277 +
   1.278 +<div id="outline-container-2-1" class="outline-3">
   1.279 +<h3 id="sec-2-1">A Video Game Engine takes care of some of the groundwork</h3>
   1.280 +<div class="outline-text-3" id="text-2-1">
   1.281 +
   1.282 +
   1.283 +<p>
   1.284 +   When it comes to simulation environments, the engines used to
   1.285 +   create the worlds in video games offer top-notch physics and
   1.286 +   graphics support. These engines also have limited support for
   1.287 +   creating cameras and rendering 3D sound, which can be repurposed
   1.288 +   for vision and hearing respectively. Physics collision detection
   1.289 +   can be expanded to create a sense of touch.
   1.290 +</p>
   1.291 +<p>   
   1.292 +   jMonkeyEngine3 is one such engine for creating video games in
    1.293 +   Java. It uses OpenGL to render to the screen and uses scene graphs
   1.294 +   to avoid drawing things that do not appear on the screen. It has an
   1.295 +   active community and several games in the pipeline. The engine was
   1.296 +   not built to serve any particular game but is instead meant to be
    1.297 +   used for any 3D game. I chose jMonkeyEngine3 because it had the
   1.298 +   most features out of all the open projects I looked at, and because
   1.299 +   I could then write my code in Clojure, an implementation of LISP
   1.300 +   that runs on the JVM.
   1.301 +</p>
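          +
          +<p>
          +   As a minimal sketch (not code from this thesis, and assuming only
          +   that the jMonkeyEngine3 libraries are on the classpath), a
          +   jMonkeyEngine3 application can be driven from Clojure like this;
          +   <code>Cortex</code> builds its simulated worlds out of primitives
          +   of this kind:
          +</p>
          +
          +<pre class="example">
          +;; Minimal jMonkeyEngine3 application written in Clojure.
          +;; Assumes the jME3 libraries are on the classpath; the namespace
          +;; name is arbitrary.
          +(ns example.hello-jme3
          +  (:import (com.jme3.app SimpleApplication)
          +           (com.jme3.material Material)
          +           (com.jme3.math ColorRGBA)
          +           (com.jme3.scene Geometry)
          +           (com.jme3.scene.shape Box)))
          +
          +(defn make-app
          +  "Create a SimpleApplication that displays a single blue box."
          +  []
          +  (proxy [SimpleApplication] []
          +    (simpleInitApp []
          +      (let [geom (Geometry. "box" (Box. 1.0 1.0 1.0))
          +            mat  (Material. (.getAssetManager this)
          +                            "Common/MatDefs/Misc/Unshaded.j3md")]
          +        (.setColor mat "Color" ColorRGBA/Blue)
          +        (.setMaterial geom mat)
          +        (.attachChild (.getRootNode this) geom)))))
          +
          +;; (.start (make-app))  ; opens a window showing the box
          +</pre>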
   1.302 +</div>
   1.303 +
   1.304 +</div>
   1.305 +
   1.306 +<div id="outline-container-2-2" class="outline-3">
   1.307 +<h3 id="sec-2-2"><code>CORTEX</code> Extends jMonkeyEngine3 to implement rich senses</h3>
   1.308 +<div class="outline-text-3" id="text-2-2">
   1.309 +
   1.310 +
   1.311 +<p>
   1.312 +   Using the game-making primitives provided by jMonkeyEngine3, I have
   1.313 +   constructed every major human sense except for smell and
   1.314 +   taste. <code>Cortex</code> also provides an interface for creating creatures
   1.315 +   in Blender, a 3D modeling environment, and then "rigging" the
   1.316 +   creatures with senses using 3D annotations in Blender. A creature
   1.317 +   can have any number of senses, and there can be any number of
   1.318 +   creatures in a simulation.
   1.319 +</p>
   1.320 +<p>   
   1.321 +   The senses available in <code>Cortex</code> are:
   1.322 +</p>
   1.323 +<ul>
   1.324 +<li><a href="../../cortex/html/vision.html">Vision</a>
   1.325 +</li>
   1.326 +<li><a href="../../cortex/html/hearing.html">Hearing</a>
   1.327 +</li>
   1.328 +<li><a href="../../cortex/html/touch.html">Touch</a>
   1.329 +</li>
   1.330 +<li><a href="../../cortex/html/proprioception.html">Proprioception</a>
   1.331 +</li>
   1.332 +<li><a href="../../cortex/html/movement.html">Muscle Tension</a>
   1.333 +</li>
   1.334 +</ul>
   1.335 +
   1.336 +
   1.337 +</div>
   1.338 +</div>
   1.339 +
   1.340 +</div>
   1.341 +
   1.342 +<div id="outline-container-3" class="outline-2">
   1.343 +<h2 id="sec-3">A roadmap for <code>Cortex</code> experiments</h2>
   1.344 +<div class="outline-text-2" id="text-3">
   1.345 +
   1.346 +
   1.347 +
   1.348 +</div>
   1.349 +
   1.350 +<div id="outline-container-3-1" class="outline-3">
   1.351 +<h3 id="sec-3-1">Worm World</h3>
   1.352 +<div class="outline-text-3" id="text-3-1">
   1.353 +
   1.354 +
   1.355 +<p>
   1.356 +   Worms in <code>Cortex</code> are segmented creatures which vary in length and
   1.357 +   number of segments, and have the senses of vision, proprioception,
   1.358 +   touch, and muscle tension.
   1.359 +</p>
   1.360 +
   1.361 +<div class="figure">
    1.362 +<p><img src="../images/finger-UV.png" width="755" alt="Tactile-sensor-profile for the upper segment of a worm" /></p>
   1.363 +<p>This is the tactile-sensor-profile for the upper segment of a worm. It defines regions of high touch sensitivity (where there are many white pixels) and regions of low sensitivity (where white pixels are sparse).</p>
   1.364 +</div>
   1.365 +
   1.366 +
   1.367 +
   1.368 +
   1.369 +<div class="figure">
   1.370 +  <center>
   1.371 +    <video controls="controls" width="550">
   1.372 +      <source src="../video/worm-touch.ogg" type="video/ogg"
   1.373 +              preload="none" />
   1.374 +    </video>
    1.375 +    <br/> <a href="http://youtu.be/RHx2wqzNVcU"> YouTube </a>
   1.376 +  </center>
   1.377 +  <p>The worm responds to touch.</p>
   1.378 +</div>
   1.379 +
   1.380 +<div class="figure">
   1.381 +  <center>
   1.382 +    <video controls="controls" width="550">
   1.383 +      <source src="../video/test-proprioception.ogg" type="video/ogg"
   1.384 +              preload="none" />
   1.385 +    </video>
    1.386 +    <br/> <a href="http://youtu.be/JjdDmyM8b0w"> YouTube </a>
   1.387 +  </center>
   1.388 +  <p>Proprioception in a worm. The proprioceptive readout is
   1.389 +    in the upper left corner of the screen.</p>
   1.390 +</div>
   1.391 +
   1.392 +<p>
   1.393 +   A worm is trained in various actions such as sinusoidal movement,
   1.394 +   curling, flailing, and spinning by directly playing motor
   1.395 +   contractions while the worm "feels" the experience. These actions
    1.396 +   are recorded not only as vectors of muscle tension, touch, and
    1.397 +   proprioceptive data, but also in higher-level forms such as
   1.398 +   frequencies of the various contractions and a symbolic name for the
   1.399 +   action.
   1.400 +</p>
   1.401 +<p>
   1.402 +   Then, the worm watches a video of another worm performing one of
   1.403 +   the actions, and must judge which action was performed. Normally
   1.404 +   this would be an extremely difficult problem, but the worm is able
   1.405 +   to greatly diminish the search space through sympathetic
   1.406 +   imagination. First, it creates an imagined copy of its body which
   1.407 +   it observes from a third person point of view. Then for each frame
   1.408 +   of the video, it maneuvers its simulated body to be in registration
   1.409 +   with the worm depicted in the video. The physical constraints
   1.410 +   imposed by the physics simulation greatly decrease the number of
   1.411 +   poses that have to be tried, making the search feasible. As the
   1.412 +   imaginary worm moves, it generates imaginary muscle tension and
   1.413 +   proprioceptive sensations. The worm determines the action not by
   1.414 +   vision, but by matching the imagined proprioceptive data with
   1.415 +   previous examples.
   1.416 +</p>
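          +
          +<p>
          +   The final matching step might look like the following sketch
          +   (illustrative Clojure, not the thesis implementation), with
          +   proprioception reduced to a sequence of joint-angle vectors and
          +   the action chosen as the nearest recorded example:
          +</p>
          +
          +<pre class="example">
          +;; Illustrative sketch -- not the thesis implementation.  Imagined
          +;; proprioception is reduced to a sequence of joint-angle vectors,
          +;; and the action label comes from the nearest recorded example.
          +(defn distance
          +  "Euclidean distance between two equal-length joint-angle vectors."
          +  [a b]
          +  (Math/sqrt (reduce + (map #(let [d (- %1 %2)] (* d d)) a b))))
          +
          +(defn sequence-distance
          +  "Average frame-by-frame distance between two pose sequences."
          +  [xs ys]
          +  (/ (reduce + (map distance xs ys))
          +     (max 1 (min (count xs) (count ys)))))
          +
          +(defn identify-action
          +  "Return the label of the recorded example whose proprioceptive
          +  trace is closest to the imagined one."
          +  [examples imagined]
          +  (key (apply min-key #(sequence-distance (val %) imagined) examples)))
          +
          +;; Example usage with two fake recorded traces:
          +;; (identify-action {:curling  [[0.1 0.2] [0.3 0.4]]
          +;;                   :flailing [[0.9 0.1] [0.2 0.8]]}
          +;;                  [[0.12 0.19] [0.28 0.41]])   ;=> :curling
          +</pre>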
   1.417 +<p>
   1.418 +   By using non-visual sensory data such as touch, the worms can also
   1.419 +   answer body related questions such as "did your head touch your
   1.420 +   tail?" and "did worm A touch worm B?"
   1.421 +</p>
   1.422 +<p>
   1.423 +   The proprioceptive information used for action identification is
   1.424 +   body-centric, so only the registration step is dependent on point
   1.425 +   of view, not the identification step. Registration is not specific
   1.426 +   to any particular action. Thus, action identification can be
    1.427 +   divided into a point-of-view-dependent generic registration step,
    1.428 +   and an action-specific step that is body-centered and invariant to
   1.429 +   point of view.
   1.430 +</p>
   1.431 +</div>
   1.432 +
   1.433 +</div>
   1.434 +
   1.435 +<div id="outline-container-3-2" class="outline-3">
   1.436 +<h3 id="sec-3-2">Stick Figure World</h3>
   1.437 +<div class="outline-text-3" id="text-3-2">
   1.438 +
   1.439 +
   1.440 +<p>
   1.441 +   This environment is similar to Worm World, except the creatures are
   1.442 +   more complicated and the actions and questions more varied. It is
   1.443 +   an experiment to see how far imagination can go in interpreting
   1.444 +   actions.  
   1.445 +</p></div>
   1.446 +</div>
   1.447 +</div>
   1.448 +</div>
   1.449 +
   1.450 +<div id="postamble">
   1.451 +<p class="date">Date: 2013-11-07 04:21:29 EST</p>
   1.452 +<p class="author">Author: Robert McIntyre</p>
   1.453 +<p class="creator">Org version 7.7 with Emacs version 24</p>
   1.454 +<a href="http://validator.w3.org/check?uri=referer">Validate XHTML 1.0</a>
   1.455 +
   1.456 +</div>
   1.457 +</body>
   1.458 +</html>