diff src/rlm/pikasemechu.clj @ 0:78a630e650d2

initial import
author Robert McIntyre <rlm@mit.edu>
date Tue, 18 Oct 2011 00:57:08 -0700
parents
children
line wrap: on
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/rlm/pikasemechu.clj	Tue Oct 18 00:57:08 2011 -0700
     1.3 @@ -0,0 +1,136 @@
     1.4 +(ns rlm.pikasemechu
     1.5 +  (:use rlm.ns-rlm))
     1.6 +(rlm.ns-rlm/ns-clone rlm.light-base)
     1.7 +(use 'clojure.java.io)
     1.8 +(use 'rlm.sanitize-file)
     1.9 +(import org.htmlcleaner.HtmlCleaner)
    1.10 +(import org.htmlcleaner.TagNode)
    1.11 +(import java.io.File)
    1.12 +(import java.net.URL)
    1.13 +
;; URL of the first page (the cover) of the "pikasemechu" webcomic on
;; smackjeeves.com -- the starting point for the crawl.
(def first-comic (URL. "http://pikasemechu.smackjeeves.com/comics/165598/cover/"))
;; Local directory into which downloaded comic pages are saved.
;; file-str comes from clojure.java.io and expands the leading ~.
(def base-directory (file-str "~/pikasemechu"))
    1.16 +
    1.17 +(defn tags-by-name
    1.18 +  [#^TagNode node #^String element]
    1.19 +  (seq (.getElementListByName node element true)))
    1.20 +
;; NOTE(review): defn-memo is provided by the cloned rlm.light-base
;; namespace; presumably it defines a memoized fn so each URL is
;; fetched and parsed at most once -- confirm against rlm.light-base.
(defn-memo parse
  "parse a web page using HtmlCleaner"
  [#^URL url]
  (.clean (HtmlCleaner.) (input-stream url)))
    1.25 +
    1.26 +(defn attributes
    1.27 +  "get a hash map of the attributes of an element"
    1.28 +  [#^TagNode node]
    1.29 +  (into {} (.getAttributes node)))
    1.30 +
    1.31 +(defn next-url
    1.32 +  "get the next url to visit from the current page we are visiting"
    1.33 +  [#^URL url]
    1.34 +  (let
    1.35 +      [;; extract all links 
    1.36 +       links (tags-by-name (parse url) "a")
    1.37 +       ;; extract the "Next" link
    1.38 +       next-node (first (filter  #(re-matches #".*Next.*" (.getText %)) links))
    1.39 +       next-page (URL. ((attributes next-node) "href"))]
    1.40 +    ;; return nil if this is the last page
    1.41 +    (if (= next-page url) nil next-page)))
    1.42 +
;; First page of the Pokemon Adventures archive on mangareader.net --
;; the starting point for the poke-* crawl.
(def poke-first-comic
     (URL. "http://www.mangareader.net/458-28199-1/pokemon-adventures/chapter-1.html"))

;; Local directory into which downloaded Pokemon Adventures pages are saved.
(def poke-base-directory (file-str "~/poke"))
    1.47 +
    1.48 +(defn poke-next-url
    1.49 +  "get the next url to visit from the current page we are visiting"
    1.50 +  [#^URL url]
    1.51 +  (let
    1.52 +      [;; extract all links 
    1.53 +       links (tags-by-name (parse url) "a")
    1.54 +       ;; extract the "Next" link
    1.55 +       next-node (first (filter  #(re-matches #".*Next.*" (.getText %)) links))
    1.56 +       next-page
    1.57 +
    1.58 +       (URL. (str "http://www.mangareader.net" ((attributes next-node) "href")))]
    1.59 +    ;; return nil if this is the last page
    1.60 +    (if (= next-page url) nil next-page)))
    1.61 +
(defn poke-comic-img
  "get the file which we want to download from the webpage"
  ;; Walks the mangareader.net page structure: the comic image lives
  ;; inside <div id="imgholder"> ... <a> ... <img src=...>.
  [#^URL url]
  (let [;; parse the HTML
	node (parse url)
	;; find the <div> whose id attribute is "imgholder"
	container (first (filter (fn [node] (= ((attributes node) "id")
					       "imgholder"))
				   (tags-by-name node "div")))]
	;; extract the comic: the last child of the first <a> inside the
	;; container is presumably the <img> element -- take its "src".
	;; NOTE(review): if the page layout changes and no "imgholder"
	;; div (or no <a>/child) is present, this chain NPEs -- confirm
	;; whether a nil-safe guard is wanted here.
    (URL.
     ((attributes
       (last (.getChildren
	      (first
	       (tags-by-name container "a"))))) "src"))))
    1.77 +  
(defn poke-comic-name
  "Return a stateful naming fn: each call to the RETURNED fn yields the
  next sequentially numbered .jpg File under poke-base-directory
  (1.jpg, 2.jpg, ...); the url argument is ignored."
  ;; NOTE(review): every call to (poke-comic-name) creates a FRESH
  ;; counter, so callers must call it once and reuse the returned fn.
  ;; Passing poke-comic-name itself where a url->File fn is expected
  ;; (e.g. as download-comic's comic-name argument) would be an arity
  ;; error -- possibly this was meant to be
  ;; (def poke-comic-name (let [a (atom 0)] (fn [url] ...))); confirm
  ;; with the author before changing the interface.
  []
  (let [a (atom 0)]
    (fn
      [url]
      (File. poke-base-directory (str (swap! a inc) ".jpg")))))
    1.83 +  
    1.84 +(defn comic-img
    1.85 +  "get the file which we want to download from the webpage"
    1.86 +  [#^URL url]
    1.87 +  (let [;; parse the HTML
    1.88 +	node (parse url)
    1.89 +	;; grab all img tags
    1.90 +	images (map attributes (tags-by-name node "img"))
    1.91 +	;; extract the comic
    1.92 +	comic (first (filter #(= "comic_image" (% "id")) images))]
    1.93 +    (URL. (comic "src"))))
    1.94 +	      
    1.95 +(defn comic-name
    1.96 +  "get the comic's name"
    1.97 +  [#^URL url]
    1.98 +  (let [;; parse the HTML
    1.99 +	node (parse url)
   1.100 +	;; grab the select box
   1.101 +	select (first (tags-by-name node "select"))
   1.102 +	;; extract the selected name which is the name of the comic
   1.103 +	selected (first (filter #(= "selected" ((attributes %) "selected"))
   1.104 +				(tags-by-name node "option")))]
   1.105 +    (File. base-directory
   1.106 +	   (sanitize-file-name (str (.getText selected) ".jpg")))))
   1.107 +
   1.108 +(defn download-url
   1.109 +  [#^URL url #^File file]
   1.110 +  (if (.exists file)
   1.111 +    (println (.getCanonicalPath file) " already exists.")
   1.112 +    (do 
   1.113 +      (println "Copying " (.getPath url) " to " (.getCanonicalPath file))
   1.114 +      (copy (input-stream url) file)) ))
   1.115 +
   1.116 +(defn download-comic
   1.117 +  ([first-comic next-url comic-img comic-name n]
   1.118 +  ;; create a lazy sequence of web urls by chaining next-url
   1.119 +  (let [urls (take-while (comp not nil?) (iterate next-url first-comic))]
   1.120 +    (dorun (take n (map download-url
   1.121 +			(map comic-img urls)
   1.122 +			(map comic-name urls)))))))
   1.123 +
   1.124 +;; first stab at making it more general
   1.125 +(defn download-comic-archive
   1.126 +  [comic-img comic-name comics]
   1.127 +  (dorun (map download-url
   1.128 +	      (map comic-img comics)
   1.129 +	      (map comic-name comics))))
   1.130 +
   1.131 +;; (defn download-comic-chain
   1.132 +;;   [comic-img comic-name next-url first-comic]
   1.133 +;;   [comics (take-while (comp not nil?) (iterate next-url first-comic))]
   1.134 +;;        (download-comics comics comic-img comic-name))
   1.135 +
   1.136 +(defn tests []
   1.137 +  (println (comic-name first-comic))
   1.138 +  (println (comic-img first-comic))
   1.139 +  (println (next-url first-comic)))