Mercurial > rlm
view src/rlm/pikasemechu.clj @ 5:fca75c0e8f40
added stories.clj
author | Robert McIntyre <rlm@mit.edu> |
---|---|
date | Thu, 01 Mar 2012 05:47:37 -0700 |
parents | 78a630e650d2 |
children |
line wrap: on
line source
(ns rlm.pikasemechu2 (:use rlm.ns-rlm))

(rlm.ns-rlm/ns-clone rlm.light-base)
(use 'clojure.java.io)
(use 'rlm.sanitize-file)
(import org.htmlcleaner.HtmlCleaner)
(import org.htmlcleaner.TagNode)
(import java.io.File)
(import java.net.URL)

;; Starting page and download directory for the pikasemechu comic.
;; NOTE(review): file-str is not defined in this file; presumably it is
;; brought in by ns-clone and expands "~" -- confirm.
(def first-comic (URL. "http://pikasemechu.smackjeeves.com/comics/165598/cover/"))
(def base-directory (file-str "~/pikasemechu"))

(defn tags-by-name
  "Return a seq of the elements named `element` found under `node`, or
  nil when there are none (seq of an empty list is nil).  The `true`
  argument is HtmlCleaner's recursive-search flag."
  [#^TagNode node #^String element]
  (seq (.getElementListByName node element true)))

;; NOTE(review): defn-memo is defined outside this file; presumably it
;; memoizes, so each url would be fetched once -- confirm.
(defn-memo parse
  "parse a web page using HtmlCleaner"
  [#^URL url]
  ;; with-open closes the stream once the document has been parsed;
  ;; previously the stream was opened and never closed.
  (with-open [in (input-stream url)]
    (.clean (HtmlCleaner.) in)))

(defn attributes
  "get a hash map of the attributes of an element"
  [#^TagNode node]
  (into {} (.getAttributes node)))

(defn- next-link-href
  "Return the href attribute of the first <a> element on the page at
  `url` whose text matches .*Next.*, or nil when the page has no such
  link or the link has no href."
  [#^URL url]
  (let [;; extract all links
        links (tags-by-name (parse url) "a")
        ;; extract the \"Next\" link
        next-node (first (filter #(re-matches #".*Next.*" (.getText %)) links))]
    ;; guard against pages without a Next link instead of calling
    ;; attributes on nil
    (when next-node
      ((attributes next-node) "href"))))

(defn next-url
  "get the next url to visit from the current page we are visiting.
  Returns nil when there is no Next link or when it points back at
  `url` (i.e. this is the last page)."
  [#^URL url]
  (when-let [href (next-link-href url)]
    (let [next-page (URL. href)]
      ;; return nil if this is the last page
      (if (= next-page url) nil next-page))))

;; Starting page and download directory for the mangareader comic.
(def poke-first-comic
  (URL. "http://www.mangareader.net/458-28199-1/pokemon-adventures/chapter-1.html"))

(def poke-base-directory (file-str "~/poke"))

(defn poke-next-url
  "get the next url to visit from the current page we are visiting.
  The Next link's href is appended to the mangareader host.  Returns nil
  when there is no Next link or when it points back at `url`."
  [#^URL url]
  (when-let [href (next-link-href url)]
    (let [next-page (URL. (str "http://www.mangareader.net" href))]
      ;; return nil if this is the last page
      (if (= next-page url) nil next-page))))

(defn poke-comic-img
  "get the file which we want to download from the webpage"
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; find the div whose id is \"imgholder\"
        container (first (filter (fn [div] (= ((attributes div) "id")
                                              "imgholder"))
                                 (tags-by-name node "div")))]
    ;; extract the comic: the src attribute of the last child of the
    ;; first link inside the container
    (URL.
     ((attributes
       (last (.getChildren
              (first
               (tags-by-name container "a"))))) "src"))))

(defn poke-comic-name
  "Return a fresh naming function.  Each call of the returned function
  ignores its url argument, increments a private counter and returns
  the File <counter>.jpg inside poke-base-directory (1.jpg, 2.jpg, ...)."
  []
  (let [a (atom 0)]
    (fn
      [url]
      (File. poke-base-directory (str (swap! a inc) ".jpg")))))

(defn comic-img
  "get the file which we want to download from the webpage"
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; attribute maps of all img tags
        images (map attributes (tags-by-name node "img"))
        ;; extract the comic: the image whose id is \"comic_image\"
        comic (first (filter #(= "comic_image" (% "id")) images))]
    (URL. (comic "src"))))

(defn comic-name
  "get the comic's name: a File in base-directory named after the text
  of the selected <option> on the page, sanitized, with a .jpg suffix."
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; extract the selected name which is the name of the comic
        selected (first (filter #(= "selected" ((attributes %) "selected"))
                                (tags-by-name node "option")))]
    (File. base-directory
           (sanitize-file-name (str (.getText selected) ".jpg")))))

(defn download-url
  "Copy the contents of `url` into `file`, unless `file` already exists,
  in which case only a message is printed."
  [#^URL url #^File file]
  (if (.exists file)
    (println (.getCanonicalPath file) " already exists.")
    (do
      (println "Copying " (.getPath url) " to " (.getCanonicalPath file))
      ;; close the network stream when the copy finishes or fails;
      ;; previously it was never closed
      (with-open [in (input-stream url)]
        (copy in file)))))

(defn download-comic
  "Download at most `n` comics, starting at `first-comic`.  `next-url`
  maps a page to its successor (nil ends the chain), `comic-img` maps a
  page to the image URL and `comic-name` maps a page to the target File."
  ([first-comic next-url comic-img comic-name n]
     ;; create a lazy sequence of web urls by chaining next-url
     (let [urls (take-while (comp not nil?)
                            (iterate next-url first-comic))]
       (dorun (take n (map download-url
                           (map comic-img urls)
                           (map comic-name urls)))))))

;; first stab at making it more general
(defn download-comic-archive
  "Download every page in `comics`, using `comic-img` to find the image
  URL and `comic-name` to choose the target File for each page."
  [comic-img comic-name comics]
  (dorun (map download-url
              (map comic-img comics)
              (map comic-name comics))))

;; (defn download-comic-chain
;;   [comic-img comic-name next-url first-comic]
;;   [comics (take-while (comp not nil?) (iterate next-url first-comic))]
;;   (download-comics comics comic-img comic-name))

(defn tests
  "Print the name, image URL and next-page URL computed for first-comic."
  []
  (println (comic-name first-comic))
  (println (comic-img first-comic))
  (println (next-url first-comic)))