annotate src/rlm/pikasemechu.clj @ 0:78a630e650d2

initial import
author Robert McIntyre <rlm@mit.edu>
date Tue, 18 Oct 2011 00:57:08 -0700
(ns rlm.pikasemechu
  (:use rlm.ns-rlm))
(rlm.ns-rlm/ns-clone rlm.light-base)
(use 'clojure.java.io)
(use 'rlm.sanitize-file)
(import org.htmlcleaner.HtmlCleaner)
(import org.htmlcleaner.TagNode)
(import java.io.File)
(import java.net.URL)

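;; Note: file-str (which expands the leading ~ to the home directory) and
;; defn-memo (a memoizing defn) are assumed to be provided by the cloned
;; rlm.light-base namespace; input-stream and copy come from clojure.java.io.
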
(def first-comic (URL. "http://pikasemechu.smackjeeves.com/comics/165598/cover/"))
(def base-directory (file-str "~/pikasemechu"))

(defn tags-by-name
  "return a seq of every `element` tag beneath node, searching recursively"
  [#^TagNode node #^String element]
  (seq (.getElementListByName node element true)))

(defn-memo parse
  "parse a web page using HtmlCleaner"
  [#^URL url]
  (.clean (HtmlCleaner.) (input-stream url)))

(defn attributes
  "get a hash map of the attributes of an element"
  [#^TagNode node]
  (into {} (.getAttributes node)))

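;; A minimal sketch of how these three helpers compose (the URL is the real
;; one defined above; the bindings are illustrative):
(comment
  (let [page  (parse first-comic)           ; TagNode root of the document
        links (tags-by-name page "a")       ; every <a> element in the page
        hrefs (map #((attributes %) "href") links)]
    ;; hrefs is a seq of href strings, one per link
    (take 3 hrefs)))
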
(defn next-url
  "get the next url to visit from the current page we are visiting"
  [#^URL url]
  (let [;; extract all links
        links (tags-by-name (parse url) "a")
        ;; extract the "Next" link
        next-node (first (filter #(re-matches #".*Next.*" (.getText %)) links))
        next-page (URL. ((attributes next-node) "href"))]
    ;; return nil if this is the last page
    (if (= next-page url) nil next-page)))

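;; Because next-url returns nil on the last page, the whole archive can be
;; walked lazily with iterate/take-while -- the same idiom download-comic
;; uses below:
(comment
  (take 5 (take-while (comp not nil?) (iterate next-url first-comic))))
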
(def poke-first-comic
  (URL. "http://www.mangareader.net/458-28199-1/pokemon-adventures/chapter-1.html"))

(def poke-base-directory (file-str "~/poke"))

(defn poke-next-url
  "get the next url to visit from the current page we are visiting"
  [#^URL url]
  (let [;; extract all links
        links (tags-by-name (parse url) "a")
        ;; extract the "Next" link
        next-node (first (filter #(re-matches #".*Next.*" (.getText %)) links))
        ;; mangareader uses relative hrefs, so prepend the site root
        next-page (URL. (str "http://www.mangareader.net"
                             ((attributes next-node) "href")))]
    ;; return nil if this is the last page
    (if (= next-page url) nil next-page)))

(defn poke-comic-img
  "get the file which we want to download from the webpage"
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; grab the div that holds the comic image
        container (first (filter (fn [node] (= ((attributes node) "id")
                                               "imgholder"))
                                 (tags-by-name node "div")))]
    ;; extract the comic
    (URL.
     ((attributes
       (last (.getChildren
              (first
               (tags-by-name container "a"))))) "src"))))

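;; The walk above relies on mangareader's markup: the <div id="imgholder">
;; is assumed to wrap a single <a> whose last child is the <img> holding the
;; scan, so its "src" attribute points at the image to download.
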
(def poke-comic-name
  ;; a closure over a counter, so it can be passed directly as the
  ;; comic-name argument of download-comic below
  (let [a (atom 0)]
    (fn [url]
      (File. poke-base-directory (str (swap! a inc) ".jpg")))))

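;; poke-comic-name is stateful: the atom it closes over numbers pages in
;; download order, so successive calls (with any url) yield 1.jpg, 2.jpg, ...
(comment
  (poke-comic-name poke-first-comic)  ; => ~/poke/1.jpg
  (poke-comic-name poke-first-comic)) ; => ~/poke/2.jpg
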
(defn comic-img
  "get the file which we want to download from the webpage"
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; grab all img tags
        images (map attributes (tags-by-name node "img"))
        ;; extract the comic
        comic (first (filter #(= "comic_image" (% "id")) images))]
    (URL. (comic "src"))))

(defn comic-name
  "get the comic's name"
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; grab the select box
        select (first (tags-by-name node "select"))
        ;; the selected option's text is the name of the comic
        selected (first (filter #(= "selected" ((attributes %) "selected"))
                                (tags-by-name select "option")))]
    (File. base-directory
           (sanitize-file-name (str (.getText selected) ".jpg")))))

(defn download-url
  [#^URL url #^File file]
  (if (.exists file)
    (println (.getCanonicalPath file) "already exists.")
    (do
      (println "Copying" (.getPath url) "to" (.getCanonicalPath file))
      (copy (input-stream url) file))))

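;; Usage sketch (the url/file pair here is hypothetical):
(comment
  (download-url (URL. "http://example.com/1.jpg")
                (File. poke-base-directory "1.jpg")))
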
(defn download-comic
  [first-comic next-url comic-img comic-name n]
  ;; create a lazy sequence of web urls by chaining next-url
  (let [urls (take-while (comp not nil?) (iterate next-url first-comic))]
    (dorun (take n (map download-url
                        (map comic-img urls)
                        (map comic-name urls))))))

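;; Tying everything together: fetch the first five pages of each archive
;; with the site-specific helpers defined above (n bounds the downloads):
(comment
  (download-comic first-comic next-url comic-img comic-name 5)
  (download-comic poke-first-comic poke-next-url
                  poke-comic-img poke-comic-name 5))
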
;; first stab at making it more general
(defn download-comic-archive
  [comic-img comic-name comics]
  (dorun (map download-url
              (map comic-img comics)
              (map comic-name comics))))

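;; The generalized version takes an explicit, finite seq of pages instead of
;; deriving one from a next-url function:
(comment
  (download-comic-archive
   comic-img comic-name
   (take 5 (take-while (comp not nil?) (iterate next-url first-comic)))))
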
;; (defn download-comic-chain
;;   [comic-img comic-name next-url first-comic]
;;   (let [comics (take-while (comp not nil?) (iterate next-url first-comic))]
;;     (download-comic-archive comic-img comic-name comics)))

(defn tests []
  (println (comic-name first-comic))
  (println (comic-img first-comic))
  (println (next-url first-comic)))