view src/rlm/pikasemechu.clj @ 0:78a630e650d2

initial import
author Robert McIntyre <rlm@mit.edu>
date Tue, 18 Oct 2011 00:57:08 -0700

(ns rlm.pikasemechu
  (:use rlm.ns-rlm))
(rlm.ns-rlm/ns-clone rlm.light-base)
(use 'clojure.java.io)
(use 'rlm.sanitize-file)
(import org.htmlcleaner.HtmlCleaner)
(import org.htmlcleaner.TagNode)
(import java.io.File)
(import java.net.URL)

(def first-comic (URL. "http://pikasemechu.smackjeeves.com/comics/165598/cover/"))
(def base-directory (file-str "~/pikasemechu"))

(defn tags-by-name
  "return a seq of every element with the given tag name beneath node"
  [#^TagNode node #^String element]
  (seq (.getElementListByName node element true)))

(defn-memo parse
  "parse a web page using HtmlCleaner"
  [#^URL url]
  (.clean (HtmlCleaner.) (input-stream url)))

(defn attributes
  "get a hash map of the attributes of an element"
  [#^TagNode node]
  (into {} (.getAttributes node)))
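
;; Illustrative sketch (not part of the original file): tags-by-name and
;; attributes combine to pull the href out of the first link on a page.
;; Requires network access to the comic site.
(comment
  ((attributes (first (tags-by-name (parse first-comic) "a"))) "href"))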

(defn next-url
  "get the next url to visit from the current page we are visiting"
  [#^URL url]
  (let [;; extract all links
        links (tags-by-name (parse url) "a")
        ;; extract the "Next" link
        next-node (first (filter #(re-matches #".*Next.*" (.getText %)) links))
        next-page (URL. ((attributes next-node) "href"))]
    ;; return nil if this is the last page
    (if (= next-page url) nil next-page)))
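
;; Illustrative sketch (not part of the original file): chaining next-url
;; with iterate yields the lazy sequence of archive pages that
;; download-comic walks further down.  Requires network access.
(comment
  (take 3 (take-while (comp not nil?) (iterate next-url first-comic))))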

(def poke-first-comic
  (URL. "http://www.mangareader.net/458-28199-1/pokemon-adventures/chapter-1.html"))

(def poke-base-directory (file-str "~/poke"))

(defn poke-next-url
  "get the next url to visit from the current page we are visiting"
  [#^URL url]
  (let [;; extract all links
        links (tags-by-name (parse url) "a")
        ;; extract the "Next" link
        next-node (first (filter #(re-matches #".*Next.*" (.getText %)) links))
        ;; mangareader's "Next" links are relative, so prepend the site root
        next-page (URL. (str "http://www.mangareader.net"
                             ((attributes next-node) "href")))]
    ;; return nil if this is the last page
    (if (= next-page url) nil next-page)))

(defn poke-comic-img
  "get the file which we want to download from the webpage"
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; grab the div with id "imgholder", which wraps the comic image
        container (first (filter (fn [node] (= ((attributes node) "id")
                                               "imgholder"))
                                 (tags-by-name node "div")))]
    ;; the comic image is the last child of the first link inside that div
    (URL.
     ((attributes
       (last (.getChildren
              (first
               (tags-by-name container "a"))))) "src"))))

(defn poke-comic-name
  "return a naming function that hands out sequentially numbered .jpg
   files in poke-base-directory, ignoring the url it is given"
  []
  (let [a (atom 0)]
    (fn [url]
      (File. poke-base-directory (str (swap! a inc) ".jpg")))))
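
;; Illustrative sketch (not part of the original file): each call to
;; poke-comic-name builds a fresh counter, so one naming fn should be
;; shared across a whole download run.
(comment
  (let [name-fn (poke-comic-name)]
    [(name-fn poke-first-comic)     ; first file:  .../1.jpg
     (name-fn poke-first-comic)]))  ; second file: .../2.jpg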

(defn comic-img
  "get the file which we want to download from the webpage"
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; grab all img tags
        images (map attributes (tags-by-name node "img"))
        ;; extract the comic
        comic (first (filter #(= "comic_image" (% "id")) images))]
    (URL. (comic "src"))))

(defn comic-name
  "get the comic's name"
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; grab the select box
        select (first (tags-by-name node "select"))
        ;; the selected option's text is the name of this comic page
        selected (first (filter #(= "selected" ((attributes %) "selected"))
                                (tags-by-name node "option")))]
    (File. base-directory
           (sanitize-file-name (str (.getText selected) ".jpg")))))

(defn download-url
  "download url into file, skipping files that already exist"
  [#^URL url #^File file]
  (if (.exists file)
    (println (.getCanonicalPath file) " already exists.")
    (do
      (println "Copying " (.getPath url) " to " (.getCanonicalPath file))
      (copy (input-stream url) file))))
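
;; Illustrative sketch (hypothetical url and target file, not in the
;; original): an existing file is never overwritten, so re-running a
;; download is safe.
(comment
  (download-url (URL. "http://example.com/page-1.jpg")
                (File. base-directory "page-1.jpg")))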

(defn download-comic
  "download the first n pages of a comic, starting at first-comic and
   following next-url from page to page"
  ([first-comic next-url comic-img comic-name n]
     ;; create a lazy sequence of web urls by chaining next-url
     (let [urls (take-while (comp not nil?) (iterate next-url first-comic))]
       (dorun (take n (map download-url
                           (map comic-img urls)
                           (map comic-name urls)))))))
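
;; Illustrative sketch (not part of the original file): wiring the
;; smackjeeves helpers together.  Assumes base-directory exists and the
;; site is reachable; 10 is an arbitrary page count.
(comment
  (download-comic first-comic next-url comic-img comic-name 10))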

;; first stab at making it more general
(defn download-comic-archive
  "download every comic in the sequence comics, using comic-img to find
   each image and comic-name to pick its destination file"
  [comic-img comic-name comics]
  (dorun (map download-url
              (map comic-img comics)
              (map comic-name comics))))
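
;; Illustrative sketch (not part of the original file): the mangareader
;; helpers plug into download-comic-archive once the page sequence is
;; built by hand.  Assumes poke-base-directory exists and the site is
;; reachable; 5 is an arbitrary page count.
(comment
  (let [pages (take 5 (take-while (comp not nil?)
                                  (iterate poke-next-url poke-first-comic)))]
    (download-comic-archive poke-comic-img (poke-comic-name) pages)))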

;; (defn download-comic-chain
;;   [comic-img comic-name next-url first-comic]
;;   [comics (take-while (comp not nil?) (iterate next-url first-comic))]
;;   (download-comics comics comic-img comic-name))

(defn tests []
  (println (comic-name first-comic))
  (println (comic-img first-comic))
  (println (next-url first-comic)))