rlm@0
|
1 (ns rlm.pikasemechu
|
rlm@0
|
2 (:use rlm.ns-rlm))
|
rlm@0
|
3 (rlm.ns-rlm/ns-clone rlm.light-base)
|
rlm@0
|
4 (use 'clojure.java.io)
|
rlm@0
|
5 (use 'rlm.sanitize-file)
|
rlm@0
|
6 (import org.htmlcleaner.HtmlCleaner)
|
rlm@0
|
7 (import org.htmlcleaner.TagNode)
|
rlm@0
|
8 (import java.io.File)
|
rlm@0
|
9 (import java.net.URL)
|
rlm@0
|
10
|
rlm@0
|
;; starting page of the smackjeeves "pikasemechu" archive crawl
(def first-comic (URL. "http://pikasemechu.smackjeeves.com/comics/165598/cover/"))
;; local directory the downloaded comic images are written into
(def base-directory (file-str "~/pikasemechu"))
|
rlm@0
|
13
|
rlm@0
|
(defn tags-by-name
  "Return a seq of every element named `element` anywhere below `node`
  (the search recurses into children), or nil when none are found."
  [#^TagNode node #^String element]
  (-> node
      (.getElementListByName element true)
      seq))
|
rlm@0
|
17
|
rlm@0
|
(defn-memo parse
  "Fetch `url` and run it through HtmlCleaner, returning the root TagNode
  of the cleaned document.  Memoized, so each page is fetched and parsed
  at most once per session."
  [#^URL url]
  (let [cleaner (HtmlCleaner.)]
    (.clean cleaner (input-stream url))))
|
rlm@0
|
22
|
rlm@0
|
(defn attributes
  "Return the attributes of `node` as a Clojure hash map of
  attribute-name string -> value string."
  [#^TagNode node]
  (->> node
       .getAttributes
       (into {})))
|
rlm@0
|
27
|
rlm@0
|
(defn next-url
  "Return the URL of the page after `url` by following the page's
  \"Next\" link.  Returns nil when there is no next page: either the
  page has no \"Next\" link at all, or the link points back at the
  current page (this archive's convention for the last page)."
  [#^URL url]
  (let [;; extract all links
        links (tags-by-name (parse url) "a")
        ;; extract the "Next" link, if the page has one
        next-node (first (filter #(re-matches #".*Next.*" (.getText %)) links))]
    ;; Guard against a missing "Next" link -- without this check,
    ;; (attributes nil) would throw a NullPointerException instead of
    ;; honoring the documented return-nil-on-last-page contract.
    (when next-node
      (let [next-page (URL. ((attributes next-node) "href"))]
        ;; return nil if this is the last page
        (when-not (= next-page url)
          next-page)))))
|
rlm@0
|
39
|
rlm@0
|
;; starting page of the mangareader "Pokemon Adventures" archive crawl
(def poke-first-comic
  (URL. "http://www.mangareader.net/458-28199-1/pokemon-adventures/chapter-1.html"))
;; local directory the downloaded mangareader pages are written into
(def poke-base-directory (file-str "~/poke"))
|
rlm@0
|
44
|
rlm@0
|
(defn poke-next-url
  "Return the URL of the page after `url` by following the page's
  \"Next\" link (mangareader uses site-relative hrefs, so the host
  prefix is prepended).  Returns nil when there is no next page: either
  no \"Next\" link exists, or it points back at the current page."
  [#^URL url]
  (let [;; extract all links
        links (tags-by-name (parse url) "a")
        ;; extract the "Next" link, if the page has one
        next-node (first (filter #(re-matches #".*Next.*" (.getText %)) links))]
    ;; Guard against a missing "Next" link -- without this check,
    ;; (attributes nil) would throw a NullPointerException instead of
    ;; honoring the documented return-nil-on-last-page contract.
    (when next-node
      (let [next-page
            (URL. (str "http://www.mangareader.net" ((attributes next-node) "href")))]
        ;; return nil if this is the last page
        (when-not (= next-page url)
          next-page)))))
|
rlm@0
|
58
|
rlm@0
|
(defn poke-comic-img
  "Return the URL of the comic image embedded in the mangareader page at
  `url`.  The image lives inside the div whose id is \"imgholder\": it is
  the last child of the first anchor within that div."
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; find the div with id "imgholder" that wraps the comic image
        container (first (filter (fn [div] (= "imgholder" ((attributes div) "id")))
                                 (tags-by-name node "div")))
        ;; the comic <img> is the last child of the first anchor inside it
        anchor (first (tags-by-name container "a"))
        image  (last (.getChildren anchor))]
    ;; extract the image's source URL
    (URL. ((attributes image) "src"))))
|
rlm@0
|
74
|
rlm@0
|
(defn poke-comic-name
  "Return a stateful naming function: each call on the returned fn
  ignores its url argument and yields the next sequentially numbered
  destination file (1.jpg, 2.jpg, ...) under poke-base-directory.
  NOTE(review): callers must invoke (poke-comic-name) once and reuse the
  returned fn, otherwise the counter restarts at 1."
  []
  (let [counter (atom 0)]
    (fn [url]
      (File. poke-base-directory
             (str (swap! counter inc) ".jpg")))))
|
rlm@0
|
80
|
rlm@0
|
(defn comic-img
  "Return the URL of the comic image on the smackjeeves page at `url`,
  identified as the <img> element whose id attribute is \"comic_image\"."
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; attribute maps of every img tag on the page
        image-attrs (map attributes (tags-by-name node "img"))
        ;; the one whose id marks it as the comic itself
        comic (first (filter (fn [attrs] (= "comic_image" (attrs "id")))
                             image-attrs))]
    (URL. (comic "src"))))
|
rlm@0
|
91
|
rlm@0
|
(defn comic-name
  "Return the destination File (under base-directory) for the comic at
  `url`, named after the currently selected entry of the page's
  navigation drop-down, sanitized for use as a file name."
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; the selected <option> is the name of the current comic page
        ;; (the unused `select` binding from the original was removed)
        selected (first (filter #(= "selected" ((attributes %) "selected"))
                                (tags-by-name node "option")))]
    (File. base-directory
           (sanitize-file-name (str (.getText selected) ".jpg")))))
|
rlm@0
|
104
|
rlm@0
|
(defn download-url
  "Download the contents of `url` into `file`, skipping the download if
  the file already exists.  Creates the destination's parent directories
  when needed, and closes the input stream once the copy finishes."
  [#^URL url #^File file]
  (if (.exists file)
    (println (.getCanonicalPath file) " already exists.")
    (do
      (println "Copying " (.getPath url) " to " (.getCanonicalPath file))
      ;; make sure the target directory exists before copying into it
      (when-let [parent (.getParentFile file)]
        (.mkdirs parent))
      ;; with-open closes the stream; io/copy alone would leak it
      (with-open [in (input-stream url)]
        (copy in file)))))
|
rlm@0
|
112
|
rlm@0
|
(defn download-comic
  "Crawl the archive starting at `first-comic`, following `next-url`
  until it yields nil, and download at most `n` pages.  `comic-img` maps
  a page URL to its image URL; `comic-name` maps a page URL to the
  destination File."
  ([first-comic next-url comic-img comic-name n]
     ;; lazily chain next-url to enumerate the archive's page urls
     (let [urls (take-while #(not (nil? %)) (iterate next-url first-comic))]
       (dorun
        (map (fn [page]
               (download-url (comic-img page) (comic-name page)))
             (take n urls))))))
|
rlm@0
|
120
|
rlm@0
|
121 ;; first stab at making it more general
|
rlm@0
|
;; first stab at making it more general
(defn download-comic-archive
  "Download every page in `comics`: for each page URL, fetch the image
  given by `comic-img` into the file given by `comic-name`."
  [comic-img comic-name comics]
  (dorun
   (map (fn [page]
          (download-url (comic-img page) (comic-name page)))
        comics)))
|
rlm@0
|
127
|
rlm@0
|
128 ;; (defn download-comic-chain
|
rlm@0
|
129 ;; [comic-img comic-name next-url first-comic]
|
rlm@0
|
130 ;; [comics (take-while (comp not nil?) (iterate next-url first-comic))]
|
rlm@0
|
131 ;; (download-comics comics comic-img comic-name))
|
rlm@0
|
132
|
rlm@0
|
(defn tests
  "Smoke test: print the name, image URL, and next-page URL derived from
  the first comic page."
  []
  ;; same order as before: name, image, next page
  (doseq [extract [comic-name comic-img next-url]]
    (println (extract first-comic))))
|