Mercurial > rlm
comparison: src/rlm/pikasemechu.clj @ 0:78a630e650d2
initial import
author: Robert McIntyre <rlm@mit.edu>
date: Tue, 18 Oct 2011 00:57:08 -0700
parents: (none)
children: (none)
comparison view: equal / deleted / inserted / replaced
revisions compared: -1:000000000000 → 0:78a630e650d2
1 (ns rlm.pikasemechu | |
2 (:use rlm.ns-rlm)) | |
3 (rlm.ns-rlm/ns-clone rlm.light-base) | |
4 (use 'clojure.java.io) | |
5 (use 'rlm.sanitize-file) | |
6 (import org.htmlcleaner.HtmlCleaner) | |
7 (import org.htmlcleaner.TagNode) | |
8 (import java.io.File) | |
9 (import java.net.URL) | |
10 | |
;; URL of the first page (the cover) of the pikasemechu comic archive.
(def first-comic (URL. "http://pikasemechu.smackjeeves.com/comics/165598/cover/"))
;; Local directory where downloaded pikasemechu pages are saved.
;; NOTE(review): assumes file-str expands "~" to the home directory -- confirm
;; against the rlm/clojure-contrib implementation in scope here.
(def base-directory (file-str "~/pikasemechu"))
13 | |
(defn tags-by-name
  "Return a seq of every element named `element` in the tree rooted at
  `node`, searching recursively; nil when no such elements exist."
  [#^TagNode node #^String element]
  (let [matches (.getElementListByName node element true)]
    (seq matches)))
17 | |
(defn-memo parse
  "Fetch `url` and parse it with HtmlCleaner, returning the root TagNode.
  Memoized, so each page is downloaded and parsed at most once."
  [#^URL url]
  (let [cleaner (HtmlCleaner.)]
    (.clean cleaner (input-stream url))))
22 | |
(defn attributes
  "Return the attributes of `node` as a Clojure hash-map of
  attribute name -> attribute value."
  [#^TagNode node]
  (reduce conj {} (.getAttributes node)))
27 | |
(defn next-url
  "Return the URL of the page after `url` in the comic archive, found by
  following the page's \"Next\" link.  Returns nil when the page has no
  \"Next\" link, or when that link points back at `url` (the last page)."
  [#^URL url]
  (let [;; extract all links on the page
        links (tags-by-name (parse url) "a")
        ;; extract the "Next" link
        next-node (first (filter #(re-matches #".*Next.*" (.getText %)) links))]
    ;; Guard against pages with no "Next" link at all: the original code
    ;; would throw a NullPointerException from (attributes nil).
    (when next-node
      (let [next-page (URL. ((attributes next-node) "href"))]
        ;; return nil if this is the last page
        (when-not (= next-page url)
          next-page)))))
39 | |
;; URL of the first page of Pokemon Adventures on mangareader.net.
(def poke-first-comic
  (URL. "http://www.mangareader.net/458-28199-1/pokemon-adventures/chapter-1.html"))

;; Local directory where downloaded Pokemon Adventures pages are saved.
;; NOTE(review): assumes file-str expands "~" to the home directory -- confirm.
(def poke-base-directory (file-str "~/poke"))
44 | |
(defn poke-next-url
  "Return the URL of the page after `url` on mangareader.net, found by
  following the page's \"Next\" link.  The site's links are host-relative,
  so the host prefix is re-attached.  Returns nil when the page has no
  \"Next\" link, or when that link points back at `url` (the last page)."
  [#^URL url]
  (let [;; extract all links on the page
        links (tags-by-name (parse url) "a")
        ;; extract the "Next" link
        next-node (first (filter #(re-matches #".*Next.*" (.getText %)) links))]
    ;; Guard against pages with no "Next" link, consistently with next-url:
    ;; the original code would NPE from (attributes nil).
    (when next-node
      (let [next-page (URL. (str "http://www.mangareader.net"
                                 ((attributes next-node) "href")))]
        ;; return nil if this is the last page
        (when-not (= next-page url)
          next-page)))))
58 | |
(defn poke-comic-img
  "Return the URL of the comic image on the mangareader.net page `url`.
  The image lives inside the div whose id is \"imgholder\"."
  [#^URL url]
  (let [page (parse url)
        ;; predicate: is this div the image container?
        imgholder? (fn [div] (= ((attributes div) "id") "imgholder"))
        container (first (filter imgholder? (tags-by-name page "div")))
        ;; the comic image is the last child of the first <a> in the holder
        img-node (-> (tags-by-name container "a")
                     first
                     .getChildren
                     last)]
    (URL. ((attributes img-node) "src"))))
74 | |
(defn poke-comic-name []
  ;; NOTE(review): because this is a defn, every call to (poke-comic-name)
  ;; builds a *fresh* counter starting at 0, so two calls yield independent
  ;; numbering sequences.  If one shared counter was intended, this should
  ;; probably be a def binding the closure directly -- confirm intent.
  (let [a (atom 0)]
    (fn
      ;; Return the next numbered file (1.jpg, 2.jpg, ...) under
      ;; poke-base-directory; `url` is ignored -- naming is purely sequential.
      [url]
      (File. poke-base-directory (str (swap! a inc) ".jpg")))))
80 | |
(defn comic-img
  "Return the URL of the comic image on the smackjeeves page `url`:
  the <img> tag whose id attribute is \"comic_image\"."
  [#^URL url]
  (let [page (parse url)
        ;; attribute maps of every <img> on the page
        all-imgs (map attributes (tags-by-name page "img"))
        ;; the comic is the image tagged id="comic_image"
        comic-attrs (some (fn [attrs]
                            (when (= "comic_image" (attrs "id"))
                              attrs))
                          all-imgs)]
    (URL. (comic-attrs "src"))))
91 | |
(defn comic-name
  "Return the File (under base-directory) to save the comic at `url` as.
  The name is taken from the currently-selected <option> of the page's
  navigation drop-down, sanitized for use as a file name."
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; the selected <option> carries this page's title
        selected (first (filter #(= "selected" ((attributes %) "selected"))
                                (tags-by-name node "option")))]
    ;; (The original also bound the <select> box itself to `select`,
    ;; but never used it; that unused binding has been removed.)
    (File. base-directory
           (sanitize-file-name (str (.getText selected) ".jpg")))))
104 | |
(defn download-url
  "Copy the resource at `url` into `file`.  If `file` already exists the
  copy is skipped and a message is printed instead."
  [#^URL url #^File file]
  (if-not (.exists file)
    (do (println "Copying " (.getPath url) " to " (.getCanonicalPath file))
        (copy (input-stream url) file))
    (println (.getCanonicalPath file) " already exists.")))
112 | |
(defn download-comic
  "Walk a comic archive starting at `first-comic`, following `next-url`
  from page to page (stopping at the first nil), and download at most `n`
  pages.  `comic-img` maps a page URL to its image URL; `comic-name` maps
  a page URL to the local File to save it as."
  ([first-comic next-url comic-img comic-name n]
     ;; lazy chain of page urls produced by iterating next-url
     (let [urls (take-while (comp not nil?) (iterate next-url first-comic))
           sources (map comic-img urls)
           targets (map comic-name urls)]
       (dorun (take n (map download-url sources targets))))))
120 | |
;; first stab at making it more general
(defn download-comic-archive
  "Download every page in the sequence `comics`: for each page URL, fetch
  the image given by `comic-img` and save it to the File given by
  `comic-name`."
  [comic-img comic-name comics]
  (doseq [comic comics]
    (download-url (comic-img comic) (comic-name comic))))
127 | |
128 ;; (defn download-comic-chain | |
129 ;; [comic-img comic-name next-url first-comic] | |
130 ;; [comics (take-while (comp not nil?) (iterate next-url first-comic))] | |
131 ;; (download-comics comics comic-img comic-name)) | |
132 | |
(defn tests
  "Smoke test: print the local file name, image URL, and next-page URL
  for the first pikasemechu comic."
  []
  (doseq [extract [comic-name comic-img next-url]]
    (println (extract first-comic))))