Mercurial > rlm
comparison: src/rlm/pikasemechu.clj @ 0:78a630e650d2
initial import
author: Robert McIntyre <rlm@mit.edu>
date: Tue, 18 Oct 2011 00:57:08 -0700
parents: (none)
children: (none)
comparison view: equal / deleted / inserted / replaced
revisions compared: -1:000000000000 → 0:78a630e650d2
1 (ns rlm.pikasemechu | |
2 (:use rlm.ns-rlm)) | |
3 (rlm.ns-rlm/ns-clone rlm.light-base) | |
4 (use 'clojure.java.io) | |
5 (use 'rlm.sanitize-file) | |
6 (import org.htmlcleaner.HtmlCleaner) | |
7 (import org.htmlcleaner.TagNode) | |
8 (import java.io.File) | |
9 (import java.net.URL) | |
10 | |
;; URL of the first page (the cover) of the pikasemechu comic archive.
(def first-comic (URL. "http://pikasemechu.smackjeeves.com/comics/165598/cover/"))
;; Local directory where downloaded pikasemechu pages are saved.
;; NOTE(review): assumes file-str expands "~" to the home directory -- confirm
;; against the rlm/clojure-contrib implementation in scope here.
(def base-directory (file-str "~/pikasemechu"))
13 | |
(defn tags-by-name
  "Return a seq of every element named `element` in the tree rooted at
  `node`, searching recursively; nil when no such elements exist."
  [#^TagNode node #^String element]
  (let [matches (.getElementListByName node element true)]
    (seq matches)))
17 | |
(defn-memo parse
  "Fetch `url` and parse it with HtmlCleaner, returning the root TagNode.
  Memoized, so each page is downloaded and parsed at most once."
  [#^URL url]
  (let [cleaner (HtmlCleaner.)]
    (.clean cleaner (input-stream url))))
22 | |
(defn attributes
  "Return the attributes of `node` as a Clojure hash-map of
  attribute name -> attribute value."
  [#^TagNode node]
  (reduce conj {} (.getAttributes node)))
27 | |
(defn next-url
  "Return the URL of the page after `url` in the comic archive, found by
  following the page's \"Next\" link.  Returns nil when the page has no
  \"Next\" link, or when that link points back at `url` (the last page)."
  [#^URL url]
  (let [;; extract all links on the page
        links (tags-by-name (parse url) "a")
        ;; extract the "Next" link
        next-node (first (filter #(re-matches #".*Next.*" (.getText %)) links))]
    ;; Guard against pages with no "Next" link at all: the original code
    ;; would throw a NullPointerException from (attributes nil).
    (when next-node
      (let [next-page (URL. ((attributes next-node) "href"))]
        ;; return nil if this is the last page
        (when-not (= next-page url)
          next-page)))))
39 | |
;; URL of the first page of Pokemon Adventures on mangareader.net.
(def poke-first-comic
  (URL. "http://www.mangareader.net/458-28199-1/pokemon-adventures/chapter-1.html"))

;; Local directory where downloaded Pokemon Adventures pages are saved.
;; NOTE(review): assumes file-str expands "~" to the home directory -- confirm.
(def poke-base-directory (file-str "~/poke"))
44 | |
(defn poke-next-url
  "Return the URL of the page after `url` on mangareader.net, found by
  following the page's \"Next\" link.  The site's links are host-relative,
  so the host prefix is re-attached.  Returns nil when the page has no
  \"Next\" link, or when that link points back at `url` (the last page)."
  [#^URL url]
  (let [;; extract all links on the page
        links (tags-by-name (parse url) "a")
        ;; extract the "Next" link
        next-node (first (filter #(re-matches #".*Next.*" (.getText %)) links))]
    ;; Guard against pages with no "Next" link, consistently with next-url:
    ;; the original code would NPE from (attributes nil).
    (when next-node
      (let [next-page (URL. (str "http://www.mangareader.net"
                                 ((attributes next-node) "href")))]
        ;; return nil if this is the last page
        (when-not (= next-page url)
          next-page)))))
58 | |
(defn poke-comic-img
  "Return the URL of the comic image on the mangareader.net page `url`.
  The image lives inside the div whose id is \"imgholder\"."
  [#^URL url]
  (let [page (parse url)
        ;; predicate: is this div the image container?
        imgholder? (fn [div] (= ((attributes div) "id") "imgholder"))
        container (first (filter imgholder? (tags-by-name page "div")))
        ;; the comic image is the last child of the first <a> in the holder
        img-node (-> (tags-by-name container "a")
                     first
                     .getChildren
                     last)]
    (URL. ((attributes img-node) "src"))))
74 | |
(defn poke-comic-name []
  ;; NOTE(review): because this is a defn, every call to (poke-comic-name)
  ;; builds a *fresh* counter starting at 0, so two calls yield independent
  ;; numbering sequences.  If one shared counter was intended, this should
  ;; probably be a def binding the closure directly -- confirm intent.
  (let [a (atom 0)]
    (fn
      ;; Return the next numbered file (1.jpg, 2.jpg, ...) under
      ;; poke-base-directory; `url` is ignored -- naming is purely sequential.
      [url]
      (File. poke-base-directory (str (swap! a inc) ".jpg")))))
80 | |
(defn comic-img
  "Return the URL of the comic image on the smackjeeves page `url`:
  the <img> tag whose id attribute is \"comic_image\"."
  [#^URL url]
  (let [page (parse url)
        ;; attribute maps of every <img> on the page
        all-imgs (map attributes (tags-by-name page "img"))
        ;; the comic is the image tagged id="comic_image"
        comic-attrs (some (fn [attrs]
                            (when (= "comic_image" (attrs "id"))
                              attrs))
                          all-imgs)]
    (URL. (comic-attrs "src"))))
91 | |
(defn comic-name
  "Return the File (under base-directory) to save the comic at `url` as.
  The name is taken from the currently-selected <option> of the page's
  navigation drop-down, sanitized for use as a file name."
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; the selected <option> carries this page's title
        selected (first (filter #(= "selected" ((attributes %) "selected"))
                                (tags-by-name node "option")))]
    ;; (The original also bound the <select> box itself to `select`,
    ;; but never used it; that unused binding has been removed.)
    (File. base-directory
           (sanitize-file-name (str (.getText selected) ".jpg")))))
104 | |
(defn download-url
  "Copy the resource at `url` into `file`.  If `file` already exists the
  copy is skipped and a message is printed instead."
  [#^URL url #^File file]
  (if-not (.exists file)
    (do (println "Copying " (.getPath url) " to " (.getCanonicalPath file))
        (copy (input-stream url) file))
    (println (.getCanonicalPath file) " already exists.")))
112 | |
(defn download-comic
  "Walk a comic archive starting at `first-comic`, following `next-url`
  from page to page (stopping at the first nil), and download at most `n`
  pages.  `comic-img` maps a page URL to its image URL; `comic-name` maps
  a page URL to the local File to save it as."
  ([first-comic next-url comic-img comic-name n]
     ;; lazy chain of page urls produced by iterating next-url
     (let [urls (take-while (comp not nil?) (iterate next-url first-comic))
           sources (map comic-img urls)
           targets (map comic-name urls)]
       (dorun (take n (map download-url sources targets))))))
120 | |
;; first stab at making it more general
(defn download-comic-archive
  "Download every page in the sequence `comics`: for each page URL, fetch
  the image given by `comic-img` and save it to the File given by
  `comic-name`."
  [comic-img comic-name comics]
  (doseq [comic comics]
    (download-url (comic-img comic) (comic-name comic))))
127 | |
128 ;; (defn download-comic-chain | |
129 ;; [comic-img comic-name next-url first-comic] | |
130 ;; [comics (take-while (comp not nil?) (iterate next-url first-comic))] | |
131 ;; (download-comics comics comic-img comic-name)) | |
132 | |
(defn tests
  "Smoke test: print the local file name, image URL, and next-page URL
  for the first pikasemechu comic."
  []
  (doseq [extract [comic-name comic-img next-url]]
    (println (extract first-comic))))