Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- (ns word-crawler.service
- (:require [clj-http.client :as client]
- [clj-http.cookies :as cookies]
- [word-crawler.config :refer [env]]
- [mount.core :refer [defstate]]
- [clojure.core.async :refer [chan close! >! <!! go go-loop <! thread]]
- [lambdaisland.uri :refer [uri]]
- [remus :as xml]
- [clojure.tools.logging :as log]))
;; Shared queue of pending search requests. Producers put maps carrying
;; :word and :res-fn (and optionally :engine) on it; the crawler loop takes
;; them off. The buffer size is an integer literal rather than the
;; floating-point `1e6`.
(def ^:private word-requests (chan 1000000))

;; Forward declaration: `run-crawler` is defined further down in this file
;; but is referenced by the `word-crawler` state below.
(declare run-crawler)

;; mount state for the crawler loop.
;; :start launches the loop through `run-crawler` and keeps whatever it returns;
;; :stop calls `close!` on that value.
;; NOTE(review): whether closing it actually halts the loop depends on what
;; `run-crawler` returns - confirm against its implementation.
(defstate word-crawler
  :start (run-crawler)
  :stop (close! word-crawler))
;; Dispatches a search request on its :engine key. Requests with no :engine,
;; or with one that has no registered method, are handled by the :bing method.
(defmulti ^:private search-engine (fn [msg] (:engine msg)) :default :bing)

;; The cookie store is shared by every Bing request issued from this namespace.
(let [bing-cookies-store (cookies/cookie-store)]
  ;; Bing implementation: asks Bing for its RSS rendering of the results for
  ;; `word` and passes the collection of result links to `res-fn`.
  ;; On any failure (including the initial cookie-priming request) the error
  ;; is logged with its stack trace and `res-fn` receives an empty vector, so
  ;; a caller waiting on the result is never left hanging.
  (defmethod search-engine :bing
    [{:keys [word res-fn]}]
    (try
      ;; Prime the cookie store with a cheap HEAD request the first time
      ;; through. Kept inside the `try` so that a network failure here still
      ;; ends in a `res-fn` call.
      (when (empty? (cookies/get-cookies bing-cookies-store))
        (client/head "https://www.bing.com"
                     {:cookie-store bing-cookies-store}))
      (let [response (client/get "https://www.bing.com/search?"
                                 {:query-params {:q word
                                                 :count 10
                                                 :format "rss"}
                                  :cookie-store bing-cookies-store
                                  :as :stream})
            ;; Close the streamed body once parsing is done; `mapv` realises
            ;; the links before the stream is closed.
            links (with-open [^java.io.Closeable body (:body response)]
                    (->> body xml/parse-stream :entries (mapv :link)))]
        (res-fn links))
      (catch Exception e
        ;; Passing a message after the throwable makes tools.logging treat
        ;; `e` as the cause and keep its stack trace.
        (log/error e "Bing search failed for word:" word)
        (res-fn [])))))
(defn- run-crawler
  "Starts the loop that drains `word-requests` and runs each request through
  `search-engine` on its own thread, with at most `:max-http-connection`
  (default 10, minimum 1) requests in flight at a time.

  Concurrency is bounded by a permit channel whose buffer size is the limit:
  the loop puts one permit before taking a request (parking, not spinning,
  while all permits are in use) and each worker thread removes one permit
  when it finishes, whether or not the search threw.

  Returns the permit channel. Closing it stops the loop: the next attempt to
  acquire a permit fails and the loop exits. A loop already parked waiting
  for a request dispatches that one request before it notices the close."
  []
  (let [max-http-connection (max 1 (or (env :max-http-connection) 10))
        permits (chan max-http-connection)]
    (go-loop []
      ;; `>!` yields false once `permits` has been closed, ending the loop.
      (when (>! permits :permit)
        ;; A nil take means `word-requests` was closed; stop in that case too.
        (when-some [msg (<! word-requests)]
          (thread
            (try
              (search-engine msg)
              (catch Throwable t
                (log/error t "Search request failed for word:" (:word msg)))
              (finally
                ;; Release this request's permit so the loop can continue.
                (<!! permits))))
          (recur))))
    permits))
(defn- get-sld
  "Returns the last two dot-separated labels of the host string `url`
  (e.g. \"www.example.com\" -> \"example.com\").
  Returns nil when `url` is nil or contains no dot."
  [url]
  ;; Guard against nil: `re-find` throws on a nil input string.
  (when url
    (re-find #"[^.]+\.[^.]+$" url)))
- #_(time (let [words ["Clojure" "Python" "Delphi" "Scala" "JS" "NodeJS" "Kotlin" "Haskell" "tmp" "Oracle" "Kafka"
- "Kafka""Kafka""Kafka""Kafka""Kafka""Kafka""Kafka""Kafka""Kafka""Kafka""Kafka"]
- c (chan (count words))
- res-fn (fn [res] (go (>! c res)))
- results (atom [])]
- (doseq [w words]
- (go (>! word-requests {:word w :res-fn res-fn})))
- (doseq [w words]
- (let [value (<!! c)]
- (swap! results concat value)))
- (close! c)
- (->> @results distinct (map #(-> % uri :host get-sld)) frequencies)))
(defn- words->links
  "Queues one search request per element of `words` on `word-requests` and
  blocks until every request has reported back through its `:res-fn`.
  Returns a vector with all reported links concatenated, duplicates included.
  Order follows the order in which results arrive, not the order of `words`."
  [words]
  (let [c (chan (count words))
        res-fn (fn [res] (go (>! c res)))]
    (doseq [w words]
      (go (>! word-requests {:word w :res-fn res-fn})))
    ;; Take exactly one result per word and splice them together eagerly.
    ;; This avoids the deeply nested lazy `concat` chain that repeated
    ;; `swap! ... concat` on an atom would build.
    (let [links (into [] (mapcat (fn [_] (<!! c))) words)]
      (close! c)
      links)))
(defn words-frequencies
  "Searches for every word in `words` and returns a map from second-level
  domain (as computed by `get-sld`) to the number of distinct result links
  on that domain.
  Links that are nil, have no host, or whose host yields no domain are
  skipped instead of raising or being counted under a nil key."
  [words]
  (->> (words->links words)
       distinct
       ;; `some->` stops at the first nil step, so `get-sld` never sees nil.
       (keep #(some-> % uri :host get-sld))
       frequencies))
- #_(let []
- (future (time (words->links (take 100 (repeat "clojure")))))
- (future (time (words->links (take 100 (repeat "scala")))))
- (future (time (words->links (take 100 (repeat "kotlin")))))
- (future (time (words->links (take 100 (repeat "delphi"))))))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement