#include <condition_variable>
#include <mutex>
#include <queue>
#include <set>
#include <string>
#include <thread>
#include <utility>
#include <vector>

using namespace std;

// Stub helper: in a real crawler, parse() would return a page's content and
// fetch() would return the page's outgoing links.
class WebCrawlerHelper {
public:
    string parse(string url) { return ""; }
    vector<string> fetch(string url) { return vector<string>(); }
    void writeToDisk(string url) {}
};

class WebCrawler {
public:
    // Crawls breadth-first from starting_url up to max_depth using num_threads worker threads.
    // Returns (url, content) pairs for every page visited.
    vector<pair<string, string>> Craw(int num_threads, string starting_url, int max_depth) {
        crawled_url_set_.insert(starting_url);
        url_queue_.push({0, starting_url});
        max_depth_ = max_depth;
        vector<thread> threads;
        for (int i = 0; i < num_threads; ++i) {
            threads.emplace_back(&WebCrawler::crawlerWorker, this);
        }
        for (thread& t : threads) {
            t.join();
        }
        return crawler_result_;
    }
private:
    void crawlerWorker() {
        while (true) {
            unique_lock<mutex> lock(mutex_);
            // Wait until there is work, or until the queue is empty and no thread
            // is still working (so no new URLs can ever appear).
            cv_.wait(lock, [this] { return !url_queue_.empty() || num_working_threads_ == 0; });
            if (url_queue_.empty()) {
                break;
            }
            pair<int, string> current_url_pair = url_queue_.front();
            url_queue_.pop();
            ++num_working_threads_;
            lock.unlock();  // release the lock while doing the slow fetch/parse work

            int current_depth = current_url_pair.first;
            string current_url = current_url_pair.second;
            string webpage_content = helper_.parse(current_url);
            vector<string> next_urls;
            if (current_depth + 1 <= max_depth_) {
                next_urls = helper_.fetch(current_url);
            }

            lock.lock();  // re-acquire to publish the result and enqueue new URLs
            crawler_result_.push_back({current_url, webpage_content});
            for (string& url : next_urls) {
                if (crawled_url_set_.find(url) == crawled_url_set_.end()) {
                    crawled_url_set_.insert(url);
                    url_queue_.push({current_depth + 1, url});
                }
            }
            --num_working_threads_;
            cv_.notify_all();  // wake waiters: new work was added, or crawling may be finished
            lock.unlock();
        }
    }
    int max_depth_;
    WebCrawlerHelper helper_;
    queue<pair<int, string>> url_queue_;          // (depth, url) pairs still to be crawled
    set<string> crawled_url_set_;                 // URLs already enqueued or crawled
    vector<pair<string, string>> crawler_result_; // (url, page content) pairs collected so far
    int num_working_threads_ = 0;                 // threads currently crawling outside the lock
    mutex mutex_;
    condition_variable cv_;
};
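
A minimal usage sketch follows. The starting URL, thread count, and depth are made-up example values; with the stub WebCrawlerHelper above, the crawl returns only the starting URL with empty content.

#include <iostream>

int main() {
    WebCrawler crawler;
    // Hypothetical parameters for illustration only.
    vector<pair<string, string>> pages = crawler.Craw(/*num_threads=*/4,
                                                      /*starting_url=*/"http://example.com",
                                                      /*max_depth=*/2);
    for (const pair<string, string>& page : pages) {
        cout << page.first << " -> " << page.second.size() << " bytes\n";
    }
    return 0;
}

The num_working_threads_ counter together with notify_all is what lets idle workers tell "queue temporarily empty" apart from "crawl finished": a worker only exits when the queue is empty and no other worker is still crawling, i.e. no new URLs can ever be enqueued.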