lichenran1234

WebCrawler

Apr 11th, 2021
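A multithreaded breadth-first web crawler in C++. A fixed pool of worker threads shares a (depth, URL) queue guarded by a mutex; a condition variable wakes workers when new URLs are queued, and lets them all exit once the queue is empty while no worker is still in flight.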
#include <condition_variable>
#include <mutex>
#include <queue>
#include <set>
#include <string>
#include <thread>
#include <utility>
#include <vector>

using namespace std;

// Stubbed-out I/O layer: a real implementation would download the page
// and extract its outgoing links.
class WebCrawlerHelper {
 public:
    string parse(string url) { return ""; }                        // page content
    vector<string> fetch(string url) { return vector<string>(); }  // outgoing links
    void writeToDisk(string url) {}
};

class WebCrawler {
 public:
    // Breadth-first crawl from starting_url up to max_depth, using a pool
    // of num_threads workers. Returns (url, content) pairs.
    vector<pair<string, string>> Crawl(int num_threads, string starting_url, int max_depth) {
        crawled_url_set_.insert(starting_url);
        url_queue_.push({0, starting_url});
        max_depth_ = max_depth;

        vector<thread> threads;
        for (int i = 0; i < num_threads; ++i) {
            threads.push_back(thread(&WebCrawler::crawlerWorker, this));
        }

        for (thread& t : threads) {
            t.join();
        }

        return crawler_result_;
    }

 private:
    void crawlerWorker() {
        while (true) {
            unique_lock<mutex> lock(mutex_);
            // Sleep until there is work, or until the queue is empty and no
            // worker is in flight (meaning no new URLs can ever appear).
            cv_.wait(lock, [this] { return !url_queue_.empty() || num_working_threads_ == 0; });
            if (url_queue_.empty()) {
                break;  // crawl finished: nothing queued and nobody working
            }
            pair<int, string> current_url_pair = url_queue_.front();
            url_queue_.pop();
            ++num_working_threads_;
            lock.unlock();

            // Do the slow network work outside the lock.
            int current_depth = current_url_pair.first;
            string current_url = current_url_pair.second;
            string webpage_content = helper_.parse(current_url);
            vector<string> next_urls;
            if (current_depth + 1 <= max_depth_) {
                next_urls = helper_.fetch(current_url);
            }

            lock.lock();
            crawler_result_.push_back({current_url, webpage_content});
            for (string& url : next_urls) {
                // Deduplicate so each URL is enqueued at most once.
                if (crawled_url_set_.find(url) == crawled_url_set_.end()) {
                    crawled_url_set_.insert(url);
                    url_queue_.push({current_depth + 1, url});
                }
            }
            --num_working_threads_;
            // Wake the other workers: either new URLs were just queued, or this
            // was the last working thread and everyone should exit.
            cv_.notify_all();
            lock.unlock();
        }
    }

    int max_depth_;

    WebCrawlerHelper helper_;

    queue<pair<int, string>> url_queue_;  // (depth, url) frontier
    set<string> crawled_url_set_;         // URLs already seen
    vector<pair<string, string>> crawler_result_;

    int num_working_threads_ = 0;  // workers currently fetching; guarded by mutex_
    mutex mutex_;
    condition_variable cv_;
};
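For reference, a minimal driver might look like the sketch below; the thread count, starting URL, and depth limit are placeholder values. With the stub helper above, fetch() returns no links, so the crawl visits only the starting page; a real helper would make it traverse the link graph breadth-first.

#include <iostream>

int main() {
    WebCrawler crawler;
    // Placeholder arguments: 4 worker threads, depth limit 2.
    vector<pair<string, string>> pages =
        crawler.Crawl(4, "https://example.com", 2);
    for (const pair<string, string>& page : pages) {
        cout << page.first << " -> " << page.second.size() << " bytes\n";
    }
    return 0;
}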