lichenran1234

WebCrawler

Apr 11th, 2021
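A multithreaded breadth-first web crawler in C++. A fixed pool of worker threads shares a (depth, URL) queue guarded by a mutex; a condition variable wakes workers when new URLs are queued, and lets them all exit once the queue is empty while no worker is still in flight.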
#include <condition_variable>
#include <mutex>
#include <queue>
#include <set>
#include <string>
#include <thread>
#include <utility>
#include <vector>

using namespace std;

// Stubbed-out I/O layer: a real implementation would download the page
// and extract its outgoing links.
class WebCrawlerHelper {
 public:
    string parse(string url) { return ""; }                        // page content
    vector<string> fetch(string url) { return vector<string>(); }  // outgoing links
    void writeToDisk(string url) {}
};

class WebCrawler {
 public:
    // Breadth-first crawl from starting_url up to max_depth, using a pool
    // of num_threads workers. Returns (url, content) pairs.
    vector<pair<string, string>> Crawl(int num_threads, string starting_url, int max_depth) {
        crawled_url_set_.insert(starting_url);
        url_queue_.push({0, starting_url});
        max_depth_ = max_depth;

        vector<thread> threads;
        for (int i = 0; i < num_threads; ++i) {
            threads.push_back(thread(&WebCrawler::crawlerWorker, this));
        }

        for (thread& t : threads) {
            t.join();
        }

        return crawler_result_;
    }

 private:
    void crawlerWorker() {
        while (true) {
            unique_lock<mutex> lock(mutex_);
            // Sleep until there is work, or until the queue is empty and no
            // worker is in flight (meaning no new URLs can ever appear).
            cv_.wait(lock, [this] { return !url_queue_.empty() || num_working_threads_ == 0; });
            if (url_queue_.empty()) {
                break;  // crawl finished: nothing queued and nobody working
            }
            pair<int, string> current_url_pair = url_queue_.front();
            url_queue_.pop();
            ++num_working_threads_;
            lock.unlock();

            // Do the slow network work outside the lock.
            int current_depth = current_url_pair.first;
            string current_url = current_url_pair.second;
            string webpage_content = helper_.parse(current_url);
            vector<string> next_urls;
            if (current_depth + 1 <= max_depth_) {
                next_urls = helper_.fetch(current_url);
            }

            lock.lock();
            crawler_result_.push_back({current_url, webpage_content});
            for (string& url : next_urls) {
                // Deduplicate so each URL is enqueued at most once.
                if (crawled_url_set_.find(url) == crawled_url_set_.end()) {
                    crawled_url_set_.insert(url);
                    url_queue_.push({current_depth + 1, url});
                }
            }
            --num_working_threads_;
            // Wake the other workers: either new URLs were just queued, or this
            // was the last working thread and everyone should exit.
            cv_.notify_all();
            lock.unlock();
        }
    }

    int max_depth_;

    WebCrawlerHelper helper_;

    queue<pair<int, string>> url_queue_;  // (depth, url) frontier
    set<string> crawled_url_set_;         // URLs already seen
    vector<pair<string, string>> crawler_result_;

    int num_working_threads_ = 0;  // workers currently fetching; guarded by mutex_
    mutex mutex_;
    condition_variable cv_;
};
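For reference, a minimal driver might look like the sketch below; the thread count, starting URL, and depth limit are placeholder values. With the stub helper above, fetch() returns no links, so the crawl visits only the starting page; a real helper would make it traverse the link graph breadth-first.

#include <iostream>

int main() {
    WebCrawler crawler;
    // Placeholder arguments: 4 worker threads, depth limit 2.
    vector<pair<string, string>> pages =
        crawler.Crawl(4, "https://example.com", 2);
    for (const pair<string, string>& page : pages) {
        cout << page.first << " -> " << page.second.size() << " bytes\n";
    }
    return 0;
}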