Exercise: Fetch URLs in parallel



examples/parallel/urls.txt
https://google.com/
https://youtube.com/
https://facebook.com/
https://baidu.com/
https://twitter.com/
https://instagram.com/
https://en.wikipedia.org/
https://www.amazon.com/
https://yahoo.com/
https://yandex.ru/
https://vk.com/
https://live.com/
https://naver.com/
https://yahoo.co.jp/
https://google.com.br/
https://netflix.com/
https://reddit.com/
https://ok.ru/
https://mail.ru/
https://ebay.com/
https://linkedin.com/
https://qq.com/
https://pinterest.com/
https://bing.com/
https://whatsapp.com/
https://office.com/
https://amazon.de/
https://aliexpress.com/
https://amazon.co.jp/
https://msn.com/
https://google.de/
https://paypal.com/
https://rakuten.co.jp/
https://amazon.co.uk/
https://daum.net/
https://google.co.jp/
https://imdb.com/
https://booking.com/
https://roblox.com/
https://9apps.com/
https://globo.com/
https://duckduckgo.com/
https://www.nttdocomo.co.jp/

examples/parallel/fetch_urls.py
import time
import requests
import sys
from bs4 import BeautifulSoup

def get_urls(limit):
    """Return at most *limit* URLs read from ``urls.txt``.

    The file is opened relative to the current working directory and is
    expected to hold one URL per line. Only the trailing newline of each
    line is removed; any other whitespace is kept as-is.
    """
    with open('urls.txt') as handle:
        entries = [row.rstrip("\n") for row in handle]

    # Truncate only when the file holds more entries than requested.
    return entries[:limit] if len(entries) > limit else entries

def get_title(url, timeout=10):
    """Fetch *url* and return the text of its HTML ``<title>`` element.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A ``(title, error)`` pair. On success ``error`` is ``None``; on
        failure ``title`` is ``None`` and ``error`` is a human-readable
        message. Failures are a request error, a status code other than
        200, or a page without a ``<title>`` element. ``title`` may also be
        ``None`` on success when the ``<title>`` element has no single
        string child (BeautifulSoup's ``.string`` semantics).
    """
    try:
        # Without a timeout a single stalled server would block forever.
        resp = requests.get(url, timeout=timeout)
    except Exception as err:
        # Deliberately broad: every failure is reported to the caller
        # through the error slot instead of aborting the whole run.
        return None, f"Error: {err} for {url}"

    if resp.status_code != 200:
        return None, f"Incorrect status_code {resp.status_code} for {url}"

    soup = BeautifulSoup(resp.content, 'html.parser')
    # soup.title is None when the document has no <title>; reading
    # .string from it would raise AttributeError.
    if soup.title is None:
        return None, f"No title found for {url}"
    return soup.title.string, None

def main():
    """Fetch the titles of the first LIMIT URLs sequentially and time it.

    LIMIT is read from the first command-line argument. The URLs come from
    ``get_urls`` and each title is fetched with ``get_title``. Every title
    (or error message) is printed as soon as it is available, followed by
    the elapsed time and the list of collected results.

    Exits with a usage message when LIMIT is missing or is not an integer.
    """
    usage = f"Usage: {sys.argv[0]} LIMIT"
    if len(sys.argv) < 2:
        # sys.exit instead of the bare exit(): the latter is added by the
        # site module for interactive use and is not always available.
        sys.exit(usage)
    try:
        limit = int(sys.argv[1])
    except ValueError:
        # A non-numeric LIMIT is a usage error, not a reason for a traceback.
        sys.exit(usage)

    urls = get_urls(limit)
    print(urls)

    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments during the run.
    start = time.perf_counter()

    titles = []
    for url in urls:
        title, err = get_title(url)
        print(err if err else title)
        titles.append({
            "url": url,
            "title": title,
            "err": err,
        })

    elapsed = time.perf_counter() - start
    print(f"Elapsed time: {elapsed} for {len(urls)} pages.")
    print(titles)


# Run the fetch only when this file is executed as a script, so that
# importing the module does not trigger any network requests.
if __name__ == '__main__':
    main()