Exercise: Fetch URLs from one site.


Download the sitemap (or the other sitemap file) and fetch the first N URLs listed in it, collecting the title of each page.


examples/parallel/fetch_site_urls.py
import time
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

def get_urls(content, limit=20):
    """Extract page URLs from sitemap XML.

    Walks the children of each top-level entry of the document and collects
    the text of every element whose local tag name is exactly ``loc``
    (the XML namespace prefix, if any, is ignored).

    Args:
        content: The sitemap XML as ``bytes`` or ``str``.
        limit: Maximum number of URLs to return. ``None`` returns all of
            them; zero or a negative value returns an empty list.

    Returns:
        A list of URL strings in document order, with surrounding
        whitespace stripped and empty ``loc`` elements skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If ``content`` is not well-formed XML.
    """
    urls = []
    root = ET.fromstring(content)
    if limit is not None and limit <= 0:
        return urls

    for entry in root:
        for field in entry:
            # Namespaced tags look like '{namespace}loc'; compare the local
            # name exactly so tags that merely end in 'loc' are not matched.
            if field.tag.rsplit('}', 1)[-1] != 'loc':
                continue
            location = (field.text or '').strip()
            if not location:
                continue
            urls.append(location)
            # Stop early: a sitemap may list thousands of URLs.
            if limit is not None and len(urls) >= limit:
                return urls

    return urls

def main():
    """Fetch the sitemap, download the listed pages one by one and print their titles.

    Downloads the sitemap from a fixed URL and exits with an error message
    if the response status is not 200. Each URL returned by ``get_urls`` is
    then fetched sequentially; pages that fail to download or return a
    non-200 status are reported and skipped. The title of every fetched
    page is printed and collected (``None`` when the page has no usable
    ``<title>``). Finally the elapsed wall-clock time and the list of
    titles are printed.
    """
    start = time.time()
    # Without a timeout a stalled server would block the script forever.
    timeout = 10
    sitemap_url = 'https://code-maven.com/slides/sitemap.xml'
    resp = requests.get(sitemap_url, timeout=timeout)
    if resp.status_code != 200:
        # SystemExit with a message prints it to stderr and exits with status 1,
        # like exit(msg), but does not depend on the `site` module helper.
        raise SystemExit(f"Incorrect status_code {resp.status_code}")

    urls = get_urls(resp.content)

    titles = []
    for page_url in urls:
        try:
            resp = requests.get(page_url, timeout=timeout)
        except requests.RequestException as err:
            # One unreachable page should not abort the whole run.
            print(f"Request failed for {page_url}: {err}")
            continue
        if resp.status_code != 200:
            print(f"Incorrect status_code {resp.status_code} for {page_url}")
            continue

        soup = BeautifulSoup(resp.content, 'html.parser')
        # soup.title is None when the page has no <title> element.
        title_tag = soup.title
        title = title_tag.string if title_tag is not None else None
        print(title)
        titles.append(title)
    end = time.time()
    print("Elapsed time: {} for {} pages.".format(end-start, len(urls)))
    print(titles)


# Run the exercise only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()