Use depth-first and breadth-first search to find dead links

By | September 2, 2019

The following scripts find dead links (links that fail to load, for example because they return a 404 HTTP status code) under a domain name.

Depth-first Search (DFS)

Starting from the root path, first collect all the sub-paths linked from that page and use urllib to verify each of them. If a sub-path fails verification, record it; otherwise treat that sub-path as the new root path and traverse its own sub-paths in the same way.

import urllib.request
from urllib.request import URLError
import http.cookiejar
from selenium import webdriver  
# Links already checked, keyed by the string "<page url>,<link url>"; the
# stored value is always 1 and only key membership is consulted.
passlist = dict()
# Pages the browser has already loaded, keyed by URL; the stored value is
# always 1 and only key membership is consulted.
path = dict()
def getCurrent(urlRoot):
    """Crawl ``urlRoot`` depth-first and record every link that fails to open.

    The page is loaded in the module-level Selenium ``driver`` and the
    ``href`` of every ``<a>`` element is collected.  Each link that contains
    ``https://www.google.com`` is fetched with ``urllib``; a link that cannot
    be fetched is appended to ``errorPath.txt`` as ``<page>,<link>``, while a
    link that can be fetched is crawled recursively.

    Uses the module-level dicts ``path`` (pages already loaded) and
    ``passlist`` (``"<page>,<link>"`` pairs already checked) to avoid
    repeating work.

    Args:
        urlRoot: URL of the page whose links should be checked.

    Returns:
        None.  Results are reported through ``print`` and ``errorPath.txt``.
    """
    if urlRoot in path:
        return

    try:
        driver.get(urlRoot)  # domain to be tested
        path[urlRoot] = 1
        # Read every href up front: the recursive calls below navigate the
        # shared browser away from this page, which would invalidate the
        # element handles.
        hrefs = [
            anchor.get_attribute('href')
            for anchor in driver.find_elements_by_xpath("//a")
        ]
    except Exception as err:
        # Best effort: a page that cannot be loaded or read is skipped
        # instead of stopping the whole crawl.
        print('Could not load page:   ' + urlRoot + ',' + str(err))
        return

    for u in hrefs:
        # Anchors without an href yield None; skip them (and empty strings).
        if not u:
            continue
        # NOTE(review): this is a substring match, so a URL that merely
        # contains the domain (e.g. in a query string) is also followed.
        if u.find("https://www.google.com") == -1:
            continue
        key = urlRoot + "," + u
        if key in passlist:
            continue
        passlist[key] = 1
        try:
            # Only reachability matters; close the response straight away.
            with urllib.request.urlopen(u):
                pass
        except Exception:
            # Any failure to fetch the link counts as a dead link.
            with open("errorPath.txt", "a+") as errorPath:
                errorPath.write(key + "\n")
            print('Error url:   ' + key)
            continue
        print('Success url:   ' + u)
        try:
            getCurrent(u)
        except RecursionError:
            # Very deep link chains exhaust the interpreter stack; give up on
            # this branch but keep checking the remaining links.
            print('Recursion limit reached at:   ' + u)

# Entry point of the depth-first crawl: start Chrome, crawl from the root
# URL, and always shut the browser down afterwards.
option = webdriver.ChromeOptions()
# Uncomment the next line to run Chrome without a visible window.
#option.add_argument("headless")
driver = webdriver.Chrome(
    'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe',
    chrome_options=option,  # pass the options so the headless toggle above takes effect
)
try:
    getCurrent("https://www.google.com")
finally:
    # quit() ends the whole WebDriver session; close() would only close the
    # current window.
    driver.quit()

Breadth-First Search (BFS)

Breadth-first traversal is implemented with a queue: take the path at the head of the queue, verify every sub-path found on that page, add the sub-paths that pass verification to the queue, and record the ones that fail.

import urllib.request
from urllib.request import URLError
import http.cookiejar
from selenium import webdriver
import queue
# Breadth-first dead-link crawl.
#
# Pages waiting to be crawled are held in the FIFO queue `q`.  For each page,
# every link containing the domain is fetched once with urllib; the outcome is
# remembered in `passlist` (1 = fetched successfully, 0 = failed).  Failures
# are appended to errorPath.txt as "<page>,<link>,<reason>".
option = webdriver.ChromeOptions()
option.add_argument("headless")
faillist = {}  # not used by this script; kept so the module-level name still exists
passlist = {}
q = queue.Queue()
q.put("https://www.google.com")
driver = webdriver.Chrome(
    'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe',
    chrome_options=option,
)
try:
    while not q.empty():
        print(q.qsize())
        url = q.get()
        try:
            driver.get(url)
            us = [
                anchor.get_attribute('href')
                for anchor in driver.find_elements_by_xpath("//a")
            ]
        except Exception as e:
            # Best effort: skip a page that cannot be loaded or read.  Only
            # Exception is caught so that Ctrl+C still stops the crawl.
            print(e)
            continue

        for u in us:
            # Anchors without an href yield None; skip them (and empty strings).
            if not u:
                continue
            # NOTE(review): this is a substring match, so a URL that merely
            # contains the domain (e.g. in a query string) is also followed.
            if u.find("https://www.google.com") == -1:
                continue
            if u in passlist:
                # Already fetched once: do not fetch again, but still report
                # a known-dead link against the page it appears on.
                if passlist[u] == 0:
                    with open("errorPath.txt", "a+") as errorPath:
                        errorPath.write(url + "," + u + ",from fail dict\n")
                continue
            try:
                # Only reachability matters; close the response straight away.
                with urllib.request.urlopen(u):
                    pass
            except Exception as e:
                # Any failure to fetch the link counts as a dead link.
                print(e)
                with open("errorPath.txt", "a+") as errorPath:
                    errorPath.write(url + "," + u + "," + str(e) + "\n")
                passlist[u] = 0
            else:
                passlist[u] = 1
                q.put(u)
finally:
    # End the WebDriver session even if the crawl is interrupted.
    driver.quit()