The following scripts catch dead links (links that fail to load, e.g. return an HTTP 404 status code) under a domain name.
Depth-First Search (DFS)
Starting from a root path, extract all sub-paths on the page and verify each one with urllib. If verification fails, record the sub-path; otherwise, use the sub-path as a new root and traverse its sub-paths in turn.
import urllib.request
from urllib.request import URLError
import http.cookiejar
from selenium import webdriver
passlist = {}
path = {}
def getCurrent(urlRoot):
    """Depth-first crawl from ``urlRoot``, recording dead links.

    Loads ``urlRoot`` in the shared Selenium ``driver``, harvests every
    anchor href on the page, verifies each in-domain link with urllib,
    appends failures to errorPath.txt, and recurses into links that
    verify successfully.

    Uses module globals:
        driver   -- shared Selenium WebDriver
        path     -- pages already visited (cycle guard)
        passlist -- "parent,child" pairs already checked

    NOTE(review): recursion depth equals crawl depth; a very deep site
    could hit Python's recursion limit — the BFS variant below avoids this.
    """
    if urlRoot in path:
        return
    path[urlRoot] = 1
    try:
        driver.get(urlRoot)  # page under test
        anchors = driver.find_elements_by_xpath("//a")
        hrefs = [a.get_attribute('href') for a in anchors]
    except Exception:
        # Page failed to load or render; nothing to harvest here.
        return
    for u in hrefs:
        # get_attribute returns None (not the string 'None') when the
        # anchor has no href — guard before calling str methods on it.
        if u is None:
            continue
        if u.find("https://www.google.com") == -1:
            continue  # stay inside the target domain
        key = urlRoot + "," + u
        if key in passlist:
            continue
        passlist[key] = 1
        try:
            response = urllib.request.urlopen(u)
            response.close()  # release the connection promptly
        except Exception:
            # Dead link: log the parent,child pair and move on.
            with open("errorPath.txt", "a+") as errorPath:
                errorPath.write(key + "\n")
            print('Error url: ' + key)
            continue
        print('Success url: ' + u)
        getCurrent(u)  # recurse: treat the verified link as a new root
# --- script entry: start the DFS crawl from the domain root ---
option = webdriver.ChromeOptions()
# option.add_argument("headless")  # uncomment to run without a visible window
# BUG FIX: pass the options object to Chrome — the original created it but
# never used it, so flags like "headless" could never take effect.
driver = webdriver.Chrome(
    'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe',
    chrome_options=option)
getCurrent("https://www.google.com")
# quit() shuts down the chromedriver process as well; close() only closes
# the window and would leak the driver process.
driver.quit()
Breadth-First Search (BFS)
The breadth-first traversal is implemented with a queue: take the path at the head of the queue, verify the links it contains, add the paths that pass verification to the queue, and record the paths that fail.
import urllib.request
from urllib.request import URLError
import http.cookiejar
from selenium import webdriver
import queue
option=webdriver.ChromeOptions()
option.add_argument("headless")
faillist = {}
passlist = {}
q = queue.Queue()
q.put("https://www.google.com")
driver = webdriver.Chrome('C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe',chrome_options = option)
# --- BFS crawl: verify every in-domain link reachable from the root ---
while not q.empty():
    try:
        print(q.qsize())  # progress indicator: frontier size
        url = q.get()
        driver.get(url)
        anchors = driver.find_elements_by_xpath("//a")
        hrefs = [a.get_attribute('href') for a in anchors]
    except Exception:
        # Page failed to load; skip it and keep draining the queue.
        continue
    for u in hrefs:
        # BUG FIX: get_attribute returns None for href-less anchors.
        # The original had no guard, so u.find(...) raised AttributeError
        # inside the outer bare except, silently abandoning ALL remaining
        # links on the page.
        if u is None:
            continue
        if u.find("https://www.google.com") == -1:
            continue  # stay inside the target domain
        if u in passlist:
            if passlist[u] == 0:  # known-dead link reached from another page
                with open("errorPath.txt", "a+") as errorPath:
                    errorPath.write(url + "," + u + ",from fail dict\n")
            continue
        try:
            response = urllib.request.urlopen(u)
            response.close()  # release the connection promptly
        except Exception as e:
            print(e)
            with open("errorPath.txt", "a+") as errorPath:
                errorPath.write(url + "," + u + "," + str(e) + "\n")
            passlist[u] = 0  # remember the failure
        else:
            passlist[u] = 1  # verified OK: enqueue for further crawling
            q.put(u)