"""Build a wordlist from the text of every page linked from a root page.

The script downloads ROOT_URL, collects the href of every anchor whose
href contains 'http', downloads each of those pages, strips <script> and
<style> elements, splits the remaining text on whitespace and writes the
collected words, one per line, to OUTPUT_PATH.
"""
import io
import re

import requests
from BeautifulSoup import BeautifulSoup

ROOT_URL = "http://root.site.abxyz"
OUTPUT_PATH = 'wordlist.txt'


def collect_links(root_url):
    """Return the set of anchor hrefs containing 'http' on the page at root_url."""
    page = requests.get(root_url)
    soup = BeautifulSoup(page.content)
    links = set()
    for anchor in soup.findAll('a'):
        href = anchor.get('href')
        # Anchors without an href give None; test for that explicitly
        # instead of hiding the resulting TypeError behind a bare except.
        if href and 'http' in href:
            links.add(href)
    return links


def extract_words(html):
    """Return the whitespace-separated words of html, ignoring script/style."""
    soup = BeautifulSoup(html)
    for tag in soup(["script", "style"]):
        tag.extract()
    # Splitting on any whitespace already discards blank lines and padding,
    # so no per-line stripping is needed before the split.
    return soup.getText().split()


def write_wordlist(words, path):
    """Write each word in words on its own line to the file at path."""
    # io.open with an explicit encoding behaves the same on Python 2 and 3
    # and does not fail on words containing non-ASCII characters.
    with io.open(path, 'w', encoding='utf-8') as out:
        out.writelines(word + u'\n' for word in words)


def main():
    """Crawl the pages linked from ROOT_URL and write their words to OUTPUT_PATH."""
    links = collect_links(ROOT_URL)
    total = len(links)
    print("links to parse:  %d" % total)

    wordlist = []
    for count, url in enumerate(links, 1):
        print("on %s , number %d of  %d" % (url, count, total))
        try:
            wordlist.extend(extract_words(requests.get(url).content))
            print("length of list is  %d" % len(wordlist))
        except Exception as e:
            # Best effort: one unreachable or unparsable page must not
            # abort the whole crawl, so report it and move on.
            print("Error,  %s" % str(e))

    # Write the collected words themselves; the previous version wrote the
    # full text of the last fetched page once per word.
    write_wordlist(wordlist, OUTPUT_PATH)


if __name__ == "__main__":
    main()
Wednesday, January 30, 2019
Python - HTML Word Extraction
Script to extract a wordlist from a website for use in a promo code brute forcer
Subscribe to:
Comments (Atom)