Wednesday, January 30, 2019

Python - HTML Word Extraction

Script to extract a wordlist from a website for use in a promo code brute-forcer.

from BeautifulSoup import BeautifulSoup
import requests
import re

# Fetch the root page and collect the href of every anchor whose value
# contains 'http'. A set is used so each URL is kept only once.
links = set()
html_page = requests.get("http://root.site.abxyz")
soup = BeautifulSoup(html_page.content)

for link in soup.findAll('a'):
    href = link.get('href')
    # Anchors without an href yield None; skip them explicitly rather than
    # hiding the resulting TypeError (and every other error) behind a bare
    # except.
    if href and 'http' in href:
        links.add(href)
 
# Visit every collected link and accumulate the whitespace-separated words of
# each page's text into `wordlist`. A page that fails to download or parse is
# reported and skipped so the remaining links are still processed.
# Each print takes a single pre-formatted string so the line is valid both as
# a print statement and as a print() call.
print("links to parse:  %s" % len(links))
wordlist = []
for count, url in enumerate(links, 1):
    print("on %s , number %s of  %s" % (url, count, len(links)))
    try:
        # The timeout keeps one unresponsive host from stalling the whole
        # run; the resulting exception is reported by the handler below.
        html = requests.get(url, timeout=30).content
        soup = BeautifulSoup(html)
        # Remove <script> and <style> elements so their contents are not
        # harvested as words.
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.getText()
        # split() with no argument splits on any run of whitespace (line
        # breaks included) and never yields empty strings, so no per-line
        # stripping or re-joining is needed beforehand.
        wordlist.extend(text.split())
        print("length of list is  %s" % len(wordlist))
    except Exception as e:
        print("Error,  %s" % str(e))
        continue
  
# Write the harvested words to wordlist.txt, one word per line. The with
# block closes the file even if a write fails part-way through.
# NOTE(review): words are written as-is; under Python 2 a non-ASCII unicode
# word may fail to encode here - confirm against real input.
with open('wordlist.txt', 'w') as f:
    for word in wordlist:
        # Write the current word itself, not the leftover page text from the
        # crawl loop.
        f.write(word)
        f.write('\n')

No comments:

Post a Comment