Remember that you should not use regex for actual HTML parsing (thanks, @Patrick Artner). Instead, you can use Beautiful Soup to extract all the visible text from a web page (the helper below skips scripts, styles, other non-rendered tags, and HTML comments). That text is just a string, so you can then search it for email addresses with a regex. Here is how you can do it:
import re
import urllib
import urllib.request

from bs4 import BeautifulSoup
from bs4.element import Comment
def tag_visible(element):
    """Return True if *element* is text that should count as visible.

    Meant to be used as a filter predicate over the text nodes of a parsed
    document.

    Args:
        element: A text node exposing ``.parent.name``.

    Returns:
        False when the node sits directly inside a non-rendered container
        (style, script, head, title, meta, or the document root) or when
        the node is an HTML comment; True otherwise.
    """
    # Text directly under these tags is markup/metadata, not page content.
    if element.parent.name in {'style', 'script', 'head', 'title', 'meta', '[document]'}:
        return False
    # Comments are text nodes too, but they are never displayed.
    if isinstance(element, Comment):
        return False
    return True
def text_from_html(body):
    """Return the visible text of an HTML document as a single string.

    Args:
        body: HTML markup to parse.

    Returns:
        Every text node accepted by ``tag_visible``, stripped of surrounding
        whitespace and joined with single spaces.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # ``find_all(string=True)`` is the current spelling of the legacy
    # ``findAll(text=True)``; it yields every text node in the document.
    text_nodes = soup.find_all(string=True)
    return u" ".join(node.strip() for node in text_nodes if tag_visible(node))
# Matches a local part built from letters, digits and the listed punctuation,
# then "@", then one or more dot-separated host labels of at most 63
# characters each. The pattern has no capturing groups, so ``findall``
# returns whole addresses.
EMAIL_RE = re.compile(r"[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")

with urllib.request.urlopen("https://en.wikipedia.org/wiki/Email_address") as response:
    # Decode with the charset the server declares; fall back to UTF-8 when
    # the Content-Type header does not name one.
    charset = response.headers.get_content_charset() or "utf-8"
    data = response.read().decode(charset)

text = text_from_html(data)
print(EMAIL_RE.findall(text))
The two helper functions collect all the text that is visible on the page, and the (admittedly very long) regex then pulls every email address out of that text. I used Wikipedia's article on email addresses as an example, and here is the output:
['[email protected]', 'local-part@domain', '[email protected]', '[email protected]', 'local-part@domain', '[email protected]', 'fred+bah@domain', 'fred+foo@domain', 'fred@domain', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', 'admin@mailserver1', "#!$%&'*+-/=?^_`{}|[email protected]", '[email protected]', 'user@localserver', 'A@b', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '1234567890123456789012345678901234567890123456789012345678901234+x@example.com', '[email protected]', 'example@localhost', 'john.doe@example', '[email protected]', '[email protected]', '[email protected]', '[email protected]']