Learn to discover contact pages and extract business emails the right way.
This guide blends ethics, compliance, and code patterns—and includes an in-browser demo for quick tests.
Do
Respect /robots.txt & Terms.
Rate-limit & identify your bot.
Use data for legitimate business purposes.
Don’t
Bypass logins, paywalls, or CAPTCHA.
Harvest personal emails for spam.
Ignore removal/opt-out requests.
Contents
Overview
“Contact Us” page scraping focuses on collecting publicly listed, business-purpose addresses like info@, support@, or hello@. These often live on Contact, About, Support, or Team pages. The key is to keep your approach lightweight, respectful, and traceable.
Tip: parse mailto: links first (they’re high-signal), then fall back to a well-tuned regex for visible text.
Ethics & Compliance
Follow website rules and /robots.txt.
Collect only business-intent emails; avoid personal addresses.
Store source URLs (provenance) and honor opt-outs.
Performance & Stability
1–2 requests/sec is a reasonable default.
Handle transient errors with retries and backoff.
Keep crawl depth small (1–2) and prioritize “contact-like” links.
import time, re, random
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
import urllib.robotparser as urobot
# Descriptive User-Agent: names the bot and gives site operators a way to reach its owner.
UA = "ContactFinderBot/1.0 (+yourdomain.example; contact@yourdomain.example)"

# Single shared session so every request carries the same identifying headers.
SESSION = requests.Session()
SESSION.headers["User-Agent"] = UA
SESSION.headers["Accept-Language"] = "en"
# Parsed robots.txt rules keyed by "scheme://netloc". A value of None marks an
# origin whose robots.txt could not be read (treated as "allowed").
_ROBOTS_CACHE = {}


def allowed_by_robots(url, ua=UA):
    """Return True if the robots.txt of ``url``'s origin lets ``ua`` fetch ``url``.

    The robots.txt file is requested at most once per origin and the parsed
    rules are reused afterwards, so crawling several pages of one site does
    not re-download robots.txt before every page.

    If robots.txt cannot be read at all (network error, malformed URL, ...),
    the check fails open and returns True.
    """
    p = urlparse(url)
    origin = f"{p.scheme}://{p.netloc}"
    if origin not in _ROBOTS_CACHE:
        rp = urobot.RobotFileParser()
        rp.set_url(f"{origin}/robots.txt")
        try:
            rp.read()
        except Exception:
            # Best effort: an unreadable robots.txt must not abort the crawl.
            rp = None
        _ROBOTS_CACHE[origin] = rp
    rules = _ROBOTS_CACHE[origin]
    if rules is None:
        return True
    return rules.can_fetch(ua, url)
def fetch(url, sleep=(0.8, 1.8), timeout=15):
    """Download ``url`` politely and return its HTML text, or None.

    None is returned when robots.txt disallows the URL, when the request
    raises a ``requests.RequestException``, when the response status is not
    OK, or when the Content-Type header does not contain ``text/html``.

    ``sleep`` is a ``(min, max)`` pair in seconds; a random pause in that
    range is taken before each request. ``timeout`` is passed to the session.
    """
    if not allowed_by_robots(url):
        return None

    try:
        pause = random.uniform(*sleep)
        time.sleep(pause)
        response = SESSION.get(url, timeout=timeout)
        if not response.ok:
            return None
        if "text/html" not in response.headers.get("Content-Type", ""):
            return None
        return response.text
    except requests.RequestException:
        return None
Score & extract internal links
from collections import defaultdict
# Substrings that mark a link (its href or anchor text) as likely to lead to contact details.
HINTS = ("contact","support","help","customer","about","team","get-in-touch")


def extract_scored_links(html, base_url):
    """Return same-host links found in ``html``, ranked by contact-likeness.

    Each ``<a href>`` is resolved against ``base_url``. Links whose netloc
    differs from the base URL's netloc are ignored. A link earns one point for
    every entry of ``HINTS`` that occurs in its lower-cased href or anchor
    text; points accumulate when the same URL appears more than once.

    The ``#fragment`` part is dropped before scoring so that ``/contact`` and
    ``/contact#form`` count as a single page instead of two crawl targets.

    Returns a list of ``(absolute_url, score)`` tuples sorted by descending
    score. Only links that matched at least one hint are included.
    """
    soup = BeautifulSoup(html, "html.parser")
    base_host = urlparse(base_url).netloc  # loop-invariant, parse once
    scored = defaultdict(int)
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        text = (a.get_text(" ", strip=True) or "").lower()
        # Strip the fragment: it never changes which document is fetched.
        absu = urljoin(base_url, href).split("#", 1)[0]
        if urlparse(absu).netloc != base_host:
            continue
        hay = f"{href.lower()} {text}"
        hits = sum(1 for hint in HINTS if hint in hay)
        if hits:
            # Only touch the dict on a match so unscored links stay absent.
            scored[absu] += hits
    return sorted(scored.items(), key=lambda kv: kv[1], reverse=True)
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,25}")

# Bracketed obfuscations such as "info [at] example (dot) com". Bare words
# " at " / " dot " are deliberately NOT rewritten: doing so would turn ordinary
# prose ("meet us at example.com") into bogus addresses.
_OBFUSCATED_AT = re.compile(r"\s*[\[({]\s*at\s*[\])}]\s*", re.IGNORECASE)
_OBFUSCATED_DOT = re.compile(r"\s*[\[({]\s*dot\s*[\])}]\s*", re.IGNORECASE)

# Role-style local parts that are ranked ahead of other addresses.
_GENERIC_PREFIXES = ("info@", "support@", "hello@", "sales@")


def parse_emails(html, base_url):
    """Extract e-mail addresses from ``html`` and return them prioritized.

    Addresses are collected from two places:

    1. ``mailto:`` links. The query string is removed, comma-separated
       recipients are split, and percent-encoding is decoded.
    2. Visible page text, after bracketed obfuscations like ``[at]`` and
       ``(dot)`` have been turned back into ``@`` and ``.``.

    The result is a de-duplicated list of lower-cased addresses. Addresses on
    the page's own host (a leading ``www.`` is ignored) come first, then
    role addresses such as ``info@``; remaining ties are ordered
    alphabetically so the output is deterministic.
    """
    from urllib.parse import unquote  # local import: not among the file-level imports

    soup = BeautifulSoup(html, "html.parser")
    emails = set()

    # mailto first
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if not href.lower().startswith("mailto:"):
            continue
        recipients = href.split(":", 1)[1].split("?", 1)[0]
        for raw in recipients.split(","):
            addr = unquote(raw).strip()
            if EMAIL_RE.fullmatch(addr):
                emails.add(addr)

    # visible text
    text = soup.get_text(" ", strip=True)
    text = _OBFUSCATED_AT.sub("@", text)
    text = _OBFUSCATED_DOT.sub(".", text)
    emails.update(EMAIL_RE.findall(text))

    # normalize & prioritize
    host = (urlparse(base_url).hostname or "").lower()
    if host.startswith("www."):
        # Sites served from www.example.com normally use @example.com addresses.
        host = host[4:]
    own_suffix = "@" + host

    def priority(e):
        # False sorts before True, so "matches" are negated to come first.
        return (not e.endswith(own_suffix), not e.startswith(_GENERIC_PREFIXES), e)

    return sorted({e.lower() for e in emails}, key=priority)
Mini crawler (depth ≤ 2)
from collections import deque
def find_contact_emails(seed_url, max_pages=15, max_depth=2):
    """Crawl outward from ``seed_url`` and collect e-mail addresses per page.

    The crawl is breadth-first. Links are followed only when
    ``extract_scored_links`` gives them a positive score, and only while the
    current page's depth is below ``max_depth`` (the seed has depth 0).

    ``max_pages`` caps the number of fetch attempts. The budget is counted per
    page actually taken from the queue, not per URL discovered, so a seed page
    with many candidate links no longer ends the crawl after a single page.

    Returns a dict with:
      * ``"seed"``     - the seed URL,
      * ``"contacts"`` - mapping of page URL to the addresses found on it,
      * ``"errors"``   - URLs for which ``fetch`` returned nothing.
    """
    seen = {seed_url}
    q = deque([(seed_url, 0)])
    out = {"seed": seed_url, "contacts": {}, "errors": []}
    fetched = 0
    while q and fetched < max_pages:
        url, d = q.popleft()
        fetched += 1
        html = fetch(url)
        if not html:
            out["errors"].append(url)
            continue
        found = parse_emails(html, url)
        if found:
            out["contacts"][url] = found
        if d < max_depth:
            for link, score in extract_scored_links(html, url):
                if score > 0 and link not in seen:
                    # Mark as seen at enqueue time so a URL is queued only once.
                    seen.add(link)
                    q.append((link, d + 1))
    return out
# print(find_contact_emails("https://example.com"))
Keep max_pages conservative and always include a contact address in your User-Agent.
Beautiful Contact Us Scraper Section
Try the Ready-Made Apify Scraper
If you don't want to code everything from scratch, you can use my pre-built Apify scraper. It handles rotation, scaling, and compliance for you.
Our scraper is designed with developers in mind, providing a robust solution that respects each target website's terms of service while delivering the data you need for your projects.
Ready to get started?
Run the pre-built Contact Us scraper without coding