disposable-email-domains/verify.py

133 lines
3.8 KiB
Python
Raw Permalink Normal View History

2018-02-05 10:22:36 +08:00
#!/usr/bin/env python
"""Verify the integrity of the domain blocklist
2018-02-05 10:22:36 +08:00
"""
2018-02-05 11:09:52 +08:00
import io
2018-02-05 10:22:36 +08:00
import sys
2018-08-24 11:44:20 +08:00
from collections import Counter
2018-02-05 10:22:36 +08:00
from publicsuffixlist import PublicSuffixList
2018-02-05 12:55:37 +08:00
from requests import get
2018-02-05 10:22:36 +08:00
2018-08-24 11:44:20 +08:00
# Names of the list files validated by this script.
blocklist = "disposable_email_blocklist.conf"
allowlist = "allowlist.conf"


def _read_lines(filename):
    """Return the lines of ``filename`` without their line terminators."""
    # A context manager closes the handle deterministically instead of
    # leaving an open file object behind for the garbage collector.
    with open(filename) as handle:
        return handle.read().splitlines()


# Maps each list's filename to its lines; loaded once when the module runs so
# that every check works on the same in-memory copy.
files = {filename: _read_lines(filename) for filename in [allowlist, blocklist]}
def download_suffixes():
    """Download the Public Suffix List and save it as ``public_suffix_list.dat``.

    The response is fetched and validated before the destination file is
    opened, so a failed download never truncates an existing copy.

    Raises:
        requests.exceptions.RequestException: if the request fails, times out,
            or the server answers with an HTTP error status.
    """
    response = get(
        "https://publicsuffix.org/list/public_suffix_list.dat",
        timeout=60,  # do not hang forever on an unresponsive server
    )
    # Fail loudly rather than saving an error page as if it were the list.
    response.raise_for_status()
    with open("public_suffix_list.dat", "wb") as file:
        file.write(response.content)
def check_for_public_suffixes(filename):
    """Abort if any entry of the given list is itself a public suffix.

    Each line of ``files[filename]`` is stripped and compared with the value
    ``PublicSuffixList.publicsuffix`` returns for it; when both are equal the
    entry is nothing more than a suffix. Every offending line is reported with
    its 1-based line number before the process exits.

    Args:
        filename: Key into ``files`` naming the list to check.

    Raises:
        SystemExit: with status 1 when at least one such entry is found.
    """
    lines = files[filename]

    # The Public Suffix List is published as UTF-8 and contains non-ASCII
    # rules, so do not depend on the platform's default encoding.
    with open("public_suffix_list.dat", "r", encoding="utf-8") as latest:
        psl = PublicSuffixList(latest)

    suffix_detected = False
    for i, line in enumerate(lines):
        current_line = line.strip()
        if psl.publicsuffix(current_line) == current_line:
            print(
                f"The line number {i+1} contains just a public suffix: {current_line}"
            )
            suffix_detected = True

    # Exit only after the whole list was scanned so all offenders are listed.
    if suffix_detected:
        print(
            "At least one valid public suffix found in {!r}, please "
            "remove it. See https://publicsuffix.org for details on why this "
            "shouldn't be blocklisted.".format(filename)
        )
        sys.exit(1)
2018-02-05 10:22:36 +08:00
2018-02-05 12:55:37 +08:00
def check_for_third_level_domains(filename):
    """Abort if any entry has labels in front of its private suffix.

    An entry is reported when ``PublicSuffixList.privateparts`` splits its
    stripped text into more than one part.

    Args:
        filename: Key into ``files`` naming the list to check.

    Raises:
        SystemExit: with status 1 when at least one such entry is found.
    """
    # The Public Suffix List is published as UTF-8 and contains non-ASCII
    # rules, so do not depend on the platform's default encoding.
    with open("public_suffix_list.dat", "r", encoding="utf-8") as latest:
        psl = PublicSuffixList(latest)

    invalid = set()
    for line in files[filename]:
        parts = psl.privateparts(line.strip())
        # NOTE(review): privateparts() appears to return None when the name has
        # no private suffix (e.g. a blank line or a bare public suffix); guard
        # so such a line cannot crash the check with a TypeError.
        if parts is not None and len(parts) > 1:
            invalid.add(line)

    if invalid:
        print("The following domains contain a third or lower level domain in {!r}:".format(filename))
        for line in sorted(invalid):
            print("* {}".format(line))
        sys.exit(1)
2018-08-24 11:44:20 +08:00
def check_for_non_lowercase(filename):
    """Abort if the given list holds entries that are not lowercased.

    An entry is reported when it does not occur among the lowercased forms of
    the list's lines.

    Args:
        filename: Key into ``files`` naming the list to check.

    Raises:
        SystemExit: with status 1 when at least one such entry is found.
    """
    entries = files[filename]
    lowered = {entry.lower() for entry in entries}
    offenders = {entry for entry in entries if entry not in lowered}
    if not offenders:
        return

    print("The following domains should be lowercased in {!r}:".format(filename))
    for entry in sorted(offenders):
        print("* {}".format(entry))
    sys.exit(1)
def check_for_duplicates(filename):
    """Abort if any line occurs more than once in the given list.

    Args:
        filename: Key into ``files`` naming the list to check.

    Raises:
        SystemExit: with status 1 when at least one line is repeated.
    """
    occurrences = Counter(files[filename])
    repeated = [entry for entry, seen in occurrences.items() if seen > 1]
    if not repeated:
        return

    print("The following domains appear twice in {!r}:".format(filename))
    for entry in sorted(repeated):
        print("* {}".format(entry))
    sys.exit(1)
def check_sort_order(filename):
    """Abort at the first position where the list differs from its sorted form.

    Args:
        filename: Key into ``files`` naming the list to check.

    Raises:
        SystemExit: with status 1 when the list is not sorted.
    """
    entries = files[filename]
    expected = sorted(entries)
    for actual, wanted in zip(entries, expected):
        if actual == wanted:
            continue
        # Only the first out-of-place pair is reported before exiting.
        print("The list is not sorted in {!r}:".format(filename))
        print("* {!r} should come before {!r}".format(wanted, actual))
        sys.exit(1)
def check_for_intersection(filename_a, filename_b):
    """Abort if the two lists have any line in common.

    Args:
        filename_a: Key into ``files`` naming the first list.
        filename_b: Key into ``files`` naming the second list.

    Raises:
        SystemExit: with status 1 when at least one line is in both lists.
    """
    shared = set(files[filename_a]).intersection(files[filename_b])
    if not shared:
        return

    print("The following domains appear in both lists:")
    for entry in sorted(shared):
        print("* {}".format(entry))
    sys.exit(1)
2018-02-05 12:55:37 +08:00
2018-02-05 10:22:36 +08:00
if __name__ == "__main__":
    # The suffix-based checks read the downloaded list, so fetch it first.
    download_suffixes()

    # Checks applied to the blocklist only: bare public suffixes and entries
    # that are a third or lower level domain.
    check_for_public_suffixes(blocklist)
    check_for_third_level_domains(blocklist)

    # Checks applied to both lists, allowlist first: lowercase, duplicates,
    # then sort order.
    for check in (check_for_non_lowercase, check_for_duplicates, check_sort_order):
        for list_name in (allowlist, blocklist):
            check(list_name)

    # No domain may be in both the allowlist and the blocklist.
    check_for_intersection(allowlist, blocklist)

    print("All domain entries seem valid.")