Files
is-a-dev/tests/url-validation.py
T
2023-09-23 23:03:49 +05:30

61 lines
1.8 KiB
Python

import os
import json
import requests
import concurrent.futures
# Paths are resolved against the current working directory, so the script
# has to be started from the directory that contains "domains".
repository_directory = os.getcwd()
# Directory that is walked recursively for the files to validate.
domains_directory = os.path.join(repository_directory, "domains")
def has_url_field(file_path):
    """Return the ``record["URL"]`` value of a JSON file, or ``None``.

    Args:
        file_path: Path of the JSON file to inspect.

    Returns:
        The value stored under ``record["URL"]`` when the document is a JSON
        object whose ``record`` member is itself an object containing the
        key ``URL``; ``None`` in every other case.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is not valid UTF-8 or not valid JSON
            (``json.JSONDecodeError`` and ``UnicodeDecodeError`` are both
            subclasses of ``ValueError``).
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # A top-level array/string/number has no "record" member to look at.
    if not isinstance(data, dict):
        return None

    record = data.get("record")
    # Only index into real objects: a string or list that merely contains
    # "URL" would pass an ``in`` test but fail on ``record["URL"]``.
    if isinstance(record, dict) and "URL" in record:
        return record["URL"]
    return None
def is_url_reachable(url: str, timeout: float = 10.0) -> bool:
    """Report whether a HEAD request to *url* yields a non-error status.

    Args:
        url: Absolute HTTP or HTTPS URL to probe.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on an unresponsive
            host, which would block the calling thread.

    Returns:
        ``True`` when the final response (redirects are followed) has a
        1xx, 2xx or 3xx status code; ``False`` for 4xx/5xx responses and
        for any ``requests`` failure (connection error, timeout, invalid
        URL, ...).
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
    except requests.exceptions.RequestException:
        # Covers timeouts as well: requests.exceptions.Timeout derives from
        # RequestException.
        return False
    # Same range as "status_code // 100 in [1, 2, 3]".
    return 100 <= response.status_code < 400
# Shared result table: one bucket per outcome, each mapping a file path to
# the URL found in that file. Written to by handle_url_validation.
urls_data = {bucket: {} for bucket in ("valid", "invalid", "non-http")}
def handle_url_validation(file_path):
    """Classify the URL found in *file_path* and record it in ``urls_data``.

    The file is read with ``has_url_field``. Files without a (truthy) URL
    are ignored. Otherwise the URL is stored under exactly one of the
    ``urls_data`` buckets, keyed by *file_path*, and a status line is
    printed:

    * ``"non-http"`` - the value is not a string starting with ``http://``
      or ``https://`` (compared case-insensitively);
    * ``"valid"`` - ``is_url_reachable`` returned ``True``;
    * ``"invalid"`` - ``is_url_reachable`` returned ``False``.

    Args:
        file_path: Path of the JSON file to validate.

    Returns:
        None. Exceptions raised by ``has_url_field`` propagate to the caller.
    """
    url = has_url_field(file_path)
    if not url:
        return

    # URI schemes are case-insensitive, so "HTTPS://..." is still HTTPS.
    # Non-string values (e.g. a number) cannot be HTTP URLs either and are
    # reported instead of crashing on ``.startswith``.
    is_http = isinstance(url, str) and url.lower().startswith(
        ("http://", "https://")
    )
    if not is_http:
        urls_data["non-http"][file_path] = url
        print(f"URL '{url}' in file '{file_path}' is neither HTTP nor HTTPS.")
        return

    if is_url_reachable(url):
        urls_data["valid"][file_path] = url
        print(f"URL '{url}' in file '{file_path}' is reachable.")
    else:
        urls_data["invalid"][file_path] = url
        print(f"URL '{url}' in file '{file_path}' is not reachable.")
# Upper bound on the number of files validated concurrently.
max_threads = 20

# Every file below the domains directory, at any depth.
file_paths = [
    os.path.join(root, filename)
    for root, _, files in os.walk(domains_directory)
    for filename in files
]

with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
    # Remember which file each future belongs to so failures can be reported.
    futures = {
        executor.submit(handle_url_validation, file_path): file_path
        for file_path in file_paths
    }
    for future in concurrent.futures.as_completed(futures):
        # An exception raised inside a worker is stored on its future and
        # would otherwise vanish silently (e.g. unreadable or invalid JSON).
        error = future.exception()
        if error is not None:
            print(f"Could not validate file '{futures[future]}': {error!r}")

# Leaving the "with" block waits for all workers, so urls_data is complete
# by the time it is written out.
result_file_path = os.path.join(repository_directory, "url-validation-result.json")
with open(result_file_path, "w", encoding="utf-8") as result_file:
    json.dump(urls_data, result_file, indent=4)

print(f"Results saved to {result_file_path}")