diff --git a/.github/workflows/url-validation.yml b/.github/workflows/url-validation.yml
new file mode 100644
index 000000000..9818ba4f7
--- /dev/null
+++ b/.github/workflows/url-validation.yml
@@ -0,0 +1,26 @@
+name: URL Validation
+on:
+  schedule:
+    - cron: '0 0 * * 1' # At 00:00 on Monday
+  workflow_dispatch:
+
+jobs:
+  validation:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+          fetch-depth: 0
+      - name: Setup Up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - run: pip install requests
+      - run: python tests/url-validation.py
+      - name: Upload Artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: URL Validiation Result
+          path: url-validation-result.json
+          retention-days: 7
diff --git a/tests/url-validation.py b/tests/url-validation.py
new file mode 100644
index 000000000..25d2f7b70
--- /dev/null
+++ b/tests/url-validation.py
@@ -0,0 +1,83 @@
+"""Validate the URL record of every domain file under ``domains/``.
+
+Every file is parsed as JSON. When it carries a ``record.URL`` value, the URL
+is sorted into "valid" (reachable), "invalid" (unreachable) or "non-http", and
+the outcome is written to ``url-validation-result.json`` in the working
+directory.
+"""
+import os
+import json
+import requests
+import concurrent.futures
+
+repository_directory = os.getcwd()
+domains_directory = os.path.join(repository_directory, "domains")
+
+# Seconds to wait on a server. requests applies no timeout by default, so one
+# unresponsive host could otherwise block a worker thread indefinitely.
+request_timeout = 10
+
+def has_url_field(file_path):
+    """Return the ``record.URL`` value of the JSON file, or None if absent."""
+    with open(file_path, "r", encoding="utf-8") as file:
+        data = json.load(file)
+    # The top level and "record" must both be objects before they are indexed.
+    record = data.get("record") if isinstance(data, dict) else None
+    if isinstance(record, dict) and "URL" in record:
+        return record["URL"]
+    return None
+
+def is_url_reachable(url: str):
+    """Return True if a HEAD request to *url* ends with a 1xx-3xx status."""
+    try:
+        response = requests.head(url, allow_redirects=True, timeout=request_timeout)
+    except requests.exceptions.RequestException:
+        return False
+    return 100 <= response.status_code < 400
+
+urls_data = {
+    "valid": {},
+    "invalid": {},
+    "non-http": {}
+}
+
+def handle_url_validation(file_path):
+    """Classify the URL found in *file_path* and record it in ``urls_data``."""
+    url = has_url_field(file_path)
+    if not url:
+        return
+    if not url.startswith(("http://", "https://")):
+        urls_data["non-http"][file_path] = url
+        print(f"URL '{url}' in file '{file_path}' is neither HTTP nor HTTPS.")
+    elif is_url_reachable(url):
+        urls_data["valid"][file_path] = url
+        print(f"URL '{url}' in file '{file_path}' is reachable.")
+    else:
+        urls_data["invalid"][file_path] = url
+        print(f"URL '{url}' in file '{file_path}' is not reachable.")
+
+max_threads = 20
+
+file_paths = []
+for root, _, files in os.walk(domains_directory):
+    for filename in files:
+        file_paths.append(os.path.join(root, filename))
+
+with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
+    futures = {
+        executor.submit(handle_url_validation, file_path): file_path
+        for file_path in file_paths
+    }
+    for future in concurrent.futures.as_completed(futures):
+        # Fetch each result so a failure inside a worker (unreadable file,
+        # invalid JSON, ...) is reported instead of vanishing with the future.
+        try:
+            future.result()
+        except Exception as error:
+            print(f"Could not validate '{futures[future]}': {error}")
+
+result_file_path = os.path.join(repository_directory, "url-validation-result.json")
+with open(result_file_path, "w", encoding="utf-8") as result_file:
+    json.dump(urls_data, result_file, indent=4)
+
+print(f"Results saved to {result_file_path}")