Source code for utils

import requests
from tqdm import tqdm
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

def ask_geonames(params, timeout=10):
    """Query the GeoNames ``searchJSON`` endpoint and return the decoded body.

    Args:
        params: Query-string parameters, forwarded unchanged to
            ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        Whatever ``response.json()`` decodes from the response body.

    Raises:
        requests.RequestException: If the request fails or times out.
            A body that is not valid JSON also makes ``response.json()``
            raise.
    """
    url = 'http://api.geonames.org/searchJSON'
    # requests applies no timeout by default, so an unresponsive server
    # would otherwise block the caller indefinitely.
    response = requests.get(url, params=params, timeout=timeout)
    # The HTTP status code is not inspected; the body is decoded as-is.
    return response.json()
def check_url(url):
    """Probe *url* with an HTTP HEAD request (10 second timeout).

    Args:
        url: The address to probe.

    Returns:
        A ``(url, status)`` tuple. ``status`` is the response's
        ``status_code``, or the string ``'Error'`` when the request raises
        ``requests.RequestException``.
    """
    try:
        status = requests.head(url, timeout=10).status_code
    except requests.RequestException:
        # Any requests-level failure is folded into a single sentinel value.
        status = 'Error'
    return url, status
def verify_uris(csv_file, uri_column='uri', *, max_workers=10,
                output_file='uri_verification_results.csv'):
    """Check every unique URI in a CSV column and report the failures.

    Each unique, non-missing value of ``uri_column`` is passed to
    ``check_url`` on a thread pool. A summary is printed (total checked,
    count and list of URIs whose status is ``'Error'``, count and list of
    URIs whose status is 404) and the full per-URI results are written to
    ``output_file`` as CSV with columns ``uri`` and ``status``.

    Args:
        csv_file: Path or buffer accepted by ``pandas.read_csv``.
        uri_column: Name of the column holding the URIs.
        max_workers: Size of the thread pool used for the checks.
        output_file: Destination of the detailed results CSV.

    Returns:
        None. Results are printed and saved to ``output_file``.

    Raises:
        KeyError: If ``uri_column`` is not a column of the CSV.
        Exception: Anything raised by ``check_url`` propagates through
            ``future.result()``.
    """
    df = pd.read_csv(csv_file)

    # Missing cells are not URIs; skip them instead of probing them.
    uris = df[uri_column].dropna().unique()

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(check_url, url) for url in uris]
        # as_completed yields in completion order, so rows in the results
        # are not guaranteed to follow the input order.
        for future in tqdm(as_completed(futures), total=len(futures),
                           desc="Checking URIs"):
            url, status = future.result()
            results.append({'uri': url, 'status': status})

    # Explicit columns keep the frame well-formed when there were no URIs;
    # without them the 'status' lookups below raise KeyError on empty input.
    results_df = pd.DataFrame(results, columns=['uri', 'status'])

    total_uris = len(results_df)
    error_uris = results_df[results_df['status'] == 'Error']
    not_found_uris = results_df[results_df['status'] == 404]

    print(f"\nTotal URIs checked: {total_uris}")
    print(f"URIs returning errors: {len(error_uris)}")
    print(f"URIs returning 404 Not Found: {len(not_found_uris)}")

    # Show the problematic URIs, if any.
    if not error_uris.empty:
        print("\nURIs with errors:")
        print(error_uris['uri'].tolist())

    if not not_found_uris.empty:
        print("\nURIs returning 404 Not Found:")
        print(not_found_uris['uri'].tolist())

    # Persist the full per-URI results.
    results_df.to_csv(output_file, index=False)
    print(f"\nDetailed results saved to '{output_file}'")