#!/usr/bin/env python3
import pdfplumber
import requests
import time

# 1. Parse the pdf file to extract the coordinates
input_file = "circlek.pdf"
with pdfplumber.open(input_file) as pdf:
    coord_str = pdf.pages[0].extract_text_simple().strip()
print(f"1. coords are: {coord_str}")

# 2. Search for the address that corresponds to this particular location
#    (Nominatim does not require an API key; LocationIQ and OpenCage Geocoder
#    do not require a credit card number to access their APIs)

# 2.1. convert degrees, minutes and seconds to fractional degrees
def geo_convert(dms_str):
    parts = dms_str.replace("°", " ").replace("'", " ").replace('"', " ").split()
    degrees, minutes, seconds, direction = parts
    decimal = float(degrees) + float(minutes) / 60 + float(seconds) / 3600
    if direction in ['S', 'W']:  # southern/western hemispheres are negative
        decimal *= -1
    return decimal
# e.g. (hypothetical input) 59°26'14.0"N -> 59 + 26/60 + 14.0/3600 ≈ 59.4372

lat_str, lon_str = coord_str.split()
lat = geo_convert(lat_str)
lon = geo_convert(lon_str)

# 2.2. call the api
req_params = {
    'lat': lat,
    'lon': lon,
    'format': 'json'
}
req_headers = {
    'User-Agent': 'ondrej hladuvka (mail@hladu.xyz)'
}

tries = 5
timebreak = 1  # seconds between retries

response = None
for i in range(tries):
    try:
        response = requests.get("https://nominatim.openstreetmap.org/reverse",
                                params=req_params, headers=req_headers)
    except requests.RequestException:
        response = None  # network error, retry
    if response is not None and response.status_code == 200:
        break
    time.sleep(timebreak)

if response is None or response.status_code != 200:
    print("openstreetmap api: not available")
    exit(1)

# 2.3. parse the response, a json object roughly of the form
#      {"name": ..., "address": {"road": ..., "quarter": ..., "city": ..., ...}, ...}
resp_data = response.json()
if not resp_data or 'address' not in resp_data:
    print("openstreetmap api: no address returned")
    exit(1)

address = resp_data['address']
if 'road' not in address or 'quarter' not in address or 'city' not in address:
    print("openstreetmap api: cannot parse address")
    exit(1)

print(f"2. address is: {address['road']} {address['quarter']}, {address['city']}")

# 3. Investigate what business is located at that particular address
if 'name' not in resp_data:
    print("openstreetmap api: no business returned")
    exit(1)

bsn_name = resp_data['name']
print(f"3. business is: {bsn_name}")

# 4. Search for the web-page of this organization
#    (a simple request to google.com/search?q=… would give Circle K as the
#    first result)
## I didn't figure out how to overcome google's anti-scraping,
## so I'm using duckduckgo instead
data = {'q': bsn_name + " ee"}  # append " ee" to get the same first result as google ;)

response = None
for i in range(tries):
    try:
        response = requests.post("https://html.duckduckgo.com/html/",
                                 data=data, headers=req_headers)
    except requests.RequestException:
        response = None  # network error, retry
    if response is not None and response.status_code == 200:
        break
    time.sleep(timebreak)

if response is None or response.status_code != 200:
    print("duckduckgo: not available")
    exit(1)

# 4.1. parse html links <- duckduckgo marks all result links rel="nofollow"
html = response.text
results = []
start = 0
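## the original script was truncated mid-statement at this point; the loop
## below is a minimal reconstruction of the link scan it begins, assuming
## each result is an <a> tag carrying rel="nofollow" followed by an href
## attribute (the exact markup served by html.duckduckgo.com may differ,
## and the hrefs may be duckduckgo redirect links rather than direct urls)
while True:
    anchor_start = html.find('<a rel="nofollow"', start)
    if anchor_start == -1:
        break  # no more result links
    href_start = html.find('href="', anchor_start)
    if href_start == -1:
        break  # anchor without href, stop scanning
    href_start += len('href="')
    href_end = html.find('"', href_start)
    results.append(html[href_start:href_end])
    start = href_end

if not results:
    print("duckduckgo: no results parsed")
    exit(1)

print(f"4. web-page is: {results[0]}")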
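## usage sketch (the script filename is an assumption; circlek.pdf is the
## hard-coded input above):
##   $ pip install pdfplumber requests
##   $ python3 circlek.py   # expects circlek.pdf in the same directory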