diff --git a/final/circlek.pdf b/final/circlek.pdf new file mode 100644 index 0000000..b741752 Binary files /dev/null and b/final/circlek.pdf differ diff --git a/final/final.py b/final/final.py new file mode 100755 index 0000000..8131789 --- /dev/null +++ b/final/final.py @@ -0,0 +1,169 @@ +#!/bin/python +import pdfplumber + +# 1. Parse pdf file to extract the coordinates +coord_str = "" +input_file = "circlek.pdf" +with pdfplumber.open(input_file) as pdf: + coord_str = pdf.pages[0].extract_text_simple().strip() + +print(f"1. coords are: {coord_str}") + + +# 2. Search for the address that corresponds to this particular location +# (Nominatim does not require API key, LocationIQ +# and OpenCage Geocoder do not require credit card number to access their API) + +# 2.1. convert minutes and seconds to fractional degrees +def geo_convert(dms_str): + parts = dms_str.replace("°", " ").replace("'", " ").replace('"', " ").split() + degrees, minutes, seconds, direction = parts + decimal = float(degrees) + float(minutes) / 60 + float(seconds) / 3600 + if direction in ['S', 'W']: + decimal *= -1 + return decimal + +lat_str, lon_str = coord_str.split() +lat = geo_convert(lat_str) +lon = geo_convert(lon_str) + +# 2.2 call the api +import requests +import time + +req_params = { + 'lat': lat, + 'lon': lon, + 'format': 'json' +} + +req_headers = { + 'User-Agent': 'ondrej hladuvka (mail@hladu.xyz)' +} + +tries = 5 +timebreak = 1 +for i in range(tries): + response = requests.get("https://nominatim.openstreetmap.org/reverse", + params=req_params, headers=req_headers) + if response.status_code == 200: + break + time.sleep(timebreak) + +if response is None or response.status_code != 200: + print("openstreetmap api: not avaliable") + exit(1) + +# 2.3 parse response +resp_data = response.json() + +if not resp_data or 'address' not in resp_data: + print("openstreetmap api: no address returned") + exit(1) + +address = resp_data['address'] +if 'road' not in address or 'quarter' not in address or 'city' not in address: + print("openstreetmap api: cannot parse address") + exit(1) +print(f"2. address is: {address['road']} {address['quarter']}, {address['city']}") + + + +# 3. Investigate what business is located at that particular address +if not resp_data or 'name' not in resp_data: + print("openstreetmap api: no business returned") + exit(1) + +bsn_name = resp_data['name'] + +print(f"3. business is: {bsn_name}") + + + +# 4. Search for the web-page of this organization +# (a simple request to google.com/search?q=… and CircleK will be the first result + +## i didnt figure out how to overcome googles anti-scraping +## so im using duckduckgo insted + +data = {'q': bsn_name + " ee"} # fix to get the same first result as google ;) +for i in range(tries): + response = requests.post("https://html.duckduckgo.com/html/", + data=data, headers=req_headers) + if response.status_code == 200: + break + time.sleep(timebreak) + +if response is None or response.status_code != 200: + print("duckduckgo: not avaliable") + exit(1) + +# 4.1. parse html links <- duckduckgo has rel="nofollow" with all results +html = response.text +results = [] +start = 0 + +while True: + anchor_start = html.find('