final test
This commit is contained in:
parent
4e9e2dc4be
commit
1b854d6a20
Binary file not shown.
|
@ -0,0 +1,169 @@
|
|||
#!/bin/python
import pdfplumber

# 1. Parse pdf file to extract the coordinates

# The coordinates are the only text on the first page; grab and trim them.
input_file = "circlek.pdf"
coord_str = ""
with pdfplumber.open(input_file) as pdf:
    first_page = pdf.pages[0]
    coord_str = first_page.extract_text_simple().strip()

print(f"1. coords are: {coord_str}")
|
||||
|
||||
|
||||
# 2. Search for the address that corresponds to this particular location
# (Nominatim does not require API key, LocationIQ
# and OpenCage Geocoder do not require credit card number to access their API)


# 2.1. convert minutes and seconds to fractional degrees
def geo_convert(dms_str):
    """Convert one DMS coordinate string (e.g. 59°26'14"N) to fractional degrees.

    Minutes and seconds may be omitted (59°26'N and 59°N are accepted);
    a trailing S or W direction makes the result negative.

    Raises ValueError for strings that do not look like a DMS coordinate
    (wrong number of fields, non-numeric fields, or an unknown direction).
    """
    parts = dms_str.replace("°", " ").replace("'", " ").replace('"', " ").split()
    # 2..4 fields: degrees [+ minutes [+ seconds]] + direction letter
    if not 2 <= len(parts) <= 4:
        raise ValueError(f"cannot parse DMS coordinate: {dms_str!r}")
    *numbers, direction = parts
    if direction not in ('N', 'S', 'E', 'W'):
        raise ValueError(f"unknown direction {direction!r} in {dms_str!r}")
    # degrees + minutes/60 + seconds/3600
    decimal = sum(float(value) / 60 ** i for i, value in enumerate(numbers))
    if direction in ('S', 'W'):
        decimal *= -1
    return decimal
|
||||
|
||||
# The PDF text holds two whitespace-separated DMS fields: latitude, longitude.
raw_lat, raw_lon = coord_str.split()
lat = geo_convert(raw_lat)
lon = geo_convert(raw_lon)

# 2.2 call the api
import requests
import time

# Query parameters for the reverse-geocoding request.
req_params = {
    'lat': lat,
    'lon': lon,
    'format': 'json',
}

# Nominatim's usage policy asks for an identifying User-Agent.
req_headers = {
    'User-Agent': 'ondrej hladuvka (mail@hladu.xyz)',
}

# Shared retry policy for every HTTP call below.
tries = 5
timebreak = 1
|
||||
# Retry the reverse-geocoding call a few times before giving up.
# A network-level error (DNS, refused connection, timeout) counts as a
# failed attempt instead of crashing the script, and `response` is
# pre-seeded so the check below works even if the loop body never ran.
response = None
for i in range(tries):
    try:
        response = requests.get("https://nominatim.openstreetmap.org/reverse",
                                params=req_params, headers=req_headers)
        if response.status_code == 200:
            break
    except requests.RequestException:
        response = None
    time.sleep(timebreak)

if response is None or response.status_code != 200:
    print("openstreetmap api: not available")
    exit(1)
|
||||
|
||||
# 2.3 parse response
resp_data = response.json()

# Bail out early unless the payload carries an 'address' object.
if not (resp_data and 'address' in resp_data):
    print("openstreetmap api: no address returned")
    exit(1)

address = resp_data['address']
# All three components are needed to build the display string below.
if any(part not in address for part in ('road', 'quarter', 'city')):
    print("openstreetmap api: cannot parse address")
    exit(1)
print(f"2. address is: {address['road']} {address['quarter']}, {address['city']}")
|
||||
|
||||
|
||||
|
||||
# 3. Investigate what business is located at that particular address

# The business name sits in the top-level 'name' field of the same reply.
if not (resp_data and 'name' in resp_data):
    print("openstreetmap api: no business returned")
    exit(1)

bsn_name = resp_data['name']

print(f"3. business is: {bsn_name}")
|
||||
|
||||
|
||||
|
||||
# 4. Search for the web-page of this organization
# (a simple request to google.com/search?q=… and CircleK will be the first result)

## I didn't figure out how to overcome Google's anti-scraping,
## so I'm using DuckDuckGo instead.

data = {'q': bsn_name + " ee"} # fix to get the same first result as google ;)

# Same retry scheme as the geocoding call: network errors and non-200
# replies both count as failed attempts instead of crashing.
response = None
for i in range(tries):
    try:
        response = requests.post("https://html.duckduckgo.com/html/",
                                 data=data, headers=req_headers)
        if response.status_code == 200:
            break
    except requests.RequestException:
        response = None
    time.sleep(timebreak)

if response is None or response.status_code != 200:
    print("duckduckgo: not available")
    exit(1)
|
||||
|
||||
# 4.1. parse html links <- duckduckgo has rel="nofollow" with all results
html = response.text
results = []
start = 0

# Scan for every result anchor; DuckDuckGo marks them all rel="nofollow".
marker = '<a rel="nofollow" href="'
while True:
    anchor_start = html.find(marker, start)
    if anchor_start == -1:
        break
    href_start = anchor_start + len(marker)
    href_end = html.find('"', href_start)
    link = html[href_start:href_end]
    results.append(link)
    start = href_end

# Guard against an empty result page -- otherwise results[0] below raises
# IndexError (section 5 already guards the identical situation).
if len(results) == 0:
    print("duckduckgo: no results found")
    exit(1)

link = results[0]
print(f"4. search result is: {link}")
|
||||
|
||||
|
||||
|
||||
# 5. Find out where their web-page is located, parse it,
# extract the logo (its in svg format).

# Fetch the page with GET -- the right verb for retrieving a document
# (the original POST only worked where servers happened to tolerate it).
# Network errors count as failed attempts, same as the earlier calls.
response = None
for i in range(tries):
    try:
        response = requests.get(link, headers=req_headers)
        if response.status_code == 200:
            break
    except requests.RequestException:
        response = None
    time.sleep(timebreak)

if response is None or response.status_code != 200:
    print(f"{link}: not available")
    exit(1)


html = response.text

# Collect every absolute src=… URL that is an svg with 'logo' in its name.
# Use a separate variable for candidates so the page URL in `link` is not
# clobbered (the original reused `link`, so the error message below printed
# the last scanned src attribute instead of the page address).
results = []
start = 0
while True:
    href_index = html.find('src="https://', start)
    if href_index == -1:
        break
    start_quote = href_index + len('src="')
    end_quote = html.find('"', start_quote)
    candidate = html[start_quote:end_quote]
    if candidate.endswith('.svg') and 'logo' in candidate:
        results.append(candidate)
    start = end_quote

if len(results) == 0:
    print(f"{link}: cannot find any svg resource named logo")
    exit(1)

logo_link = results[0]
print(f"5. logo found at: {logo_link}")
|
||||
|
||||
|
||||
# 5.1. open the logo ;)
ui_input = input("do you want to open it? [y/N] ")
# Only an explicit yes ("y", "Y", "yes", …) opens the link; the original
# substring test also fired on answers like "nay" that merely contain a y.
if not ui_input.strip().lower().startswith('y'):
    exit(0)

import os
import subprocess

if os.name == 'posix':
    # argument list + shell=False, so a crafted URL scraped from the page
    # cannot inject shell commands (os.system with an f-string could)
    subprocess.run(['xdg-open', logo_link])
elif os.name == 'nt': # hope this works, i dont have win machine to test this on :)
    # ShellExecute opens URLs in the default handler, no shell involved
    os.startfile(logo_link)
else:
    print("sorry, cant do this on your system :c")
|
Loading…
Reference in New Issue