233 lines
6.6 KiB
Python
233 lines
6.6 KiB
Python
#!/usr/bin/env python3
|
|
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from itertools import zip_longest
|
|
from typing import NamedTuple, Optional
|
|
from urllib.parse import urlparse, urlencode, urlunparse
|
|
|
|
from bs4 import BeautifulSoup
|
|
import more_itertools as mlt
|
|
import pytz
|
|
import requests
|
|
|
|
# Hyperdia (www.hyperdia.com) endpoints: the CGI that returns route results,
# and the human-facing search page (used only to build a plausible Referer).
HYPERDIA_CGI = "http://www.hyperdia.com/en/cgi/search/en/hyperdia2.cgi"
HYPERDIA_SEARCH = "http://www.hyperdia.com/en/cgi/en/search.html"

# Template of POST form fields for the search CGI.  The empty-string entries
# are filled in per query by get_hyperdia_data(); the rest mirror the values
# the website itself submits.
HYPERDIA_PARAMS = {
    "dep_node": "",          # departure station (filled per query)
    "arv_node": "",          # arrival station (filled per query)
    "year": "",              # departure date/time components (filled per query)
    "month": "",
    "day": "",
    "hour": "",
    "minute": "",
    "search_type": "0",
    "transtime": "undefined",
    "max_route": "5",        # number of route alternatives requested
    "sort": "0",
    "faretype": "0",
    "ship": "off",
    "lmlimit": None,         # NOTE: urlencode/requests will send "None"/omit — matches site behavior
    "sum_target": "7",
    "facility": "reserved",
    "search_target": "route",
    "sprexprs": "on",        # include limited express trains
    "sprnozomi": "on",       # include Nozomi shinkansen
    "slpexprs": "on",        # include sleeper trains
    "jr": "on",              # include JR lines
    "privately": "on",       # include private (non-JR) lines
    "search_way": ""
}

# Browser-like request headers; Hyperdia expects a matching Host/Origin.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0',
    'Content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Host': 'www.hyperdia.com',
    'Origin': 'http://www.hyperdia.com'
}
|
|
|
|
|
|
@dataclass
class HyperdiaStep:
    """One leg of a Hyperdia journey: a single train ride, or a walking
    transfer between two stations (train_name == "Walk")."""

    start_station: str                 # boarding station name
    end_station: str                   # alighting station name
    start_time: datetime               # departure time (tz-aware, Japan time)
    end_time: datetime                 # arrival time (tz-aware, Japan time)
    # NOTE(review): annotated as str but parse_hyperdia_table stores a float
    # (minutes) here — confirm the intended type.
    duration: Optional[str] = None
    train_name: Optional[str] = None   # train service name, or "Walk"
    is_transfer: Optional[bool] = False # True for walking-transfer legs
|
|
|
|
|
|
def pairwise(iterable):
    """Chunk *iterable* into non-overlapping pairs: s -> (s0, s1), (s2, s3), ...

    A trailing unpaired element is silently dropped (zip stops at the
    shorter input).
    """
    return zip(*[iter(iterable)] * 2)
|
|
|
|
|
|
def grouped(iterable, n):
    """Chunk *iterable* into non-overlapping n-tuples:
    s -> (s0,...,sn-1), (sn,...,s2n-1), ...

    Any leftover tail shorter than n is dropped.
    """
    chunks = iter(iterable)
    return zip(*(chunks for _ in range(n)))
|
|
|
|
|
|
def get_hyperdia_data(start_station, end_station, hour, minute, day="15",
                      month="08", year="2020", via=None):
    """Query Hyperdia for routes between two stations.

    Parameters
    ----------
    start_station, end_station : str
        Station names as understood by Hyperdia.
    hour, minute, day, month, year : str
        Departure date/time components, passed through as form fields.
    via : sequence of str, optional
        Up to three intermediate stations the route must pass through.

    Returns
    -------
    requests.Response
        Raw response from the Hyperdia search CGI (HTML body).

    Raises
    ------
    ValueError
        If more than three via stations are given.
    """
    session = requests.Session()
    post_params = HYPERDIA_PARAMS.copy()
    headers = HEADERS.copy()

    post_params["dep_node"] = start_station
    post_params["arv_node"] = end_station
    post_params["year"] = year
    post_params["day"] = day
    post_params["month"] = month
    post_params["hour"] = hour
    post_params["minute"] = minute

    via_keys = ("via_node01", "via_node02", "via_node03")

    if via is None:
        via = ()
    elif len(via) > 3:
        raise ValueError("Only up to three through stations are allowed")

    # BUGFIX: the original called zip_longest(via, via_keys, fill_value="")
    # which (a) used the wrong keyword — itertools spells it "fillvalue" —
    # raising TypeError whenever via was supplied, and (b) had the arguments
    # swapped, so it would have written post_params[station] = "via_nodeNN".
    # Pad the station list with "" so all three via_node fields are present.
    for key, station in zip_longest(via_keys, via, fillvalue=""):
        post_params[key] = station

    # Hyperdia's CGI checks the Referer; build one that looks like a real
    # search-page URL carrying the same parameters in its query string.
    referer = list(urlparse(HYPERDIA_SEARCH))
    referer[4] = urlencode(post_params)
    headers["Referer"] = urlunparse(referer)

    session.headers.update(headers)

    result = session.post(HYPERDIA_CGI, data=post_params)

    return result
|
|
|
|
#TODO: Adjust this, use the Firefox inspector
|
|
# For now, keep this in mind:
|
|
# Odd rows per result: stations
|
|
# Even rows: Train names, transfers...
|
|
|
|
|
|
def parse_hyperdia_heading(soup, fare_number=1):
    """Extract the summary line of one Hyperdia result.

    soup: BeautifulSoup fragment for a single result.
    fare_number: 1-based index used in the fare span's HTML id.

    Returns a dict with "total_time", "transfer_num" and "total_distance"
    (text of the first three blue summary spans, in page order) plus
    "total_fare" as an int parsed from the "fare_total<N>" span (thousands
    separators removed).
    """
    summary_keys = ("total_time", "transfer_num", "total_distance")
    blue_spans = soup.find_all("span", class_="text_blue")

    # zip truncates at three entries, matching the original counter/break.
    data = {key: span.text for key, span in zip(summary_keys, blue_spans)}

    fare_span = soup.find("span", {"class": "text_blue",
                                   "id": f"fare_total{fare_number}"})
    data["total_fare"] = int(fare_span.text.replace(",", ""))

    return data
|
|
|
|
|
|
def parse_station_time(element, year, month, day, start=True):
    """Parse an HH:MM time cell into a timezone-aware datetime (Japan).

    element: BeautifulSoup cell whose stripped strings are one or more
        "HH:MM" times (transfer rows carry both arrival and departure).
    year, month, day: ints for the travel date.
    start: take the last time (departure) when True, the first (arrival)
        when False; walking segments in between are ignored.
    """
    times = list(element.stripped_strings)

    current_time = times[-1] if start else times[0]
    hour, minute = (int(part) for part in current_time.split(":"))

    # BUGFIX: the original passed tzinfo=pytz.timezone("Japan") straight to
    # the datetime constructor, which attaches the zone's first historical
    # offset (LMT, +09:19) instead of +09:00.  pytz requires localize() to
    # resolve the correct offset for the given date.
    naive = datetime(year, month, day, hour, minute)
    return pytz.timezone("Japan").localize(naive)
|
|
|
|
|
|
def parse_train_name(element):
    """Return the train name from a journey-info cell.

    The name lives in the single <li> of a <ul> inside the cell, wrapped
    in a <span>; only the first stripped string of that span is the name.
    """
    name_spans = element.select("td > ul > li > span")
    strings = list(name_spans[0].stripped_strings)
    return strings[0]
|
|
|
|
|
|
def parse_hyperdia_table(soup, year, month, day):
    """Parse one Hyperdia route table into a list of HyperdiaStep legs.

    soup: BeautifulSoup fragment containing the route's <tr> rows.
    year, month, day: ints forwarded to parse_station_time to build the
        timezone-aware departure/arrival datetimes of each leg.
    Returns a list of HyperdiaStep, one per train ride or walking transfer.
    """

    data = list()

    previous = 0  # NOTE(review): never read below — looks like dead code

    # Skip the heading and the row immediately afterwards (commuter pass).
    # Rows come in overlapping windows of 3 with step 2: each leg's arrival
    # row doubles as the next leg's departure row.
    for group in mlt.windowed(soup.find_all("tr")[2:], n=3, step=2):

        # Groups of 3 elements:
        # First row: start station (time in first column, station in column 3)
        # Second row: train information (duration in column 1, name in column 3)
        # Third row: arrival time(s) (same format as first row)
        # Times might be repeated more than once if it's a transfer

        start_info, journey_info, end_info = group
        startdata = start_info.find_all("td")[0:3]
        traindata = journey_info.find_all("td")[2]
        enddata = end_info.find_all("td")[0:3]
        # Ignore "add to favorities"
        start_station_name = list(startdata[2].stripped_strings)[0]
        start_station_time = parse_station_time(startdata[0], year, month, day,
                                                start=True)
        train_name = parse_train_name(traindata)
        end_station_name = list(enddata[2].stripped_strings)[0]
        end_station_time = parse_station_time(enddata[0], year, month, day,
                                              start=False)

        # Hyperdia renders walking transfers as a "train" named "Walk".
        is_transfer = True if train_name == "Walk" else False
        # NOTE(review): "% 60" discards whole hours (a 75-minute leg records
        # 15.0) and ".seconds" ignores day rollovers — confirm this is the
        # intended "minutes within the hour" semantics.
        duration = ((end_station_time - start_station_time).seconds / 60) % 60

        entry = HyperdiaStep(
            start_station=start_station_name,
            end_station=end_station_name,
            start_time=start_station_time,
            end_time=end_station_time,
            train_name=train_name,
            is_transfer=is_transfer,
            duration=duration)

        data.append(entry)

    return data
|
|
|
|
|
|
def parse_hyperdia_html(soup):
    """Summarise every route table on a Hyperdia results page.

    For each <table class="table"> the time/station spans are chunked into
    (time, station) pairs; the first pair is the journey start and the last
    pair the journey end.

    soup: BeautifulSoup of the full results page.
    Returns a list of dicts with keys "start", "starttime", "end",
    "endtime" (all strings), one dict per route table.
    """
    results = list()

    # BUGFIX/cleanup: the original also queried div.title2 into a local
    # ("titles") that was never used — dead work, removed.
    for table in soup.find_all("table", {"class": "table"}):
        # Times and station names alternate across these three span classes.
        spans = table.find_all(
            "span", {"class": ["text_16", "text_blue_l", "text_blue_p"]})
        pairs = list(pairwise(spans))
        start_time, start_station = pairs[0]
        end_time, end_station = pairs[-1]

        results.append({
            "start": start_station.text,
            "starttime": start_time.text,
            "end": end_station.text,
            "endtime": end_time.text.strip(),
        })

    return results
|