1
0
Fork 0

Basic working implementation

Only the command line interface is missing.
This commit is contained in:
Luca Beltrame 2020-01-05 19:36:48 +01:00
parent 464528f698
commit 05bf4b46ca
Signed by: einar
GPG key ID: 8DF631FD021DB0C5

View file

@ -3,16 +3,20 @@
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime from datetime import datetime
from itertools import zip_longest from itertools import zip_longest
from typing import NamedTuple, Optional import re
from typing import NamedTuple, Optional, List
from urllib.parse import urlparse, urlencode, urlunparse from urllib.parse import urlparse, urlencode, urlunparse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import more_itertools as mlt import more_itertools as mlt
import pandas as pd
import pytz import pytz
import requests import requests
from tabulate import tabulate
HYPERDIA_CGI = "http://www.hyperdia.com/en/cgi/search/en/hyperdia2.cgi" HYPERDIA_CGI = "http://www.hyperdia.com/en/cgi/search/en/hyperdia2.cgi"
HYPERDIA_SEARCH = "http://www.hyperdia.com/en/cgi/en/search.html" HYPERDIA_SEARCH = "http://www.hyperdia.com/en/cgi/en/search.html"
GROUP_MATCHER = re.compile(r".*No\.(?P<tracknum>[0-9]{1,}).*")
HYPERDIA_PARAMS = { HYPERDIA_PARAMS = {
"dep_node": "", "dep_node": "",
@ -58,22 +62,22 @@ class HyperdiaStep:
duration: Optional[str] = None duration: Optional[str] = None
train_name: Optional[str] = None train_name: Optional[str] = None
is_transfer: Optional[bool] = False is_transfer: Optional[bool] = False
start_track_number: Optional[int] = None
end_track_number: Optional[int] = None
def pairwise(iterable): @dataclass
"s -> (s0, s1), (s2, s3), (s4, s5), ..." class HyperdiaTrip:
a = iter(iterable)
return zip(a, a)
steps: List[HyperdiaStep]
def grouped(iterable, n): total_distance: int
"""s -> (s0,s1,s2,...sn-1), (sn,sn+1,sn+2,...s2n-1), total_time: int
(s2n,s2n+1,s2n+2,...s3n-1), ...""" total_cost: int
return zip(*[iter(iterable)]*n) transfers: int
def get_hyperdia_data(start_station, end_station, hour, minute, day="15", def get_hyperdia_data(start_station, end_station, hour, minute, day="15",
month="08", year="2020", via=None): month="08", year="2020", max_route=5, via=None):
session = requests.Session() session = requests.Session()
post_params = HYPERDIA_PARAMS.copy() post_params = HYPERDIA_PARAMS.copy()
@ -86,6 +90,7 @@ def get_hyperdia_data(start_station, end_station, hour, minute, day="15",
post_params["month"] = month post_params["month"] = month
post_params["hour"] = hour post_params["hour"] = hour
post_params["minute"] = minute post_params["minute"] = minute
post_params["max_route"] = max_route
if via is None: if via is None:
for element in ("via_node01", "via_node02", "via_node03"): for element in ("via_node01", "via_node02", "via_node03"):
@ -113,34 +118,24 @@ def get_hyperdia_data(start_station, end_station, hour, minute, day="15",
return result return result
#TODO: Adjust this, use the Firefox inspector
# For now, keep this in mind:
# Odd rows per result: stations
# Even rows: Train names, transfers...
def parse_hyperdia_heading(soup):
def parse_hyperdia_heading(soup, fare_number=1): # Heading (div class="title_r") with this structure:
# First span: total time in minutes
# Second span: number of transfers
# Third span: total distance in Km
# Fourth span: total cost in JPY
data = dict() elements = soup.select("span")[0:4]
mapping = {1: "total_time", 2: "transfer_num", 3: "total_distance"} total_time, transfers, distance, cost = [item.text.strip()
for item in elements]
counter = 1 cost = int(cost.replace(",", ""))
for element in soup.find_all("span", class_="text_blue"): return {"total_time": total_time, "transfers": transfers,
"total_distance": distance, "total_cost": cost}
if counter > 3:
break
data[mapping[counter]] = element.text
counter += 1
fare = soup.find("span", {"class": "text_blue",
"id": f"fare_total{fare_number}"})
fare = int(fare.text.replace(",", ""))
data["total_fare"] = fare
return data
def parse_station_time(element, year, month, day, start=True): def parse_station_time(element, year, month, day, start=True):
@ -150,8 +145,12 @@ def parse_station_time(element, year, month, day, start=True):
# Otherwise we get the only item # Otherwise we get the only item
current_time = times[-1] if start else times[0] current_time = times[-1] if start else times[0]
station_time = datetime(year, month, day, int(current_time.split(":")[0]),
int(current_time.split(":")[1]), hour, minutes = current_time.split(":")
station_time = datetime(year, int(month), int(day),
int(hour),
int(minutes),
tzinfo=pytz.timezone("Japan")) tzinfo=pytz.timezone("Japan"))
return station_time return station_time
@ -165,6 +164,20 @@ def parse_train_name(element):
return list(selected_item.stripped_strings)[0] return list(selected_item.stripped_strings)[0]
def parse_track_number(element):
    """Extract the track number from a station-name cell.

    The second ``<span>`` found in ``element`` carries the track text; the
    number is pulled out of it with ``GROUP_MATCHER`` (``No.<digits>``).

    Args:
        element: Parsed HTML node exposing ``select`` (a BeautifulSoup tag
            in this file's usage).

    Returns:
        The track number as an ``int``, or ``None`` when the cell has fewer
        than two spans, the second span is empty, or its text does not
        contain a ``No.<digits>`` part.
    """
    spans = element.select("span")
    if len(spans) < 2:
        # No track span at all: treat it the same as an empty one instead
        # of failing with IndexError.
        return None

    track_data = spans[1].text
    if not track_data:
        return None

    match = GROUP_MATCHER.search(track_data)
    if match is None:
        # Text is present but not in the expected form; subscripting a
        # failed match would raise TypeError.
        return None

    return int(match["tracknum"])
def parse_hyperdia_table(soup, year, month, day): def parse_hyperdia_table(soup, year, month, day):
data = list() data = list()
@ -186,6 +199,12 @@ def parse_hyperdia_table(soup, year, month, day):
enddata = end_info.find_all("td")[0:3] enddata = end_info.find_all("td")[0:3]
# Ignore "add to favorites" # Ignore "add to favorites"
start_station_name = list(startdata[2].stripped_strings)[0] start_station_name = list(startdata[2].stripped_strings)[0]
# Second span in the station name column contains the track number
# if applicable (if not, it's empty)
start_track_number = parse_track_number(startdata[2])
end_track_number = parse_track_number(enddata[2])
start_station_time = parse_station_time(startdata[0], year, month, day, start_station_time = parse_station_time(startdata[0], year, month, day,
start=True) start=True)
train_name = parse_train_name(traindata) train_name = parse_train_name(traindata)
@ -203,31 +222,87 @@ def parse_hyperdia_table(soup, year, month, day):
end_time=end_station_time, end_time=end_station_time,
train_name=train_name, train_name=train_name,
is_transfer=is_transfer, is_transfer=is_transfer,
duration=duration) duration=duration,
start_track_number=start_track_number,
end_track_number=end_track_number)
data.append(entry) data.append(entry)
return data return data
def parse_hyperdia_html(soup): def parse_hyperdia_html(soup, *args, **kwargs):
tables = soup.find_all("table", {"class": "table"}) tables = soup.find_all("table", {"class": "table"})
titles = soup.find_all("div", {"class": "title2"}) headings = soup.find_all("div", {"class": "title_r"})
results = list() results = list()
for data in tables:
properties = {}
extracted = data.find_all(
"span", {"class": ["text_16",
"text_blue_l", "text_blue_p"]})
parsed = list(pairwise(extracted))
start = parsed[0]
end = parsed[-1]
properties["start"] = start[1].text for heading, table in zip(headings, tables):
properties["starttime"] = start[0].text
properties["end"] = end[1].text parsed_heading = parse_hyperdia_heading(heading)
properties["endtime"] = end[0].text.strip() parsed_table = parse_hyperdia_table(table, *args, **kwargs)
results.append(properties)
trip = HyperdiaTrip(steps=parsed_table, **parsed_heading)
results.append(trip)
return results return results
def convert_trip_to_table(trip: HyperdiaTrip) -> pd.DataFrame:
    """Build a display table with one row per step of ``trip``.

    Args:
        trip: Object whose ``steps`` attribute is an iterable of steps
            providing ``start_station``, ``start_time``,
            ``start_track_number``, ``end_station``, ``end_time``,
            ``end_track_number``, ``duration`` and ``train_name``.

    Returns:
        A ``pandas.DataFrame`` of strings with the columns listed below.
        Missing track numbers, a missing duration and any remaining null
        cell are shown as ``"-"``.
    """
    columns = ["From", "Departure time", "Departure track",
               "To", "Arrival time", "Arrival track", "Duration",
               "Train / Transfer"]

    def format_track(track_number):
        # Compare against None, not truthiness: 0 is a valid number and
        # must not be rendered as a missing track.
        return "-" if track_number is None else f"{track_number:.0f}"

    rows = []
    for element in trip.steps:
        # ``duration`` defaults to None on a step; a numeric format spec
        # on None would raise TypeError.
        duration = ("-" if element.duration is None
                    else f"{element.duration:.0f} minutes")
        rows.append((element.start_station,
                     f"{element.start_time: %H:%M}",
                     format_track(element.start_track_number),
                     element.end_station,
                     f"{element.end_time: %H:%M}",
                     format_track(element.end_track_number),
                     duration,
                     element.train_name))

    df = pd.DataFrame.from_records(rows, columns=columns)
    # Any field left as None (e.g. a missing train name) becomes "-".
    return df.fillna("-")
def trip_summary(trip: HyperdiaTrip) -> str:
    """Render ``trip`` as a GitHub-style table followed by a totals line.

    The step table comes from ``convert_trip_to_table`` and is formatted
    with ``tabulate``; the totals line reports the trip's total time,
    distance and cost. Both parts are followed by a blank line.
    """
    rendered_steps = tabulate(convert_trip_to_table(trip),
                              tablefmt="github",
                              headers="keys",
                              showindex=False)

    totals = "".join((
        f"Total time: {trip.total_time} minutes,",
        f" Total distance: {trip.total_distance},",
        f" Total cost {trip.total_cost} JPY",
    ))

    # Joining with a trailing empty item leaves "\n\n" after each part.
    return "\n\n".join((rendered_steps, totals, ""))
def hyperdia_search(start_station: str, end_station: str, hour: int,
                    minute: int, day: str = "15", month: str = "08",
                    year: int = 2020, max_route: int = 5,
                    via: Optional[List[str]] = None) -> List[str]:
    """Query Hyperdia for routes and print a summary of each one.

    Fetches the result page with ``get_hyperdia_data``, parses it with
    ``parse_hyperdia_html`` and formats every trip with ``trip_summary``.

    Args:
        start_station: Departure station name.
        end_station: Arrival station name.
        hour: Hour forwarded to the search request.
        minute: Minute forwarded to the search request.
        day: Day forwarded to the request and to the parser.
        month: Month forwarded to the request and to the parser.
        year: Year forwarded to the request and to the parser.
        max_route: Forwarded to the request as ``max_route``.
        via: Optional intermediate stations, forwarded unchanged.

    Returns:
        The summary text of each parsed trip, in the order the parser
        produced them. Each summary is also printed to standard output.
    """
    raw_result = get_hyperdia_data(start_station, end_station,
                                   hour, minute, day, month, year, max_route,
                                   via)
    soup = BeautifulSoup(raw_result.text, "html.parser")
    results = parse_hyperdia_html(soup, year=year, month=month, day=day)

    # The signature promises List[str]; collect the summaries so the
    # function no longer returns None, while still printing each one.
    summaries = []
    for trip in results:
        summary = trip_summary(trip)
        print(summary)
        summaries.append(summary)

    return summaries