1
0
Fork 0

Basic working implementation

Only the command line interface is missing.
This commit is contained in:
Luca Beltrame 2020-01-05 19:36:48 +01:00
parent 464528f698
commit 05bf4b46ca
Signed by: einar
GPG key ID: 8DF631FD021DB0C5

View file

@ -3,16 +3,20 @@
from dataclasses import dataclass
from datetime import datetime
from itertools import zip_longest
from typing import NamedTuple, Optional
import re
from typing import NamedTuple, Optional, List
from urllib.parse import urlparse, urlencode, urlunparse
from bs4 import BeautifulSoup
import more_itertools as mlt
import pandas as pd
import pytz
import requests
from tabulate import tabulate
HYPERDIA_CGI = "http://www.hyperdia.com/en/cgi/search/en/hyperdia2.cgi"
HYPERDIA_SEARCH = "http://www.hyperdia.com/en/cgi/en/search.html"
GROUP_MATCHER = re.compile(r".*No\.(?P<tracknum>[0-9]{1,}).*")
HYPERDIA_PARAMS = {
"dep_node": "",
@ -58,22 +62,22 @@ class HyperdiaStep:
duration: Optional[str] = None
train_name: Optional[str] = None
is_transfer: Optional[bool] = False
start_track_number: Optional[int] = None
end_track_number: Optional[int] = None
def pairwise(iterable):
    """Group consecutive items into non-overlapping pairs.

    s -> (s0, s1), (s2, s3), (s4, s5), ...

    A trailing item without a partner is dropped, because ``zip`` stops
    as soon as the shared iterator runs out.
    """
    # The same iterator object fills both slots of every pair, so each
    # pair consumes two successive items.
    return zip(*[iter(iterable)] * 2)
@dataclass
class HyperdiaTrip:
def grouped(iterable, n):
    """Split *iterable* into consecutive, non-overlapping tuples of size *n*.

    s -> (s0,s1,s2,...sn-1), (sn,sn+1,sn+2,...s2n-1),
    (s2n,s2n+1,s2n+2,...s3n-1), ...

    Leftover items that cannot fill a whole tuple are dropped.
    """
    # One iterator shared by all n slots: each output tuple pulls n
    # successive items from it.
    shared = iter(iterable)
    return zip(*(shared for _ in range(n)))
steps: List[HyperdiaStep]
total_distance: int
total_time: int
total_cost: int
transfers: int
def get_hyperdia_data(start_station, end_station, hour, minute, day="15",
month="08", year="2020", via=None):
month="08", year="2020", max_route=5, via=None):
session = requests.Session()
post_params = HYPERDIA_PARAMS.copy()
@ -86,6 +90,7 @@ def get_hyperdia_data(start_station, end_station, hour, minute, day="15",
post_params["month"] = month
post_params["hour"] = hour
post_params["minute"] = minute
post_params["max_route"] = max_route
if via is None:
for element in ("via_node01", "via_node02", "via_node03"):
@ -98,7 +103,7 @@ def get_hyperdia_data(start_station, end_station, hour, minute, day="15",
for node, station in zip_longest(
via,
("via_node01", "via_node02", "via_node03"),
fill_value=""):
fill_value=""):
post_params[node] = station
@ -113,34 +118,24 @@ def get_hyperdia_data(start_station, end_station, hour, minute, day="15",
return result
#TODO: Adjust this, use the Firefox inspector
# For now, keep this in mind:
# Odd rows per result: stations
# Even rows: Train names, transfers...
def parse_hyperdia_heading(soup):
def parse_hyperdia_heading(soup, fare_number=1):
# Heading (div class="title_r") with this structure:
# First span: total time in minutes
# Second span: number of transfers
# Third span: total distance in Km
# Fourth span: total cost in JPY
data = dict()
elements = soup.select("span")[0:4]
mapping = {1: "total_time", 2: "transfer_num", 3: "total_distance"}
total_time, transfers, distance, cost = [item.text.strip()
for item in elements]
counter = 1
cost = int(cost.replace(",", ""))
for element in soup.find_all("span", class_="text_blue"):
if counter > 3:
break
data[mapping[counter]] = element.text
counter += 1
fare = soup.find("span", {"class": "text_blue",
"id": f"fare_total{fare_number}"})
fare = int(fare.text.replace(",", ""))
data["total_fare"] = fare
return data
return {"total_time": total_time, "transfers": transfers,
"total_distance": distance, "total_cost": cost}
def parse_station_time(element, year, month, day, start=True):
@ -150,8 +145,12 @@ def parse_station_time(element, year, month, day, start=True):
# Otherwise we get the only item
current_time = times[-1] if start else times[0]
station_time = datetime(year, month, day, int(current_time.split(":")[0]),
int(current_time.split(":")[1]),
hour, minutes = current_time.split(":")
station_time = datetime(year, int(month), int(day),
int(hour),
int(minutes),
tzinfo=pytz.timezone("Japan"))
return station_time
@ -165,6 +164,20 @@ def parse_train_name(element):
return list(selected_item.stripped_strings)[0]
def parse_track_number(element):
    """Extract the track number from a station-name cell.

    The second ``span`` in the station name column contains the track
    number if applicable (if not, it's empty). The number is read from
    that text with ``GROUP_MATCHER`` (``No.<digits>``).

    Args:
        element: Object exposing ``select()``; in this file it is called
            with the station-name ``td`` of a result row.

    Returns:
        The track number as an ``int``, or ``None`` when the cell has no
        second span, the span is empty, or its text holds no
        ``No.<digits>`` token.
    """
    spans = element.select("span")
    if len(spans) < 2:
        # No track span at all: treat it the same as an empty one.
        return None

    # search() returns None both for empty text and for text without a
    # "No.<digits>" token (e.g. whitespace only), so one check covers both
    # instead of failing when subscripting None.
    match = GROUP_MATCHER.search(spans[1].text)
    if match is None:
        return None

    return int(match["tracknum"])
def parse_hyperdia_table(soup, year, month, day):
data = list()
@ -186,6 +199,12 @@ def parse_hyperdia_table(soup, year, month, day):
enddata = end_info.find_all("td")[0:3]
# Ignore "add to favorites"
start_station_name = list(startdata[2].stripped_strings)[0]
# Second span in the station name column contains the track number
# if applicable (if not, it's empty)
start_track_number = parse_track_number(startdata[2])
end_track_number = parse_track_number(enddata[2])
start_station_time = parse_station_time(startdata[0], year, month, day,
start=True)
train_name = parse_train_name(traindata)
@ -203,31 +222,87 @@ def parse_hyperdia_table(soup, year, month, day):
end_time=end_station_time,
train_name=train_name,
is_transfer=is_transfer,
duration=duration)
duration=duration,
start_track_number=start_track_number,
end_track_number=end_track_number)
data.append(entry)
return data
def parse_hyperdia_html(soup):
def parse_hyperdia_html(soup, *args, **kwargs):
tables = soup.find_all("table", {"class": "table"})
titles = soup.find_all("div", {"class": "title2"})
headings = soup.find_all("div", {"class": "title_r"})
results = list()
for data in tables:
properties = {}
extracted = data.find_all(
"span", {"class": ["text_16",
"text_blue_l", "text_blue_p"]})
parsed = list(pairwise(extracted))
start = parsed[0]
end = parsed[-1]
properties["start"] = start[1].text
properties["starttime"] = start[0].text
properties["end"] = end[1].text
properties["endtime"] = end[0].text.strip()
results.append(properties)
for heading, table in zip(headings, tables):
parsed_heading = parse_hyperdia_heading(heading)
parsed_table = parse_hyperdia_table(table, *args, **kwargs)
trip = HyperdiaTrip(steps=parsed_table, **parsed_heading)
results.append(trip)
return results
def convert_trip_to_table(trip: HyperdiaTrip) -> pd.DataFrame:
    """Build a table with one row per step of *trip*.

    Args:
        trip: Trip whose ``steps`` are read. Each step supplies
            ``start_station``, ``start_time``, ``start_track_number``,
            ``end_station``, ``end_time``, ``end_track_number``,
            ``duration`` and ``train_name``.

    Returns:
        A DataFrame of display values with the columns "From",
        "Departure time", "Departure track", "To", "Arrival time",
        "Arrival track", "Duration" and "Train / Transfer". Missing
        values are shown as "-".
    """
    columns = ["From", "Departure time", "Departure track",
               "To", "Arrival time", "Arrival track", "Duration",
               "Train / Transfer"]

    def format_track(track_number):
        # Compare against None explicitly: 0 is a legitimate track number
        # and must not be rendered as "missing".
        return "-" if track_number is None else f"{track_number:.0f}"

    def format_duration(duration):
        # Leave a missing duration as None so that fillna() below turns it
        # into "-" instead of the numeric format raising on None.
        return None if duration is None else f"{duration:.0f} minutes"

    rows = [
        (
            step.start_station,
            # No space after the colon: everything after it is passed on
            # as the strftime format, so a space would end up in the cell.
            f"{step.start_time:%H:%M}",
            format_track(step.start_track_number),
            step.end_station,
            f"{step.end_time:%H:%M}",
            format_track(step.end_track_number),
            format_duration(step.duration),
            step.train_name,
        )
        for step in trip.steps
    ]

    df = pd.DataFrame.from_records(rows, columns=columns)
    return df.fillna("-")
def trip_summary(trip: HyperdiaTrip) -> str:
    """Render *trip* as text: a GitHub-style table of its steps, then totals.

    The step table comes from ``convert_trip_to_table`` and is followed by
    a blank line, one line with the trip's total time, distance and cost,
    and a trailing blank line.
    """
    steps_frame = convert_trip_to_table(trip)
    rendered_steps = tabulate(steps_frame, headers="keys",
                              tablefmt="github", showindex=False)

    totals_line = (f"Total time: {trip.total_time} minutes,"
                   f" Total distance: {trip.total_distance},"
                   f" Total cost {trip.total_cost} JPY")

    return f"{rendered_steps}\n\n{totals_line}\n\n"
def hyperdia_search(start_station: str, end_station: str, hour: int,
                    minute: int, day: str = "15", month: str = "08",
                    year: int = 2020, max_route: int = 5,
                    via: Optional[List[str]] = None) -> List[str]:
    """Query Hyperdia for routes, print each one and return the summaries.

    The arguments are forwarded positionally to ``get_hyperdia_data``; the
    returned page is parsed with BeautifulSoup and ``parse_hyperdia_html``,
    and every resulting trip is formatted with ``trip_summary``.

    Args:
        start_station: Departure station.
        end_station: Arrival station.
        hour: Hour of the search time.
        minute: Minute of the search time.
        day: Day of the search date.
        month: Month of the search date.
        year: Year of the search date.
        max_route: Forwarded to ``get_hyperdia_data`` as ``max_route``.
        via: Optional intermediate stations, forwarded unchanged.

    Returns:
        One summary string per parsed trip, in the order the trips were
        parsed. Each summary is also printed as it is produced.
    """
    raw_result = get_hyperdia_data(start_station, end_station,
                                   hour, minute, day, month, year, max_route,
                                   via)
    soup = BeautifulSoup(raw_result.text, "html.parser")
    trips = parse_hyperdia_html(soup, year=year, month=month, day=day)

    summaries = []
    for trip in trips:
        summary = trip_summary(trip)
        # Print as we go so earlier trips are shown even if a later one
        # fails to format.
        print(summary)
        summaries.append(summary)

    return summaries