diff --git a/hyperdia.py b/hyperdia.py index f6b9738..fa4835f 100644 --- a/hyperdia.py +++ b/hyperdia.py @@ -3,16 +3,20 @@ from dataclasses import dataclass from datetime import datetime from itertools import zip_longest -from typing import NamedTuple, Optional +import re +from typing import NamedTuple, Optional, List from urllib.parse import urlparse, urlencode, urlunparse from bs4 import BeautifulSoup import more_itertools as mlt +import pandas as pd import pytz import requests +from tabulate import tabulate HYPERDIA_CGI = "http://www.hyperdia.com/en/cgi/search/en/hyperdia2.cgi" HYPERDIA_SEARCH = "http://www.hyperdia.com/en/cgi/en/search.html" +GROUP_MATCHER = re.compile(r".*No\.(?P<tracknum>[0-9]{1,}).*") HYPERDIA_PARAMS = { "dep_node": "", @@ -58,22 +62,22 @@ class HyperdiaStep: duration: Optional[str] = None train_name: Optional[str] = None is_transfer: Optional[bool] = False + start_track_number: Optional[int] = None + end_track_number: Optional[int] = None -def pairwise(iterable): - "s -> (s0, s1), (s2, s3), (s4, s5), ..." 
- a = iter(iterable) - return zip(a, a) +@dataclass +class HyperdiaTrip: - -def grouped(iterable, n): - """s -> (s0,s1,s2,...sn-1), (sn,sn+1,sn+2,...s2n-1), - (s2n,s2n+1,s2n+2,...s3n-1), ...""" - return zip(*[iter(iterable)]*n) + steps: List[HyperdiaStep] + total_distance: int + total_time: int + total_cost: int + transfers: int def get_hyperdia_data(start_station, end_station, hour, minute, day="15", - month="08", year="2020", via=None): + month="08", year="2020", max_route=5, via=None): session = requests.Session() post_params = HYPERDIA_PARAMS.copy() @@ -86,6 +90,7 @@ def get_hyperdia_data(start_station, end_station, hour, minute, day="15", post_params["month"] = month post_params["hour"] = hour post_params["minute"] = minute + post_params["max_route"] = max_route if via is None: for element in ("via_node01", "via_node02", "via_node03"): @@ -98,7 +103,7 @@ def get_hyperdia_data(start_station, end_station, hour, minute, day="15", for node, station in zip_longest( via, ("via_node01", "via_node02", "via_node03"), - fill_value=""): + fill_value=""): post_params[node] = station @@ -113,34 +118,24 @@ def get_hyperdia_data(start_station, end_station, hour, minute, day="15", return result -#TODO: Adjust this, use the Firefox inspector -# For now, keep this in mind: -# Odd rows per result: stations -# Even rows: Train names, transfers... 
+def parse_hyperdia_heading(soup): -def parse_hyperdia_heading(soup, fare_number=1): + # Heading (div class="title_r") with this structure: + # First span: total time in minutes + # Second span: number of transfers + # Third span: total distance in Km + # Fourth span: total cost in JPY - data = dict() + elements = soup.select("span")[0:4] - mapping = {1: "total_time", 2: "transfer_num", 3: "total_distance"} + total_time, transfers, distance, cost = [item.text.strip() + for item in elements] - counter = 1 + cost = int(cost.replace(",", "")) - for element in soup.find_all("span", class_="text_blue"): - - if counter > 3: - break - - data[mapping[counter]] = element.text - counter += 1 - - fare = soup.find("span", {"class": "text_blue", - "id": f"fare_total{fare_number}"}) - fare = int(fare.text.replace(",", "")) - data["total_fare"] = fare - - return data + return {"total_time": total_time, "transfers": transfers, + "total_distance": distance, "total_cost": cost} def parse_station_time(element, year, month, day, start=True): @@ -150,8 +145,12 @@ def parse_station_time(element, year, month, day, start=True): # Otherwise we get the only item current_time = times[-1] if start else times[0] - station_time = datetime(year, month, day, int(current_time.split(":")[0]), - int(current_time.split(":")[1]), + + hour, minutes = current_time.split(":") + + station_time = datetime(year, int(month), int(day), + int(hour), + int(minutes), tzinfo=pytz.timezone("Japan")) return station_time @@ -165,6 +164,20 @@ def parse_train_name(element): return list(selected_item.stripped_strings)[0] +def parse_track_number(element): + + # Second span in the station name column contains the track number + # if applicable (if not, it's empty) + + track_data = element.select("span")[1].text + + if not track_data: + return None + + track_number = int(GROUP_MATCHER.search(track_data)["tracknum"]) + return track_number + + def parse_hyperdia_table(soup, year, month, day): data = list() @@ -186,6 +199,12 
@@ def parse_hyperdia_table(soup, year, month, day): enddata = end_info.find_all("td")[0:3] # Ignore "add to favorities" start_station_name = list(startdata[2].stripped_strings)[0] + + # Second span in the station name column contains the track number + # if applicable (if not, it's empty) + start_track_number = parse_track_number(startdata[2]) + end_track_number = parse_track_number(enddata[2]) + start_station_time = parse_station_time(startdata[0], year, month, day, start=True) train_name = parse_train_name(traindata) @@ -203,31 +222,87 @@ def parse_hyperdia_table(soup, year, month, day): end_time=end_station_time, train_name=train_name, is_transfer=is_transfer, - duration=duration) + duration=duration, + start_track_number=start_track_number, + end_track_number=end_track_number) data.append(entry) return data -def parse_hyperdia_html(soup): +def parse_hyperdia_html(soup, *args, **kwargs): tables = soup.find_all("table", {"class": "table"}) - titles = soup.find_all("div", {"class": "title2"}) + headings = soup.find_all("div", {"class": "title_r"}) results = list() - for data in tables: - properties = {} - extracted = data.find_all( - "span", {"class": ["text_16", - "text_blue_l", "text_blue_p"]}) - parsed = list(pairwise(extracted)) - start = parsed[0] - end = parsed[-1] - properties["start"] = start[1].text - properties["starttime"] = start[0].text - properties["end"] = end[1].text - properties["endtime"] = end[0].text.strip() - results.append(properties) + for heading, table in zip(headings, tables): + + parsed_heading = parse_hyperdia_heading(heading) + parsed_table = parse_hyperdia_table(table, *args, **kwargs) + + trip = HyperdiaTrip(steps=parsed_table, **parsed_heading) + results.append(trip) + return results + + +def convert_trip_to_table(trip: HyperdiaTrip) -> pd.DataFrame: + + columns = ["From", "Departure time", "Departure track", + "To", "Arrival time", "Arrival track", "Duration", + "Train / Transfer"] + + rows = list() + + for element in trip.steps: 
+ + start_track_number = ("-" if not element.start_track_number + else f"{element.start_track_number:.0f}") + end_track_number = ("-" if not element.end_track_number + else f"{element.end_track_number:.0f}") + + row = (element.start_station, + f"{element.start_time: %H:%M}", + start_track_number, + element.end_station, + f"{element.end_time: %H:%M}", + end_track_number, + f"{element.duration:.0f} minutes", + element.train_name) + + rows.append(row) + + df = pd.DataFrame.from_records(rows, columns=columns) + df = df.fillna("-") + + return df + + +def trip_summary(trip: HyperdiaTrip) -> str: + + table = convert_trip_to_table(trip) + table = tabulate(table, tablefmt="github", headers="keys", showindex=False) + + summary = (f"Total time: {trip.total_time} minutes," + f" Total distance: {trip.total_distance}," + f" Total cost {trip.total_cost} JPY") + + return table + "\n\n" + summary + "\n\n" + + +def hyperdia_search(start_station: str, end_station: str, hour: int, + minute: int, day: int = "15", month: str = "08", + year: int = 2020, max_route: int = 5, + via: List[str] = None) -> List[str]: + + raw_result = get_hyperdia_data(start_station, end_station, + hour, minute, day, month, year, max_route, + via) + soup = BeautifulSoup(raw_result.text, "html.parser") + results = parse_hyperdia_html(soup, year=year, month=month, day=day) + + for trip in results: + print(trip_summary(trip))