Rough hyperdia scraper for my own use
This commit is contained in:
		
					parent
					
						
							
								e1bd6c145a
							
						
					
				
			
			
				commit
				
					
						464528f698
					
				
			
		
					 1 changed files with 233 additions and 0 deletions
				
			
		
							
								
								
									
										233
									
								
								hyperdia.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										233
									
								
								hyperdia.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,233 @@
 | 
				
			||||||
 | 
					#!/usr/bin/env python3
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from dataclasses import dataclass
 | 
				
			||||||
 | 
					from datetime import datetime
 | 
				
			||||||
 | 
					from itertools import zip_longest
 | 
				
			||||||
 | 
					from typing import NamedTuple, Optional
 | 
				
			||||||
 | 
					from urllib.parse import urlparse, urlencode, urlunparse
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from bs4 import BeautifulSoup
 | 
				
			||||||
 | 
					import more_itertools as mlt
 | 
				
			||||||
 | 
					import pytz
 | 
				
			||||||
 | 
					import requests
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
# Endpoint that performs the actual route search (receives the POSTed form).
HYPERDIA_CGI = "http://www.hyperdia.com/en/cgi/search/en/hyperdia2.cgi"
# Public search form page; used here only to build a plausible Referer URL.
HYPERDIA_SEARCH = "http://www.hyperdia.com/en/cgi/en/search.html"

# Template of the form fields hyperdia2.cgi expects. get_hyperdia_data()
# copies this dict and fills in the journey-specific fields (stations,
# date, time, via stations). The remaining values mirror what the site's
# own search form submits; their meanings are inferred from the field
# names only — unverified here.
HYPERDIA_PARAMS = {
    "dep_node": "",            # departure station (filled in per query)
    "arv_node": "",            # arrival station (filled in per query)
    "year": "",
    "month": "",
    "day": "",
    "hour": "",
    "minute": "",
    "search_type": "0",
    "transtime": "undefined",
    "max_route": "5",          # number of route alternatives requested
    "sort": "0",
    "faretype": "0",
    "ship": "off",
    "lmlimit": None,
    "sum_target": "7",
    "facility": "reserved",
    "search_target": "route",
    "sprexprs": "on",
    "sprnozomi": "on",
    "slpexprs": "on",
    "jr": "on",
    "privately": "on",
    "search_way": ""
}

# Browser-like headers (Firefox UA, form content type, matching Host/Origin)
# sent with the POST so the request resembles a normal form submission.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0',
    'Content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Host': 'www.hyperdia.com',
    'Origin': 'http://www.hyperdia.com'
}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
@dataclass
class HyperdiaStep:
    """One leg of a Hyperdia route: a single train ride or a transfer walk."""

    # Station names exactly as displayed by Hyperdia.
    start_station: str
    end_station: str
    # Departure and arrival timestamps for this leg (Japan time).
    start_time: datetime
    end_time: datetime
    # NOTE(review): annotated as str, but parse_hyperdia_table assigns a
    # float (minutes) here — confirm the intended type.
    duration: Optional[str] = None
    train_name: Optional[str] = None
    # True when the "train" is actually a walking transfer ("Walk").
    is_transfer: Optional[bool] = False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def pairwise(iterable):
 | 
				
			||||||
 | 
					    "s -> (s0, s1), (s2, s3), (s4, s5), ..."
 | 
				
			||||||
 | 
					    a = iter(iterable)
 | 
				
			||||||
 | 
					    return zip(a, a)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def grouped(iterable, n):
 | 
				
			||||||
 | 
					    """s -> (s0,s1,s2,...sn-1), (sn,sn+1,sn+2,...s2n-1),
 | 
				
			||||||
 | 
					    (s2n,s2n+1,s2n+2,...s3n-1), ..."""
 | 
				
			||||||
 | 
					    return zip(*[iter(iterable)]*n)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def get_hyperdia_data(start_station, end_station, hour, minute, day="15",
 | 
				
			||||||
 | 
					                      month="08", year="2020", via=None):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    session = requests.Session()
 | 
				
			||||||
 | 
					    post_params = HYPERDIA_PARAMS.copy()
 | 
				
			||||||
 | 
					    headers = HEADERS.copy()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    post_params["dep_node"] = start_station
 | 
				
			||||||
 | 
					    post_params["arv_node"] = end_station
 | 
				
			||||||
 | 
					    post_params["year"] = year
 | 
				
			||||||
 | 
					    post_params["day"] = day
 | 
				
			||||||
 | 
					    post_params["month"] = month
 | 
				
			||||||
 | 
					    post_params["hour"] = hour
 | 
				
			||||||
 | 
					    post_params["minute"] = minute
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if via is None:
 | 
				
			||||||
 | 
					        for element in ("via_node01", "via_node02", "via_node03"):
 | 
				
			||||||
 | 
					            post_params[element] = ""
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if len(via) > 3:
 | 
				
			||||||
 | 
					            raise ValueError("Only up to three through stations are allowed")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        for node, station in zip_longest(
 | 
				
			||||||
 | 
					            via,
 | 
				
			||||||
 | 
					            ("via_node01", "via_node02", "via_node03"),
 | 
				
			||||||
 | 
					            fill_value=""):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            post_params[node] = station
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    referer = list(urlparse(HYPERDIA_SEARCH))
 | 
				
			||||||
 | 
					    referer[4] = urlencode(post_params)
 | 
				
			||||||
 | 
					    referer = urlunparse(referer)
 | 
				
			||||||
 | 
					    headers["Referer"] = referer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    session.headers.update(headers)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    result = session.post(HYPERDIA_CGI, data=post_params)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return result
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#TODO: Adjust this, use the Firefox inspector
 | 
				
			||||||
 | 
					# For now, keep this in mind:
 | 
				
			||||||
 | 
					# Odd rows per result: stations
 | 
				
			||||||
 | 
					# Even rows: Train names, transfers...
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def parse_hyperdia_heading(soup, fare_number=1):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    data = dict()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    mapping = {1: "total_time", 2: "transfer_num", 3: "total_distance"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    counter = 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for element in soup.find_all("span", class_="text_blue"):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if counter > 3:
 | 
				
			||||||
 | 
					            break
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        data[mapping[counter]] = element.text
 | 
				
			||||||
 | 
					        counter += 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    fare = soup.find("span", {"class": "text_blue",
 | 
				
			||||||
 | 
					                              "id": f"fare_total{fare_number}"})
 | 
				
			||||||
 | 
					    fare = int(fare.text.replace(",", ""))
 | 
				
			||||||
 | 
					    data["total_fare"] = fare
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def parse_station_time(element, year, month, day, start=True):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    times = list(element.stripped_strings)
 | 
				
			||||||
 | 
					    # The first element if it's a transfer (arrival time; we ignore walking)
 | 
				
			||||||
 | 
					    # Otherwise we get the only item
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    current_time = times[-1] if start else times[0]
 | 
				
			||||||
 | 
					    station_time = datetime(year, month, day, int(current_time.split(":")[0]),
 | 
				
			||||||
 | 
					                            int(current_time.split(":")[1]),
 | 
				
			||||||
 | 
					                            tzinfo=pytz.timezone("Japan"))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return station_time
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def parse_train_name(element):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Trains are in a list with only one element, inside a span
 | 
				
			||||||
 | 
					    selected_item = element.select("td > ul > li > span")[0]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return list(selected_item.stripped_strings)[0]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def parse_hyperdia_table(soup, year, month, day):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    data = list()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    previous = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Skip the heading and the row immediately afterwards (commuter pass)
 | 
				
			||||||
 | 
					    for group in mlt.windowed(soup.find_all("tr")[2:], n=3, step=2):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Groups of 3 elements:
 | 
				
			||||||
 | 
					        # First row: start station (time in first column, station in column 3)
 | 
				
			||||||
 | 
					        # Second row: train information (duration in column 1, name in column 3)
 | 
				
			||||||
 | 
					        # Third row: arrival time(s) (same format as first row)
 | 
				
			||||||
 | 
					        # Times might be repeated more than once if it's a transfer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        start_info, journey_info, end_info = group
 | 
				
			||||||
 | 
					        startdata = start_info.find_all("td")[0:3]
 | 
				
			||||||
 | 
					        traindata = journey_info.find_all("td")[2]
 | 
				
			||||||
 | 
					        enddata = end_info.find_all("td")[0:3]
 | 
				
			||||||
 | 
					        # Ignore "add to favorities"
 | 
				
			||||||
 | 
					        start_station_name = list(startdata[2].stripped_strings)[0]
 | 
				
			||||||
 | 
					        start_station_time = parse_station_time(startdata[0], year, month, day,
 | 
				
			||||||
 | 
					                                                start=True)
 | 
				
			||||||
 | 
					        train_name = parse_train_name(traindata)
 | 
				
			||||||
 | 
					        end_station_name = list(enddata[2].stripped_strings)[0]
 | 
				
			||||||
 | 
					        end_station_time = parse_station_time(enddata[0], year, month, day,
 | 
				
			||||||
 | 
					                                              start=False)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        is_transfer = True if train_name == "Walk" else False
 | 
				
			||||||
 | 
					        duration = ((end_station_time - start_station_time).seconds / 60) % 60
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        entry = HyperdiaStep(
 | 
				
			||||||
 | 
					            start_station=start_station_name,
 | 
				
			||||||
 | 
					            end_station=end_station_name,
 | 
				
			||||||
 | 
					            start_time=start_station_time,
 | 
				
			||||||
 | 
					            end_time=end_station_time,
 | 
				
			||||||
 | 
					            train_name=train_name,
 | 
				
			||||||
 | 
					            is_transfer=is_transfer,
 | 
				
			||||||
 | 
					            duration=duration)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        data.append(entry)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def parse_hyperdia_html(soup):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    tables = soup.find_all("table", {"class": "table"})
 | 
				
			||||||
 | 
					    titles = soup.find_all("div", {"class": "title2"})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    results = list()
 | 
				
			||||||
 | 
					    for data in tables:
 | 
				
			||||||
 | 
					        properties = {}
 | 
				
			||||||
 | 
					        extracted = data.find_all(
 | 
				
			||||||
 | 
					            "span", {"class": ["text_16",
 | 
				
			||||||
 | 
					                               "text_blue_l", "text_blue_p"]})
 | 
				
			||||||
 | 
					        parsed = list(pairwise(extracted))
 | 
				
			||||||
 | 
					        start = parsed[0]
 | 
				
			||||||
 | 
					        end = parsed[-1]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        properties["start"] = start[1].text
 | 
				
			||||||
 | 
					        properties["starttime"] = start[0].text
 | 
				
			||||||
 | 
					        properties["end"] = end[1].text
 | 
				
			||||||
 | 
					        properties["endtime"] = end[0].text.strip()
 | 
				
			||||||
 | 
					        results.append(properties)
 | 
				
			||||||
 | 
					    return results
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue