Source code for dancesport_parser.o2cm.parser
import datetime
import re
from os import path
from typing import List, Optional
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from dancesport_parser.o2cm.model import Competition
from dancesport_parser.util import parseHtml
# Base URL of the o2cm results site. The relative competition links found on
# the main results page are appended to it to form each competition's URL.
RESULTS_MAIN_DOMAIN = "https://results.o2cm.com/"
[docs]
def parseMain(htmlDOM: Optional[BeautifulSoup] = None, rawHtml: Optional[str] = None) -> List[Competition]:
    """Parse o2cm main results screen, i.e. results.o2cm.com.

    The page is expected to contain a ``<table id="main_tbl">`` listing the
    competitions and an ``<input id="inyear">`` whose ``value`` is the year of
    the first competitions listed. Rows whose first cell has class ``h3`` are
    year headers and update the year applied to the rows that follow; rows
    with class ``t1n`` describe one competition each (date, name, link).

    Competition rows that have no link, or whose link does not look like an
    ``event2.asp``/``event3.asp`` results URL, are skipped because no
    competition id can be derived from them.

    :param htmlDOM: HTML contents to parse.
    :param rawHtml: If `htmlDOM` is unspecified, raw string to directly parse.
    :return: List of Competitions parsed from HTML
    :raises RuntimeError: If neither `htmlDOM` nor `rawHtml` is provided.
    :raises IndexError: If the competitions table or the year input is missing.
    :raises ValueError: If a year or a competition date cannot be parsed.
    """
    if htmlDOM is None and rawHtml is None:
        raise RuntimeError("Expected either htmlDOM or rawHtml to be provided.")
    if htmlDOM is None:
        htmlDOM = parseHtml(rawHtml)

    # Compiled once up front instead of on every row. The competition id is
    # the value of the ``event`` query parameter.
    compUrlPattern = re.compile(r'event[23]\.asp\?event=([a-zA-Z]{0,4}\d{0,5}[a-zA-Z]?)&.*')

    results: List[Competition] = []
    competitionsTable = htmlDOM.find_all("table", id="main_tbl")[0]
    yearInput = htmlDOM.find_all('input', id='inyear')[0]
    # Year that applies until the first year-header row is encountered.
    year = int(yearInput['value'])

    for row in competitionsTable.find_all("tr"):
        rowData = row.find_all("td")
        if not rowData:
            continue

        firstCell = rowData[0]
        if "class" in firstCell.attrs and firstCell["class"][0] == "h3":
            # Year header row: all following competition rows belong to it.
            year = int(firstCell.get_text().strip())
        elif "class" in row.attrs and row["class"][0] == "t1n":
            # A competition row needs a date cell, a name cell and a link.
            link = row.find("a")
            compUrl = link.get("href") if link is not None else None
            if len(rowData) < 2 or compUrl is None:
                continue
            matchCompUrl = compUrlPattern.match(compUrl)
            if matchCompUrl is None:
                # Not a results link we know how to extract an id from.
                continue

            compId = matchCompUrl.group(1).lower()
            compName = rowData[1].get_text().strip()
            # The row only carries month and day (e.g. "Jan 05"); the year
            # comes from the most recent year header or the year input.
            fullDate = f"{firstCell.get_text().strip()} {year}"
            compDate = datetime.datetime.strptime(fullDate, "%b %d %Y").date()
            # urljoin rather than os.path.join: the latter is meant for
            # filesystem paths and is platform dependent.
            compFullUrl = urljoin(RESULTS_MAIN_DOMAIN, compUrl)
            results.append(Competition(compId, compName, compDate, compFullUrl))
    return results