# Source: gcp-hockey-results/motm_app/fixture_scraper.py (Python, 329 lines, 12 KiB)

# encoding=utf-8
"""
Fixture scraper for Hong Kong Hockey Association website
Fetches upcoming HKFC C team fixtures from https://hockey.org.hk/MenFixture.asp
"""
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
class FixtureScraper:
    """Scrapes fixture data from Hong Kong Hockey Association website.

    The scraper downloads ``FIXTURE_URL``, walks every table row of the page
    and keeps the rows in which ``TARGET_TEAM`` appears as the home or away
    side. Each fixture is returned as a plain dict with the keys ``date``,
    ``time``, ``venue``, ``opponent``, ``is_home``, ``home_team``,
    ``away_team`` and ``division``.
    """

    FIXTURE_URL = "https://hockey.org.hk/MenFixture.asp"
    TARGET_TEAM = "HKFC C"

    # Date header rows contain text like "Sunday, 7 Sep 2025".
    _DATE_HEADER_RE = re.compile(r'(\w+day),\s+(\d+)\s+(\w+)\s+(\d{4})')

    def __init__(self):
        """Create an HTTP session that sends a browser-like User-Agent."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def fetch_fixtures(self):
        """Fetch and parse fixtures from the website.

        Returns:
            list: Fixture dicts for ``TARGET_TEAM``; an empty list when the
            HTTP request fails (the error is printed, not raised).
        """
        try:
            response = self.session.get(self.FIXTURE_URL, timeout=10)
            response.raise_for_status()
            return self._parse_fixtures(response.text)
        except requests.RequestException as e:
            print(f"Error fetching fixtures: {e}")
            return []

    @classmethod
    def _parse_header_date(cls, header_text):
        """Return the date found in a header such as "Sunday, 7 Sep 2025".

        Returns None when the text holds no date pattern or the matched
        day/month/year does not form a valid calendar date.
        """
        found = cls._DATE_HEADER_RE.search(header_text)
        if not found:
            return None
        _day_name, day, month, year = found.groups()
        try:
            # Truncating the month to three letters lets "%b" accept both
            # abbreviated ("Sep") and longer ("Sept", "September") spellings.
            return datetime.strptime(f"{day} {month[:3]} {year}", "%d %b %Y").date()
        except ValueError:
            return None

    def _fixture_from_cells(self, cell_texts, match_date):
        """Build a fixture dict from one row's cell texts.

        Args:
            cell_texts (list): Stripped text of every ``td`` in the row.
            match_date: Date taken from the most recent date header row.

        Returns:
            dict: The fixture, or None when the row has fewer than five cells
            or ``TARGET_TEAM`` is not one of the two sides.
        """
        if len(cell_texts) < 5:
            return None
        # Rows with six or more cells are treated as starting with a status
        # (C/P) cell, followed by Division, Time, Venue, Home, Away; rows with
        # exactly five cells are treated as having no status cell.
        offset = 1 if len(cell_texts) >= 6 else 0
        division, start_time, venue, home_team, away_team = cell_texts[offset:offset + 5]

        is_home = self.TARGET_TEAM in home_team
        if not is_home and self.TARGET_TEAM not in away_team:
            return None
        return {
            'date': match_date,
            'time': start_time,
            'venue': venue,
            'opponent': away_team if is_home else home_team,
            'is_home': is_home,
            'home_team': home_team,
            'away_team': away_team,
            'division': division
        }

    def _parse_fixtures(self, html_content):
        """Parse HTML content and extract fixture information.

        Rows holding a ``td`` with a ``colspan`` attribute are treated as
        headers: when one contains a date it becomes the date of all fixture
        rows that follow, until the next date header.
        """
        soup = BeautifulSoup(html_content, 'lxml')
        fixtures = []
        current_date = None
        for row in soup.find_all('tr'):
            header_cells = row.find_all('td', colspan=True)
            if header_cells:
                header_text = header_cells[0].get_text(strip=True)
                if self._DATE_HEADER_RE.search(header_text):
                    # A date header that cannot be parsed resets the current
                    # date, so the rows below it are skipped instead of being
                    # silently attributed to the previous match day.
                    current_date = self._parse_header_date(header_text)
                continue

            if current_date is None:
                continue
            cell_texts = [cell.get_text(strip=True) for cell in row.find_all('td')]
            fixture = self._fixture_from_cells(cell_texts, current_date)
            if fixture is not None:
                fixtures.append(fixture)
        return fixtures

    def _upcoming_fixtures(self):
        """Return fixtures dated today or later, earliest first."""
        today = datetime.now().date()
        upcoming = [f for f in self.fetch_fixtures() if f['date'] >= today]
        # Stable sort: fixtures on the same day keep their page order.
        upcoming.sort(key=lambda f: f['date'])
        return upcoming

    def get_next_fixture(self):
        """Get the next upcoming HKFC C fixture.

        Returns:
            dict: The earliest fixture dated today or later, or None when
            there is none (including when fetching failed).
        """
        upcoming = self._upcoming_fixtures()
        return upcoming[0] if upcoming else None

    def get_all_future_fixtures(self, limit=10):
        """Get all future HKFC C fixtures, optionally limited.

        Args:
            limit (int): Maximum number of fixtures to return. ``None``, ``0``
                or a negative value returns every future fixture.

        Returns:
            list: Fixture dicts dated today or later, earliest first.
        """
        upcoming = self._upcoming_fixtures()
        if limit and limit > 0:
            return upcoming[:limit]
        return upcoming
def get_next_hkfc_c_fixture():
    """Return the next HKFC C fixture using a freshly created scraper.

    Shortcut for callers that do not need to keep a ``FixtureScraper``
    around; the result is whatever ``FixtureScraper.get_next_fixture`` gives.
    """
    return FixtureScraper().get_next_fixture()
def get_opponent_club_name(opponent_team):
    """Extract club name from opponent team name (e.g., 'KCC B' -> 'KCC').

    Args:
        opponent_team (str): Team name, typically "Club Letter" such as
            "KCC B" or "Valley A".

    Returns:
        str: The name with a trailing team letter (A-H) removed, or None when
        the input is empty, None or only whitespace.
    """
    if not opponent_team:
        return None
    # Strip first: the suffix pattern is anchored at the end of the string, so
    # trailing whitespace would otherwise stop the team letter being removed.
    team_name = opponent_team.strip()
    if not team_name:
        return None
    return re.sub(r'\s+[A-H]$', '', team_name)
def match_opponent_to_club(opponent_team, clubs_database=None):
    """
    Match an opponent team name to a club in the database

    Candidate club names are derived from the team name (team letter removed,
    first word, first two words) and compared case-insensitively with the
    known clubs. Candidates are tried in order; for each one an exact match is
    preferred over a partial (substring) match.

    Args:
        opponent_team (str): The opponent team name (e.g., "KCC B", "Valley A")
        clubs_database (list): List of club names; if None they are read from
            the ``clubs`` table via ``db_config.sql_read``

    Returns:
        dict: ``club_name``, ``match_type`` ('exact', 'partial' or 'guess') and
        ``confidence`` ('high', 'medium' or 'low'). When no club matches, the
        first candidate name is returned as a low-confidence guess.
        None: when the team name is empty/blank, or when clubs must be read
        from the database but the database modules cannot be imported.
    """
    if not opponent_team:
        return None
    team_name = opponent_team.strip()
    if not team_name:
        return None

    # Get clubs from database if not provided
    if clubs_database is None:
        # Import here to avoid circular imports. Only needed for the lookup,
        # so callers that pass their own club list work without these modules.
        try:
            from db_config import sql_read
            from sqlalchemy import text
        except ImportError:
            return None
        try:
            clubs_result = sql_read(text("SELECT hockey_club FROM clubs ORDER BY hockey_club"))
            clubs_database = [club['hockey_club'] for club in clubs_result] if clubs_result else []
        except Exception as exc:
            # Best effort: fall through to the low-confidence guess below.
            print(f"Error fetching clubs: {exc}")
            clubs_database = []

    # Method 1: Remove team letters (A-H) after any amount of whitespace
    candidates = [re.sub(r'\s+[A-H]$', '', team_name)]
    # Method 2: Remove a single-space + letter suffix (also covers I and J)
    letter_suffixes = tuple(f" {letter}" for letter in "ABCDEFGHIJ")
    if team_name.endswith(letter_suffixes):
        candidates.append(team_name[:-2].strip())
    # Method 3: Try the first word, then the first two words
    words = team_name.split()
    if len(words) > 1:
        candidates.append(words[0])
        if len(words) > 2:
            candidates.append(' '.join(words[:2]))
    # Drop duplicates (keeping order) and empty names; an empty candidate
    # would be a substring of every club name.
    candidates = [name for name in dict.fromkeys(candidates) if name]

    # Lower-case each usable club name once; skip None/empty/non-string rows.
    clubs = [
        (club, club.lower())
        for club in clubs_database
        if isinstance(club, str) and club
    ]

    for candidate in candidates:
        needle = candidate.lower()
        # Exact match
        for club, club_lower in clubs:
            if club_lower == needle:
                return {
                    'club_name': club,
                    'match_type': 'exact',
                    'confidence': 'high'
                }
        # Partial match (either name contains the other)
        for club, club_lower in clubs:
            if needle in club_lower or club_lower in needle:
                return {
                    'club_name': club,
                    'match_type': 'partial',
                    'confidence': 'medium'
                }

    # If no match found, return the best guess
    return {
        'club_name': candidates[0] if candidates else team_name,
        'match_type': 'guess',
        'confidence': 'low'
    }
def get_opponent_club_info(opponent_team):
    """
    Get full club information for an opponent team

    Args:
        opponent_team (str): The opponent team name

    Returns:
        dict: ``id``, ``club_name``, ``logo_url`` and ``match_result`` (the
        dict returned by ``match_opponent_to_club``). When the matched name
        has no row in the ``clubs`` table, ``id`` and ``logo_url`` are None.
        None: when the team name is empty, the database modules cannot be
        imported, no match result is produced, or the query raises.
    """
    if not opponent_team:
        return None

    # Import here to avoid circular imports
    try:
        from db_config import sql_read
        from sqlalchemy import text
    except ImportError:
        return None

    # First, try to match the opponent to a club
    match_result = match_opponent_to_club(opponent_team)
    if not match_result:
        return None
    club_name = match_result['club_name']

    # Get full club information from database
    try:
        sql = text("SELECT id, hockey_club, logo_url FROM clubs WHERE hockey_club = :club_name")
        club_info = sql_read(sql, {'club_name': club_name})
        if club_info:
            club_data = club_info[0]
            return {
                'id': club_data['id'],
                'club_name': club_data['hockey_club'],
                'logo_url': club_data['logo_url'],
                'match_result': match_result
            }
        # Club not found in database: return the match result with the same
        # keys as the found case so callers can read 'id' unconditionally.
        return {
            'id': None,
            'club_name': club_name,
            'logo_url': None,
            'match_result': match_result
        }
    except Exception as e:
        # Best-effort lookup: report the failure and signal "no info".
        print(f"Error getting club info: {e}")
        return None
def _run_demo():
    """Manual smoke test: print the next fixture and the next five fixtures.

    Performs live requests through ``FixtureScraper``; intended to be run
    from the command line only.
    """
    divider = "=" * 60
    print("Testing Hong Kong Hockey Fixture Scraper...")
    print(divider)
    scraper = FixtureScraper()

    print("\nFetching next HKFC C fixture...")
    upcoming = scraper.get_next_fixture()
    if not upcoming:
        print("\nNo upcoming fixtures found.")
    else:
        venue_side = 'Home' if upcoming['is_home'] else 'Away'
        print(f"\nNext HKFC C Match:")
        print(f" Date: {upcoming['date'].strftime('%A, %d %B %Y')}")
        print(f" Time: {upcoming['time']}")
        print(f" Venue: {upcoming['venue']}")
        print(f" Opponent: {upcoming['opponent']}")
        print(f" Home/Away: {venue_side}")
        print(f" Division: {upcoming['division']}")
        opponent_club = get_opponent_club_name(upcoming['opponent'])
        print(f" Opponent Club: {opponent_club}")

    print("\n" + divider)
    print("\nFetching next 5 HKFC C fixtures...")
    next_five = scraper.get_all_future_fixtures(limit=5)
    if not next_five:
        print("\nNo upcoming fixtures found.")
    else:
        for position, match in enumerate(next_five, 1):
            print(f"\n{position}. {match['date'].strftime('%d %b %Y')} vs {match['opponent']} ({match['venue']})")


if __name__ == "__main__":
    # Test the scraper
    _run_demo()