# encoding=utf-8
"""
Fixture scraper for the Hong Kong Hockey Association website.

Fetches upcoming HKFC C team fixtures from https://hockey.org.hk/MenFixture.asp
and provides helpers that map an opponent team name to a club record.
"""

import re
from datetime import date, datetime
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup

# Matches date headers such as "Sunday, 7 Sep 2025"; groups are day, month, year.
_DATE_HEADER_RE = re.compile(r'\w+day,\s+(\d+)\s+(\w+)\s+(\d{4})')

# Month may be abbreviated ("Sep") or spelled out ("September").
_DATE_FORMATS = ("%d %b %Y", "%d %B %Y")

# Trailing team letter stripped by get_opponent_club_name (e.g. "KCC B" -> "KCC").
_TEAM_LETTER_RE = re.compile(r'\s+[A-H]$')

# Wider suffix set that match_opponent_to_club additionally tries.
_TEAM_SUFFIXES = (' A', ' B', ' C', ' D', ' E', ' F', ' G', ' H', ' I', ' J')


class FixtureScraper:
    """Scrapes fixture data from the Hong Kong Hockey Association website."""

    FIXTURE_URL = "https://hockey.org.hk/MenFixture.asp"
    TARGET_TEAM = "HKFC C"

    def __init__(self):
        """Create an HTTP session that sends a browser-like User-Agent."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def fetch_fixtures(self) -> List[Dict[str, Any]]:
        """Fetch and parse fixtures from the website.

        Returns:
            list: Fixture dicts for TARGET_TEAM (see ``_parse_fixtures``), or
            an empty list when the HTTP request fails.
        """
        try:
            response = self.session.get(self.FIXTURE_URL, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching fixtures: {e}")
            return []
        return self._parse_fixtures(response.text)

    @staticmethod
    def _parse_header_date(day: str, month: str, year: str) -> Optional[date]:
        """Return the date for a header's day/month/year tokens, or None."""
        date_str = f"{day} {month} {year}"
        for fmt in _DATE_FORMATS:
            try:
                return datetime.strptime(date_str, fmt).date()
            except ValueError:
                continue
        return None

    def _parse_fixtures(self, html_content: str) -> List[Dict[str, Any]]:
        """Parse HTML content and extract fixture information.

        Rows containing a ``td`` with a ``colspan`` attribute are treated as
        headers; when such a header holds a date like "Sunday, 7 Sep 2025" it
        becomes the date of the fixture rows that follow.

        Args:
            html_content (str): Raw HTML of the fixtures page.

        Returns:
            list: One dict per match involving TARGET_TEAM, with keys
            ``date``, ``time``, ``venue``, ``opponent``, ``is_home``,
            ``home_team``, ``away_team`` and ``division``.
        """
        soup = BeautifulSoup(html_content, 'lxml')
        fixtures = []
        current_date = None

        for row in soup.find_all('tr'):
            header_cells = row.find_all('td', colspan=True)
            if header_cells:
                header_text = header_cells[0].get_text(strip=True)
                header_match = _DATE_HEADER_RE.search(header_text)
                if header_match:
                    # An unparseable date header resets the current date so
                    # the fixtures below it are skipped instead of being
                    # attributed to the previous match day.
                    current_date = self._parse_header_date(*header_match.groups())
                continue

            cells = row.find_all('td')
            if current_date is None or len(cells) < 5:
                continue

            # Column order: [Status], Division, Time, Venue, Home, Away, ...
            # Rows with 6+ cells are taken to start with a status column (C/P).
            # NOTE(review): a layout without status but with extra trailing
            # columns would be mis-read here -- confirm against the live page.
            first = 1 if len(cells) >= 6 else 0
            division, match_time, venue, home_team, away_team = (
                cell.get_text(strip=True) for cell in cells[first:first + 5]
            )

            is_home = self.TARGET_TEAM in home_team
            if not is_home and self.TARGET_TEAM not in away_team:
                continue

            fixtures.append({
                'date': current_date,
                'time': match_time,
                'venue': venue,
                'opponent': away_team if is_home else home_team,
                'is_home': is_home,
                'home_team': home_team,
                'away_team': away_team,
                'division': division,
            })

        return fixtures

    def get_next_fixture(self) -> Optional[Dict[str, Any]]:
        """Get the next upcoming HKFC C fixture.

        Returns:
            dict: The earliest fixture dated today or later, or None when
            there is none (including when fetching fails).
        """
        upcoming = self.get_all_future_fixtures(limit=1)
        return upcoming[0] if upcoming else None

    def get_all_future_fixtures(self, limit=10) -> List[Dict[str, Any]]:
        """Get all future HKFC C fixtures, optionally limited.

        Args:
            limit (int): Maximum number of fixtures to return; a falsy value
                (``0`` or ``None``) returns all of them.

        Returns:
            list: Fixtures dated today or later, sorted by date.
        """
        today = datetime.now().date()
        upcoming = sorted(
            (f for f in self.fetch_fixtures() if f['date'] >= today),
            key=lambda f: f['date'],
        )
        return upcoming[:limit] if limit else upcoming


def get_next_hkfc_c_fixture():
    """Convenience function to get the next HKFC C fixture."""
    return FixtureScraper().get_next_fixture()


def get_opponent_club_name(opponent_team):
    """Extract club name from opponent team name (e.g., 'KCC B' -> 'KCC').

    Args:
        opponent_team (str): The opponent team name.

    Returns:
        str: The name with a trailing team letter (A-H) removed, or None for
        empty input.
    """
    if not opponent_team:
        return None

    # Strip first so trailing whitespace cannot defeat the "$"-anchored pattern.
    return _TEAM_LETTER_RE.sub('', opponent_team.strip()).strip()


def _load_club_names():
    """Return club names from the database, or None if DB modules are missing."""
    # Import here to avoid circular imports
    try:
        from db_config import sql_read
        from sqlalchemy import text
    except ImportError:
        return None

    try:
        rows = sql_read(text("SELECT hockey_club FROM clubs ORDER BY hockey_club"))
        return [row['hockey_club'] for row in rows] if rows else []
    except Exception as e:
        # Best effort: without a club list the caller falls back to a guess.
        print(f"Error loading clubs: {e}")
        return []


def _candidate_club_names(team):
    """Return distinct, non-empty club-name candidates for a team name, in priority order."""
    # Method 1: remove a trailing team letter (A-H)
    candidates = [_TEAM_LETTER_RE.sub('', team).strip()]

    # Method 2: remove common suffixes (also covers I and J)
    candidates.extend(
        team[:-len(suffix)].strip()
        for suffix in _TEAM_SUFFIXES
        if team.endswith(suffix)
    )

    # Method 3: first word, then first two words
    words = team.split()
    if len(words) > 1:
        candidates.append(words[0])
        if len(words) > 2:
            candidates.append(' '.join(words[:2]))

    # Drop duplicates (keeping order) and empty strings; an empty candidate
    # would be a substring of every club name and match anything.
    return [name for name in dict.fromkeys(candidates) if name]


def match_opponent_to_club(opponent_team, clubs_database=None):
    """
    Match an opponent team name to a club in the database

    Args:
        opponent_team (str): The opponent team name (e.g., "KCC B", "Valley A")
        clubs_database (list): List of club names; if None they are fetched
            from the database

    Returns:
        dict: ``club_name``, ``match_type`` ('exact', 'partial' or 'guess')
        and ``confidence`` ('high', 'medium' or 'low'). None if the team name
        is empty, or if the clubs must be fetched but the database modules
        cannot be imported.
    """
    team = opponent_team.strip() if opponent_team else ""
    if not team:
        return None

    # The database is only needed when the caller did not supply the clubs.
    if clubs_database is None:
        clubs_database = _load_club_names()
        if clubs_database is None:
            return None

    # Skip empty names: "" is a substring of everything.
    clubs = [(club, club.lower()) for club in clubs_database if club]
    candidates = _candidate_club_names(team)

    for candidate in candidates:
        needle = candidate.lower()

        # Exact match
        for club, club_lower in clubs:
            if club_lower == needle:
                return {
                    'club_name': club,
                    'match_type': 'exact',
                    'confidence': 'high'
                }

        # Partial match (either name contains the other)
        for club, club_lower in clubs:
            if needle in club_lower or club_lower in needle:
                return {
                    'club_name': club,
                    'match_type': 'partial',
                    'confidence': 'medium'
                }

    # If no match found, return the best guess
    best_guess = candidates[0] if candidates else opponent_team
    return {
        'club_name': best_guess,
        'match_type': 'guess',
        'confidence': 'low'
    }


def get_opponent_club_info(opponent_team):
    """
    Get full club information for an opponent team

    Args:
        opponent_team (str): The opponent team name

    Returns:
        dict: ``club_name``, ``logo_url`` and ``match_result``, plus ``id``
        when the club row exists in the database (``logo_url`` is None when
        it does not). None if the team name is empty, the database modules
        cannot be imported, no match result is produced, or the query fails.
    """
    if not opponent_team:
        return None

    try:
        from db_config import sql_read
        from sqlalchemy import text
    except ImportError:
        return None

    # First, try to match the opponent to a club
    match_result = match_opponent_to_club(opponent_team)
    if not match_result:
        return None

    club_name = match_result['club_name']

    # Get full club information from database
    try:
        sql = text("SELECT id, hockey_club, logo_url FROM clubs WHERE hockey_club = :club_name")
        club_info = sql_read(sql, {'club_name': club_name})

        if club_info:
            club_data = club_info[0]
            return {
                'id': club_data['id'],
                'club_name': club_data['hockey_club'],
                'logo_url': club_data['logo_url'],
                'match_result': match_result
            }

        # Club not found in database, return match result only
        return {
            'club_name': club_name,
            'logo_url': None,
            'match_result': match_result
        }
    except Exception as e:
        print(f"Error getting club info: {e}")
        return None


def main():
    """Manually exercise the scraper against the live website."""
    print("Testing Hong Kong Hockey Fixture Scraper...")
    print("=" * 60)

    scraper = FixtureScraper()

    print("\nFetching next HKFC C fixture...")
    next_fixture = scraper.get_next_fixture()

    if next_fixture:
        print("\nNext HKFC C Match:")
        print(f" Date: {next_fixture['date'].strftime('%A, %d %B %Y')}")
        print(f" Time: {next_fixture['time']}")
        print(f" Venue: {next_fixture['venue']}")
        print(f" Opponent: {next_fixture['opponent']}")
        print(f" Home/Away: {'Home' if next_fixture['is_home'] else 'Away'}")
        print(f" Division: {next_fixture['division']}")

        club_name = get_opponent_club_name(next_fixture['opponent'])
        print(f" Opponent Club: {club_name}")
    else:
        print("\nNo upcoming fixtures found.")

    print("\n" + "=" * 60)
    print("\nFetching next 5 HKFC C fixtures...")
    future_fixtures = scraper.get_all_future_fixtures(limit=5)

    if future_fixtures:
        for i, fixture in enumerate(future_fixtures, 1):
            print(f"\n{i}. {fixture['date'].strftime('%d %b %Y')} vs {fixture['opponent']} ({fixture['venue']})")
    else:
        print("\nNo upcoming fixtures found.")


if __name__ == "__main__":
    main()