329 lines
12 KiB
Python
329 lines
12 KiB
Python
# encoding=utf-8
|
|
"""
|
|
Fixture scraper for Hong Kong Hockey Association website
|
|
Fetches upcoming HKFC C team fixtures from https://hockey.org.hk/MenFixture.asp
|
|
"""
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from datetime import datetime
|
|
import re
|
|
|
|
|
|
class FixtureScraper:
    """Scrape HKFC C fixtures from the Hong Kong Hockey Association website.

    The men's fixture page is fetched over HTTP and every table row is walked
    in document order.  Rows containing a ``colspan`` cell are treated as
    headers; a header whose text looks like "Sunday, 7 Sep 2025" sets the date
    that applies to the fixture rows below it.
    """

    FIXTURE_URL = "https://hockey.org.hk/MenFixture.asp"
    TARGET_TEAM = "HKFC C"

    # Date headers look like "Sunday, 7 Sep 2025".
    _DATE_HEADER_RE = re.compile(r'(\w+day),\s+(\d+)\s+(\w+)\s+(\d{4})')

    # English month names, used to resolve both abbreviated ("Sep", "Sept")
    # and full ("September") spellings without depending on the process locale.
    _MONTH_NAMES = (
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    )

    def __init__(self):
        """Create an HTTP session that sends a desktop-browser User-Agent."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def fetch_fixtures(self):
        """Fetch the fixture page and return the parsed HKFC C fixtures.

        Returns:
            list: Fixture dicts (see ``_fixture_from_cells``).  An empty list
            is returned, after printing the error, when the request fails.
        """
        try:
            response = self.session.get(self.FIXTURE_URL, timeout=10)
            response.raise_for_status()
            return self._parse_fixtures(response.text)
        except requests.RequestException as e:
            print(f"Error fetching fixtures: {e}")
            return []

    @classmethod
    def _parse_date_header(cls, text):
        """Return the date in a header like "Sunday, 7 Sep 2025", else None.

        The month may be a full English month name or any prefix of one that
        is at least three letters long; matching is case-insensitive.  None is
        returned when the text has no date or the date is not a real calendar
        date (e.g. 31 Feb).
        """
        match = cls._DATE_HEADER_RE.search(text)
        if not match:
            return None

        _day_name, day, month, year = match.groups()
        month_key = month.lower()
        if len(month_key) < 3:
            return None

        for month_number, month_name in enumerate(cls._MONTH_NAMES, start=1):
            if month_name.startswith(month_key):
                try:
                    return datetime(int(year), month_number, int(day)).date()
                except ValueError:
                    return None
        return None

    def _fixture_from_cells(self, cell_texts, match_date):
        """Build a fixture dict from one row's cell texts.

        Args:
            cell_texts (list): Stripped text of each ``<td>`` in the row.
            match_date: Date taken from the preceding date header.

        Returns:
            dict: Keys ``date``, ``time``, ``venue``, ``opponent``,
            ``is_home``, ``home_team``, ``away_team`` and ``division``; or
            None when the row has fewer than five cells or ``TARGET_TEAM``
            appears in neither team cell.
        """
        if len(cell_texts) < 5:
            return None

        # Rows with six or more cells carry a leading status (C/P) column:
        # [Status], Division, Time, Venue, Home, Away, [Umpire columns...]
        offset = 1 if len(cell_texts) >= 6 else 0
        division, start_time, venue, home_team, away_team = cell_texts[offset:offset + 5]

        is_home = self.TARGET_TEAM in home_team
        if not is_home and self.TARGET_TEAM not in away_team:
            return None

        return {
            'date': match_date,
            'time': start_time,
            'venue': venue,
            'opponent': away_team if is_home else home_team,
            'is_home': is_home,
            'home_team': home_team,
            'away_team': away_team,
            'division': division,
        }

    def _parse_fixtures(self, html_content):
        """Parse the fixture page HTML and return the HKFC C fixtures.

        Args:
            html_content (str): Raw HTML of the fixture page.

        Returns:
            list: Fixture dicts in document order.
        """
        soup = BeautifulSoup(html_content, 'lxml')
        fixtures = []
        current_date = None

        for row in soup.find_all('tr'):
            spanning_cells = row.find_all('td', colspan=True)
            if spanning_cells:
                header_text = spanning_cells[0].get_text(strip=True)
                if self._DATE_HEADER_RE.search(header_text):
                    # A date-shaped header that cannot be parsed resets the
                    # date to None so the rows below it are skipped instead
                    # of being filed under the previous header's date.
                    current_date = self._parse_date_header(header_text)
                # Header rows never describe a fixture themselves.
                continue

            if current_date is None:
                continue

            cell_texts = [cell.get_text(strip=True) for cell in row.find_all('td')]
            fixture = self._fixture_from_cells(cell_texts, current_date)
            if fixture is not None:
                fixtures.append(fixture)

        return fixtures

    @staticmethod
    def _select_upcoming(fixtures, today):
        """Return the fixtures dated on or after ``today``, earliest first."""
        upcoming = [fixture for fixture in fixtures if fixture['date'] >= today]
        upcoming.sort(key=lambda fixture: fixture['date'])
        return upcoming

    def get_next_fixture(self):
        """Get the next upcoming HKFC C fixture.

        Returns:
            dict: The earliest fixture dated today (local system date) or
            later, or None when there is none or the fetch failed.
        """
        upcoming = self._select_upcoming(self.fetch_fixtures(), datetime.now().date())
        return upcoming[0] if upcoming else None

    def get_all_future_fixtures(self, limit=10):
        """Get future HKFC C fixtures, earliest first, optionally limited.

        Args:
            limit (int): Maximum number of fixtures to return.  A falsy value
                (``None`` or ``0``) returns every future fixture.

        Returns:
            list: Fixture dicts; empty when there are none or the fetch failed.
        """
        upcoming = self._select_upcoming(self.fetch_fixtures(), datetime.now().date())
        return upcoming[:limit] if limit else upcoming
|
|
|
|
|
|
def get_next_hkfc_c_fixture():
    """Return the next HKFC C fixture using a freshly created scraper.

    Shorthand for building a ``FixtureScraper`` and calling its
    ``get_next_fixture`` method; the result of that call is returned as-is.
    """
    return FixtureScraper().get_next_fixture()
|
|
|
|
|
|
def get_opponent_club_name(opponent_team):
    """Extract the club name from an opponent team name.

    A trailing single team letter (A-J) preceded by whitespace is removed,
    e.g. 'KCC B' -> 'KCC' and 'Valley A' -> 'Valley'.  Names without such a
    suffix are returned unchanged apart from surrounding whitespace.

    Args:
        opponent_team (str): The opponent team name.

    Returns:
        str: The club name, or None when the input is empty or blank.
    """
    if not opponent_team:
        return None

    # Strip first so trailing whitespace cannot hide the letter suffix from
    # the end-anchored pattern.
    club_name = re.sub(r'\s+[A-J]$', '', opponent_team.strip())

    return club_name or None
|
|
|
|
|
|
def match_opponent_to_club(opponent_team, clubs_database=None):
    """
    Match an opponent team name to a club in the database

    Candidate club names are derived from the team name (team letter removed,
    first word, first two words).  An exact case-insensitive match on any
    candidate wins; otherwise the first club that contains, or is contained
    in, a candidate is returned; otherwise the first candidate is returned as
    a low-confidence guess.

    Args:
        opponent_team (str): The opponent team name (e.g., "KCC B", "Valley A")
        clubs_database (list): List of club names, if None will fetch from DB

    Returns:
        dict: ``club_name``, ``match_type`` ('exact', 'partial' or 'guess')
        and ``confidence`` ('high', 'medium' or 'low').  None when the team
        name is empty or blank, or when the club list has to be fetched and
        the database modules cannot be imported.
    """
    if not opponent_team:
        return None

    team_name = opponent_team.strip()
    if not team_name:
        return None

    # Get clubs from database if not provided.  The database modules are only
    # needed on this path, so a caller-supplied list works without them.
    if clubs_database is None:
        # Import here to avoid circular imports
        try:
            from db_config import sql_read
            from sqlalchemy import text
        except ImportError:
            return None

        try:
            clubs_result = sql_read(text("SELECT hockey_club FROM clubs ORDER BY hockey_club"))
            clubs_database = [club['hockey_club'] for club in clubs_result] if clubs_result else []
        except Exception:
            # Best effort: fall through to the low-confidence guess.
            clubs_database = []

    # Candidate 1: team name without its trailing team letter (A-J).
    candidates = [re.sub(r'\s+[A-J]$', '', team_name).strip()]

    # Candidates 2 and 3: first word, then first two words.
    words = team_name.split()
    if len(words) > 1:
        candidates.append(words[0])
        if len(words) > 2:
            candidates.append(' '.join(words[:2]))

    # Drop blanks and duplicates while keeping priority order.
    candidates = list(dict.fromkeys(name for name in candidates if name))

    # Blank or missing club entries are skipped: an empty string is a
    # substring of everything and would otherwise "match" any opponent.
    clubs = [
        (club, club.lower())
        for club in clubs_database
        if isinstance(club, str) and club.strip()
    ]
    needles = [name.lower() for name in candidates]

    # Exact matches across every candidate take priority over partial ones.
    for needle in needles:
        for club, club_lower in clubs:
            if club_lower == needle:
                return {
                    'club_name': club,
                    'match_type': 'exact',
                    'confidence': 'high'
                }

    # Partial match (either name contains the other)
    for needle in needles:
        for club, club_lower in clubs:
            if needle in club_lower or club_lower in needle:
                return {
                    'club_name': club,
                    'match_type': 'partial',
                    'confidence': 'medium'
                }

    # If no match found, return the best guess
    best_guess = candidates[0] if candidates else team_name
    return {
        'club_name': best_guess,
        'match_type': 'guess',
        'confidence': 'low'
    }
|
|
|
|
|
|
def get_opponent_club_info(opponent_team):
    """
    Look up the database record for the club behind an opponent team name

    The team name is first resolved to a club name with
    ``match_opponent_to_club``; that club name is then looked up in the
    ``clubs`` table.

    Args:
        opponent_team (str): The opponent team name

    Returns:
        dict: ``id``, ``club_name``, ``logo_url`` and ``match_result`` when the
        club row exists; ``club_name``, ``logo_url`` (None) and
        ``match_result`` when it does not.  None when the team name is empty,
        the database modules cannot be imported, no match result is produced,
        or the lookup raises.
    """
    if not opponent_team:
        return None

    # Imported lazily; without the database layer there is nothing to look up.
    try:
        from db_config import sql_read
        from sqlalchemy import text
    except ImportError:
        return None

    # Resolve the team name to a club name before touching the clubs table.
    match_result = match_opponent_to_club(opponent_team)
    if not match_result:
        return None

    resolved_name = match_result['club_name']

    try:
        query = text("SELECT id, hockey_club, logo_url FROM clubs WHERE hockey_club = :club_name")
        rows = sql_read(query, {'club_name': resolved_name})

        if not rows:
            # Club not found in database, return match result only
            return {
                'club_name': resolved_name,
                'logo_url': None,
                'match_result': match_result
            }

        record = rows[0]
        return {
            'id': record['id'],
            'club_name': record['hockey_club'],
            'logo_url': record['logo_url'],
            'match_result': match_result
        }
    except Exception as e:
        print(f"Error getting club info: {e}")
        return None
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: hits the live fixture page and prints what was found.
    separator = "=" * 60

    print("Testing Hong Kong Hockey Fixture Scraper...")
    print(separator)

    scraper = FixtureScraper()

    # Part 1: the single next fixture, printed in detail.
    print("\nFetching next HKFC C fixture...")
    next_fixture = scraper.get_next_fixture()

    if not next_fixture:
        print("\nNo upcoming fixtures found.")
    else:
        print(f"\nNext HKFC C Match:")
        print(f" Date: {next_fixture['date'].strftime('%A, %d %B %Y')}")
        print(f" Time: {next_fixture['time']}")
        print(f" Venue: {next_fixture['venue']}")
        print(f" Opponent: {next_fixture['opponent']}")
        print(f" Home/Away: {'Home' if next_fixture['is_home'] else 'Away'}")
        print(f" Division: {next_fixture['division']}")

        club_name = get_opponent_club_name(next_fixture['opponent'])
        print(f" Opponent Club: {club_name}")

    # Part 2: a short numbered list of the next few fixtures.
    print("\n" + separator)
    print("\nFetching next 5 HKFC C fixtures...")
    future_fixtures = scraper.get_all_future_fixtures(limit=5)

    if not future_fixtures:
        print("\nNo upcoming fixtures found.")
    else:
        for i, fixture in enumerate(future_fixtures, 1):
            print(f"\n{i}. {fixture['date'].strftime('%d %b %Y')} vs {fixture['opponent']} ({fixture['venue']})")
|
|
|