Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 173 additions & 2 deletions mlb_form_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,31 @@
logger = logging.getLogger("propiq.form")


# ── Postgres helpers (reads from live_batting_logs / live_pitching_logs) ──────
def _get_pg():
"""Return a psycopg2 connection or None."""
import os
try:
import psycopg2
db_url = os.environ.get("DATABASE_URL", "")
if not db_url:
return None
return psycopg2.connect(db_url)
except Exception:
return None
Comment on lines +45 to +55
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The _get_pg helper creates a new database connection every time it is called. Since prefetch_form_data iterates over a set of players and calls multiple DB-fetching methods per player, this implementation will open and close hundreds of connections in a single dispatch run. This is highly inefficient and risks exhausting the database connection pool or hitting server-side connection limits. Consider using a connection pool (e.g., psycopg2.pool.SimpleConnectionPool) or opening a single connection at the start of the pre-fetch loop and passing it down.


# Mapping: API stat key → (table_col_expr_for_batting, table_col_expr_for_pitching)
# None means not available in that table.
_DB_STAT_MAP = {
"hits": ("(h_1b + h_2b + h_3b + home_runs)", None),
"rbi": ("b_rbi", None),
"runs": ("b_runs", None),
"totalBases": ("(h_1b + 2*h_2b + 3*h_3b + 4*home_runs)", None),
"strikeOuts": ("b_k", "strikeouts"),
"earnedRuns": (None, "earnedruns"),
}
Comment on lines +59 to +66
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The _DB_STAT_MAP dictionary is defined but never utilized in the subsequent code. Instead, the SQL expressions for calculating stats (e.g., h_1b + h_2b + h_3b + home_runs) are hardcoded directly in the fetcher methods (lines 253, 285, etc.). This violates the DRY principle and makes the code harder to maintain if the schema or calculation logic changes.



# ── Trend significance gates (from sequencebaseball/cogs/trends.py) ───────────
try:
from fix_trend_significance import gate_form_adjustment, is_significant_trend
Expand Down Expand Up @@ -165,6 +190,143 @@
# API fetchers
# ------------------------------------------------------------------


# ------------------------------------------------------------------
# DB-primary fetchers — read from nightly game_logs_refresh tables
# ------------------------------------------------------------------

@staticmethod
def _fetch_game_log_from_db(
player_id: int, group: str, window: int
) -> list[dict] | None:
"""
Query live_batting_logs / live_pitching_logs for recent game rows.
Returns list of synthetic "split" dicts (same shape _compute_form expects)
or None if the table is empty / player not found.
"""
conn = _get_pg()
if conn is None:
return None
table = "live_batting_logs" if group == "hitting" else "live_pitching_logs"
try:
with conn, conn.cursor() as cur:
cur.execute(

Check failure on line 213 in mlb_form_layer.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

mlb_form_layer.py#L213

Avoiding SQL string concatenation: untrusted input concatenated with raw SQL query can result in SQL Injection.

Check warning on line 213 in mlb_form_layer.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

mlb_form_layer.py#L213

Expression "cur.execute(f'\n SELECT game_date,\n h_1b, h_2b, h_3b, home_runs,\n b_rbi, b_runs, b_k,\n COALESCE(strikeouts, 0) AS strikeouts,\n COALESCE(earnedruns, 0) AS earnedruns,\n COALESCE(outs, 0) AS outs_pitched\n FROM {table}\n WHERE mlbam_id = %s\n ORDER BY game_date DESC\n LIMIT %s\n ', (player_id, window)) if table == 'live_batting_logs' else cur.execute(f'\n SELECT game_date,\n 0 AS h_1b, 0 AS h_2b, 0 AS h_3b, 0 AS home_runs,\n 0 AS b_rbi, 0 AS b_runs, 0 AS b_k,\n COALESCE(strikeouts, 0) AS strikeouts,\n COALESCE(earnedruns, 0) AS earnedruns,\n COALESCE(outs, 0) AS outs_pitched\n FROM {table}\n WHERE mlbam_id = %s\n ORDER BY game_date DESC\n LIMIT %s\n ', (player_id, window))" is assigned to nothing
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Expression "cur.execute(f'\n SELECT game_date,\n h_1b, h_2b, h_3b, home_runs,\n b_rbi, b_runs, b_k,\n COALESCE(strikeouts, 0) AS strikeouts,\n COALESCE(earnedruns, 0) AS earnedruns,\n COALESCE(outs, 0) AS outs_pitched\n FROM {table}\n WHERE mlbam_id = %s\n ORDER BY game_date DESC\n LIMIT %s\n ', (player_id, window)) if table == 'live_batting_logs' else cur.execute(f'\n SELECT game_date,\n 0 AS h_1b, 0 AS h_2b, 0 AS h_3b, 0 AS home_runs,\n 0 AS b_rbi, 0 AS b_runs, 0 AS b_k,\n COALESCE(strikeouts, 0) AS strikeouts,\n COALESCE(earnedruns, 0) AS earnedruns,\n COALESCE(outs, 0) AS outs_pitched\n FROM {table}\n WHERE mlbam_id = %s\n ORDER BY game_date DESC\n LIMIT %s\n ', (player_id, window))" is assigned to nothing


An expression that is not a function call is assigned to nothing. Probably something else was intended here. We recommend to review this.

f"""

Check warning on line 214 in mlb_form_layer.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

mlb_form_layer.py#L214

Possible SQL injection vector through string-based query construction.
SELECT game_date,
h_1b, h_2b, h_3b, home_runs,
b_rbi, b_runs, b_k,
COALESCE(strikeouts, 0) AS strikeouts,
COALESCE(earnedruns, 0) AS earnedruns,
COALESCE(outs, 0) AS outs_pitched
Comment on lines +218 to +220
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1: The batting DB query references non-existent columns (strikeouts, earnedruns, outs) in live_batting_logs, so hitter lookups fail and always fall back to the live API.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At mlb_form_layer.py, line 218:

<comment>The batting DB query references non-existent columns (`strikeouts`, `earnedruns`, `outs`) in `live_batting_logs`, so hitter lookups fail and always fall back to the live API.</comment>

<file context>
@@ -165,6 +190,143 @@ def _resolve_player_id(self, player_name: str) -> int | None:
+                    SELECT game_date,
+                           h_1b, h_2b, h_3b, home_runs,
+                           b_rbi, b_runs, b_k,
+                           COALESCE(strikeouts, 0)   AS strikeouts,
+                           COALESCE(earnedruns, 0)   AS earnedruns,
+                           COALESCE(outs, 0)         AS outs_pitched
</file context>
Suggested change
COALESCE(strikeouts, 0) AS strikeouts,
COALESCE(earnedruns, 0) AS earnedruns,
COALESCE(outs, 0) AS outs_pitched
0 AS strikeouts,
0 AS earnedruns,
0 AS outs_pitched

FROM {table}
WHERE mlbam_id = %s
ORDER BY game_date DESC
LIMIT %s
""",
(player_id, window),
) if table == "live_batting_logs" else cur.execute(

Check failure on line 227 in mlb_form_layer.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

mlb_form_layer.py#L227

Avoiding SQL string concatenation: untrusted input concatenated with raw SQL query can result in SQL Injection.
f"""
SELECT game_date,
0 AS h_1b, 0 AS h_2b, 0 AS h_3b, 0 AS home_runs,
0 AS b_rbi, 0 AS b_runs, 0 AS b_k,
COALESCE(strikeouts, 0) AS strikeouts,
COALESCE(earnedruns, 0) AS earnedruns,
COALESCE(outs, 0) AS outs_pitched
FROM {table}
WHERE mlbam_id = %s
ORDER BY game_date DESC
LIMIT %s
""",
(player_id, window),
)
Comment on lines +213 to +241
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The use of a conditional expression to execute different cur.execute calls is non-idiomatic and reduces readability. It is better to use a standard if/else block to define the query string and then call execute once at the end.

rows = cur.fetchall()
except Exception as exc:
logger.debug("[Form][DB] game log query failed: %s", exc)
return None
finally:
conn.close()

if not rows:
return None

def _row_to_stat(r) -> dict:
hits = (r[1] or 0) + (r[2] or 0) + (r[3] or 0) + (r[4] or 0)
tb = (r[1] or 0) + 2*(r[2] or 0) + 3*(r[3] or 0) + 4*(r[4] or 0)
return {
"hits": hits,
"rbi": r[5] or 0,
"runs": r[6] or 0,
"totalBases": tb,
"strikeOuts": r[7] if group == "hitting" else r[8],
"earnedRuns": r[9] or 0,
}
Comment on lines +252 to +262
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The _row_to_stat helper relies on hardcoded column indices (e.g., r[1], r[5]). This is fragile and will break silently if the SELECT statement in the query is ever modified or reordered. Using psycopg2.extras.RealDictCursor to access columns by name would be much more robust.


# Return synthetic split dicts
return [{"stat": _row_to_stat(r)} for r in rows]

@staticmethod
def _fetch_season_per_game_from_db(
player_id: int, group: str
) -> dict[str, float] | None:
"""
Compute season-average-per-game from DB for baseline comparison.
Returns {api_stat_key: per_game_avg} or None if insufficient data.
"""
conn = _get_pg()
if conn is None:
return None
table = "live_batting_logs" if group == "hitting" else "live_pitching_logs"
try:
with conn, conn.cursor() as cur:
if table == "live_batting_logs":
cur.execute(
"""
SELECT COUNT(*),
SUM(h_1b+h_2b+h_3b+home_runs),
SUM(b_rbi), SUM(b_runs),
SUM(h_1b + 2*h_2b + 3*h_3b + 4*home_runs),
SUM(b_k)
FROM live_batting_logs
WHERE mlbam_id = %s AND game_date >= '2026-03-01'
""",
(player_id,),
)
else:
cur.execute(
"""
SELECT COUNT(*),
0, 0, 0, 0,
SUM(strikeouts), SUM(earnedruns)
FROM live_pitching_logs
WHERE mlbam_id = %s AND game_date >= '2026-03-01'
Comment on lines +290 to +301
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

There is a logic discrepancy between the DB fetcher and the API fetcher for the season baseline. The API fetcher (_fetch_season_per_game, line 376) uses the prior year as the baseline, which aligns with the module's documentation (line 6). However, the DB fetcher uses a hardcoded date '2026-03-01', which calculates the average from the current season. This inconsistency means the form adjustment will change significantly depending on whether the data is retrieved from the database or the API fallback.

Comment on lines +290 to +301
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Hardcoded season date '2026-03-01' will break in future seasons.

The season filter is hardcoded to 2026-03-01, which won't adapt when the season changes. The API fallback at line 376 correctly uses datetime.now().year - 1 to determine the baseline season. Apply similar dynamic logic here.

🐛 Proposed fix: derive season start dynamically

At the start of the method, compute the season start date:

from datetime import datetime

# Inside _fetch_season_per_game_from_db, before the query:
current_year = datetime.now().year
season_start = f"{current_year}-03-01"

Then use season_start as a parameter:

                     cur.execute(
                         """
                         SELECT COUNT(*),
                                SUM(h_1b+h_2b+h_3b+home_runs),
                                SUM(b_rbi), SUM(b_runs),
                                SUM(h_1b + 2*h_2b + 3*h_3b + 4*home_runs),
                                SUM(b_k)
                         FROM live_batting_logs
-                        WHERE mlbam_id = %s AND game_date >= '2026-03-01'
+                        WHERE mlbam_id = %s AND game_date >= %s
                         """,
-                        (player_id,),
+                        (player_id, season_start),
                     )

Apply the same change to the pitching query.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@mlb_form_layer.py` around lines 290 - 301, The queries in
_fetch_season_per_game_from_db currently hardcode '2026-03-01'; compute a
dynamic season_start (e.g. season_start = f"{datetime.now().year}-03-01" after
importing datetime) at the top of the function and replace the literal
'2026-03-01' in both the batting and pitching SQL branches with a parameterized
placeholder, passing season_start in the execute parameter tuple (replace
(player_id,) with (player_id, season_start) or similar) so the season filter
adapts each year.

""",
(player_id,),
)
row = cur.fetchone()
except Exception as exc:
logger.debug("[Form][DB] season avg query failed: %s", exc)
return None
finally:
conn.close()

if not row or not row[0] or row[0] < 3:
return None

gp = row[0]
if group == "hitting":
return {
"hits": (row[1] or 0) / gp,
"rbi": (row[2] or 0) / gp,
"runs": (row[3] or 0) / gp,
"totalBases": (row[4] or 0) / gp,
"strikeOuts": (row[5] or 0) / gp,
}
else:
return {
"strikeOuts": (row[5] or 0) / gp,
"earnedRuns": (row[6] or 0) / gp,
}

@staticmethod
def _fetch_game_log(
player_id: int, group: str, window: int
Expand Down Expand Up @@ -257,8 +419,17 @@

for group in groups_needed:
window = _PITCHER_WINDOW if group == "pitching" else _HITTER_WINDOW
recent_splits = self._fetch_game_log(player_id, group, window)
season_pg = self._fetch_season_per_game(player_id, group)

# DB-first: read from nightly game_logs_refresh tables (fast, no API quota)
recent_splits = self._fetch_game_log_from_db(player_id, group, window)
season_pg = self._fetch_season_per_game_from_db(player_id, group)

# API fallback if DB tables are empty / player not yet seeded
if recent_splits is None:
logger.debug("[Form] DB miss for pid=%d group=%s — falling back to live API", player_id, group)
recent_splits = self._fetch_game_log(player_id, group, window)
if season_pg is None:
season_pg = self._fetch_season_per_game(player_id, group)

if not recent_splits or not season_pg:
continue
Expand Down