diff --git a/course/analytics.py b/course/analytics.py index aeb2ffe06..c31e820c6 100644 --- a/course/analytics.py +++ b/course/analytics.py @@ -25,7 +25,9 @@ import json import operator +from collections import defaultdict from dataclasses import dataclass +from datetime import timedelta from typing import TYPE_CHECKING, Any, ClassVar, final from django import http @@ -33,7 +35,19 @@ from django.contrib.auth.decorators import login_required from django.core.exceptions import ObjectDoesNotExist, PermissionDenied from django.db import connection -from django.db.models import FloatField, OuterRef, Subquery +from django.db.models import ( + Avg, + Count, + DateTimeField, + DurationField, + ExpressionWrapper, + F, + FloatField, + Max, + Min, + OuterRef, + Subquery, +) from django.urls import reverse from django.utils.translation import gettext as _, pgettext from pytools import not_none @@ -552,6 +566,145 @@ def count_participants(pctx: CoursePageContext, flow_id: str): return qset.count() +# {{{ page timing stats + +@dataclass +class PageTimingStats: + group_id: str + page_id: str + count: int + avg_time: float | None # in minutes + min_time: float | None # in minutes + max_time: float | None # in minutes + stddev_time: float | None # in minutes + + +def make_page_timing_stats_list( + pctx: CoursePageContext, + flow_id: str, + include_all_sessions: bool) -> list[PageTimingStats]: + """Return per-page timing statistics sorted by average time (descending). + + For each submitted-answer visit, this function searches for the most recent + preceding page view (``is_submitted_answer`` is NULL) in the same session + and uses the elapsed time as "time spent on the page". Summary statistics + (count, min, max, mean, sample standard deviation) are returned in minutes. + + When *include_all_sessions* is ``False``, only sessions belonging to + participants with the ``included_in_grade_statistics`` permission are + considered. When ``True``, all sessions are included (useful for seeing + where TAs spend time during test runs). + """ + + # Correlated sub-query: visit_time of the most recent preceding + # non-answer visit to the same page in the same session. + preceding_time_sq = Subquery( + FlowPageVisit.objects.filter( + page_data_id=OuterRef("page_data_id"), + flow_session_id=OuterRef("flow_session_id"), + visit_time__lt=OuterRef("visit_time"), + is_submitted_answer__isnull=True, + ) + .order_by("-visit_time") + .values("visit_time")[:1], + output_field=DateTimeField(), + ) + + visits_qs = FlowPageVisit.objects.filter( + flow_session__course=pctx.course, + flow_session__flow_id=flow_id, + is_submitted_answer=True, + ) + + if not include_all_sessions: + visits_qs = visits_qs.filter( + flow_session__participation__roles__permissions__permission=( + PPerm.included_in_grade_statistics) + ) + + # Annotate each submitted-answer visit with its elapsed duration. + # Negative durations (clock-skew or data anomalies where the answer visit + # was recorded before the page view) are excluded at the database level so + # that the aggregated values and the per-row durations fetched for stddev + # are computed from the same filtered set. + # Count/Avg/Min/Max are pushed to the database. StdDev is not universally + # supported across database backends (notably absent from SQLite), so sample + # standard deviation is computed in Python from a second fetch of the + # per-page duration values. This is a known two-query approach for stddev. + annotated_qs = (visits_qs + .annotate(preceding_visit_time=preceding_time_sq) + .filter(preceding_visit_time__isnull=False) + .annotate( + duration=ExpressionWrapper( + F("visit_time") - F("preceding_visit_time"), + output_field=DurationField(), + ) + ) + .filter(duration__gte=timedelta(0)) + ) + + page_agg = (annotated_qs + .values("page_data__group_id", "page_data__page_id") + .annotate( + count=Count("id"), + avg_duration=Avg("duration"), + min_duration=Min("duration"), + max_duration=Max("duration"), + ) + ) + + # Collect individual durations per page for the stddev calculation. + page_durations: dict[tuple[str, str], list[float]] = defaultdict(list) + for row in (annotated_qs + .values_list("page_data__group_id", "page_data__page_id", "duration")): + page_durations[row[0], row[1]].append(row[2].total_seconds() / 60) + + # Build result list. + result: list[PageTimingStats] = [] + for agg in page_agg: + group_id = agg["page_data__group_id"] + page_id = agg["page_data__page_id"] + count = agg["count"] + avg_time = ( + agg["avg_duration"].total_seconds() / 60 + if agg["avg_duration"] is not None else None + ) + min_time = ( + agg["min_duration"].total_seconds() / 60 + if agg["min_duration"] is not None else None + ) + max_time = ( + agg["max_duration"].total_seconds() / 60 + if agg["max_duration"] is not None else None + ) + + # Sample stddev (requires at least two observations). + # avg_time and page_durations are derived from the same filtered set, so + # they are consistent. + stddev_time: float | None = None + times = page_durations.get((group_id, page_id), []) + if len(times) >= 2 and avg_time is not None: + variance = sum((t - avg_time) ** 2 for t in times) / (len(times) - 1) + stddev_time = variance ** 0.5 + + result.append(PageTimingStats( + group_id=group_id, + page_id=page_id, + count=count, + avg_time=avg_time, + min_time=min_time, + max_time=max_time, + stddev_time=stddev_time, + )) + + # Sort by average time descending so the most time-consuming pages appear + # first. + result.sort(key=lambda s: s.avg_time or 0, reverse=True) + return result + +# }}} + + @login_required @course_view def flow_analytics(pctx: CoursePageContext, flow_id: str): @@ -561,6 +714,9 @@ def flow_analytics(pctx: CoursePageContext, flow_id: str): restrict_to_first_attempt = bool( bool(pctx.request.GET.get("restrict_to_first_attempt") == "1")) + timing_include_all_sessions = ( + pctx.request.GET.get("timing_include_all_sessions") == "1") + try: stats_list = make_page_answer_stats_list(pctx, flow_id, restrict_to_first_attempt) @@ -571,6 +727,9 @@ def flow_analytics(pctx: CoursePageContext, flow_id: str): % flow_id) raise http.Http404() + page_timing_stats_list = make_page_timing_stats_list( + pctx, flow_id, timing_include_all_sessions) + return render_course_page(pctx, "course/analytics-flow.html", { "flow_identifier": flow_id, "grade_histogram": make_grade_histogram(pctx, flow_id), @@ -578,6 +737,8 @@ def flow_analytics(pctx: CoursePageContext, flow_id: str): "time_histogram": make_time_histogram(pctx, flow_id), "participant_count": count_participants(pctx, flow_id), "restrict_to_first_attempt": restrict_to_first_attempt, + "page_timing_stats_list": page_timing_stats_list, + "timing_include_all_sessions": timing_include_all_sessions, }) # }}} diff --git a/course/templates/course/analytics-flow.html b/course/templates/course/analytics-flow.html index 2b61822db..cad55f333 100644 --- a/course/templates/course/analytics-flow.html +++ b/course/templates/course/analytics-flow.html @@ -24,12 +24,12 @@
+ {% if timing_include_all_sessions %} + {% trans "Showing timing for all sessions." %} + + {% trans "Show only grade-statistics sessions" %} + + {% else %} + {% trans "Showing timing for grade-statistics sessions only." %} + + {% trans "Show all sessions (including TAs etc.)" %} + + {% endif %} +
+ + {% if page_timing_stats_list %} +| {% trans "Page" %} | +{% trans "Count" %} | +{% trans "Min (min)" %} | +{% trans "Max (min)" %} | +{% trans "Mean (min)" %} | +{% trans "Std Dev (min)" %} | +
|---|---|---|---|---|---|
| {{ tstats.group_id }}/{{ tstats.page_id }} | +{{ tstats.count }} | +{{ tstats.min_time|floatformat:1 }} | +{{ tstats.max_time|floatformat:1 }} | +{{ tstats.avg_time|floatformat:1 }} | +{{ tstats.stddev_time|floatformat:1 }} | +
{% trans "No timing data available." %}
+ {% endif %} {% endblock %} diff --git a/tests/test_analytics.py b/tests/test_analytics.py index b430cc1d1..dd71ffa37 100644 --- a/tests/test_analytics.py +++ b/tests/test_analytics.py @@ -382,7 +382,10 @@ def test_success_check_func_call(self): ) as mock_make_t_his, mock.patch( "course.analytics.count_participants", return_value=2, - ) as mock_count_particpt: + ) as mock_count_particpt, mock.patch( + "course.analytics.make_page_timing_stats_list", + return_value=[], + ) as mock_make_timing: resp = self.get_flow_analytics_view(flow_id=self.flow_id) self.assertEqual(resp.status_code, 200) self.assertResponseContextEqual( @@ -394,6 +397,7 @@ def test_success_check_func_call(self): self.assertEqual(mock_make_stats_list.call_count, 1) self.assertEqual(mock_make_t_his.call_count, 1) self.assertEqual(mock_count_particpt.call_count, 1) + self.assertEqual(mock_make_timing.call_count, 1) def test_success_test_restrict_to_first_attempt(self): with mock.patch( @@ -407,7 +411,10 @@ def test_success_test_restrict_to_first_attempt(self): ) as mock_make_t_his, mock.patch( "course.analytics.count_participants", return_value=2, - ) as mock_count_particpt: + ) as mock_count_particpt, mock.patch( + "course.analytics.make_page_timing_stats_list", + return_value=[], + ) as mock_make_timing: resp = self.get_flow_analytics_view(flow_id=self.flow_id, restrict_to_first_attempt=1) self.assertEqual(resp.status_code, 200) @@ -424,6 +431,7 @@ def test_success_test_restrict_to_first_attempt(self): self.assertEqual(mock_make_stats_list.call_count, 1) self.assertEqual(mock_make_t_his.call_count, 1) self.assertEqual(mock_count_particpt.call_count, 1) + self.assertEqual(mock_make_timing.call_count, 1) def test_success_test_restrict_to_first_attempt_invalid(self): with mock.patch( @@ -437,7 +445,10 @@ def test_success_test_restrict_to_first_attempt_invalid(self): ) as mock_make_t_his, mock.patch( "course.analytics.count_participants", return_value=2, - ) as mock_count_particpt: + ) as mock_count_particpt, mock.patch( + "course.analytics.make_page_timing_stats_list", + return_value=[], + ) as mock_make_timing: resp = self.get_flow_analytics_view(flow_id=self.flow_id, restrict_to_first_attempt="foo") self.assertEqual(resp.status_code, 200) @@ -453,6 +464,7 @@ def test_success_test_restrict_to_first_attempt_invalid(self): self.assertEqual(mock_make_stats_list.call_count, 1) self.assertEqual(mock_make_t_his.call_count, 1) self.assertEqual(mock_count_particpt.call_count, 1) + self.assertEqual(mock_make_timing.call_count, 1) def test_flow_id_does_not_exist(self): with mock.patch( @@ -465,4 +477,157 @@ def test_flow_id_does_not_exist(self): f"Flow '{self.flow_id}' was not found in the repository, but it exists in " # noqa: E501 "the database--maybe it was deleted?") + +@pytest.mark.slow +class MakePageTimingStatsListTest(SingleCourseTestMixin, TestCase): + """Test analytics.make_page_timing_stats_list.""" + + FLOW_ID = "test-timing-flow" + + def _make_pctx(self): + pctx = mock.MagicMock() + pctx.course = self.course + return pctx + + def _make_visits(self, participation, group_id, page_id, + view_time, answer_time): + """Create a (view, answer) pair of FlowPageVisit records.""" + from datetime import timedelta + + from django.utils.timezone import now + + base = now() + session = factories.FlowSessionFactory( + participation=participation, + flow_id=self.FLOW_ID, + ) + page_data = factories.FlowPageDataFactory( + flow_session=session, + group_id=group_id, + page_id=page_id, + ) + # Non-answer (page view) visit + factories.FlowPageVisitFactory( + page_data=page_data, + flow_session=session, + visit_time=base + timedelta(minutes=view_time), + is_submitted_answer=None, + ) + # Submitted-answer visit + factories.FlowPageVisitFactory( + page_data=page_data, + flow_session=session, + visit_time=base + timedelta(minutes=answer_time), + is_submitted_answer=True, + ) + return session + + def test_empty_when_no_visits(self): + pctx = self._make_pctx() + result = analytics.make_page_timing_stats_list( + pctx, self.FLOW_ID, include_all_sessions=True) + self.assertEqual(result, []) + + def test_single_visit_no_preceding_view(self): + """A submitted-answer visit without a preceding view yields no stats.""" + pctx = self._make_pctx() + session = factories.FlowSessionFactory( + participation=self.student_participation, + flow_id=self.FLOW_ID, + ) + page_data = factories.FlowPageDataFactory( + flow_session=session, + group_id="grp", + page_id="pg", + ) + factories.FlowPageVisitFactory( + page_data=page_data, + flow_session=session, + is_submitted_answer=True, + ) + result = analytics.make_page_timing_stats_list( + pctx, self.FLOW_ID, include_all_sessions=True) + self.assertEqual(result, []) + + def test_basic_timing_stats(self): + """Stats are computed correctly for a single page with known time.""" + pctx = self._make_pctx() + # Student views the page at t=0 min and submits at t=5 min → 5 min elapsed. + self._make_visits(self.student_participation, + "grp", "pg", view_time=0, answer_time=5) + result = analytics.make_page_timing_stats_list( + pctx, self.FLOW_ID, include_all_sessions=True) + self.assertEqual(len(result), 1) + stats = result[0] + self.assertEqual(stats.group_id, "grp") + self.assertEqual(stats.page_id, "pg") + self.assertEqual(stats.count, 1) + self.assertAlmostEqual(stats.avg_time, 5.0, places=3) # type: ignore[arg-type] + self.assertAlmostEqual(stats.min_time, 5.0, places=3) # type: ignore[arg-type] + self.assertAlmostEqual(stats.max_time, 5.0, places=3) # type: ignore[arg-type] + self.assertIsNone(stats.stddev_time) # sample stddev undefined for n=1 + + def test_multiple_visits_stats(self): + """Mean, min, max, and stddev are correct for multiple visits.""" + pctx = self._make_pctx() + # Three students spend 2, 4, 6 minutes respectively + for mins in [2, 4, 6]: + p = factories.ParticipationFactory(course=self.course) + self._make_visits(p, "grp", "pg", + view_time=0, answer_time=mins) + result = analytics.make_page_timing_stats_list( + pctx, self.FLOW_ID, include_all_sessions=True) + self.assertEqual(len(result), 1) + stats = result[0] + self.assertEqual(stats.count, 3) + self.assertAlmostEqual(stats.avg_time, 4.0, places=3) # type: ignore[arg-type] + self.assertAlmostEqual(stats.min_time, 2.0, places=3) # type: ignore[arg-type] + self.assertAlmostEqual(stats.max_time, 6.0, places=3) # type: ignore[arg-type] + # Sample std dev of [2,4,6]: variance = ((4+0+4)/2) = 4, stddev = 2 + self.assertAlmostEqual(stats.stddev_time, 2.0, places=3) # type: ignore[arg-type] + + def test_sorted_by_avg_time_descending(self): + """Results are sorted most-time-consuming first.""" + pctx = self._make_pctx() + # Page A: 1 minute, Page B: 10 minutes + p1 = factories.ParticipationFactory(course=self.course) + p2 = factories.ParticipationFactory(course=self.course) + self._make_visits(p1, "grp", "page_a", view_time=0, answer_time=1) + self._make_visits(p2, "grp", "page_b", view_time=0, answer_time=10) + result = analytics.make_page_timing_stats_list( + pctx, self.FLOW_ID, include_all_sessions=True) + self.assertEqual(len(result), 2) + self.assertEqual(result[0].page_id, "page_b") + self.assertEqual(result[1].page_id, "page_a") + + def test_include_all_sessions_vs_grade_stats_only(self): + """The include_all_sessions flag filters correctly. + + student_participation has ``included_in_grade_statistics``. + ta_participation does NOT (the "ta" role is excluded from grade stats). + """ + pctx = self._make_pctx() + + # student_participation has included_in_grade_statistics permission + self._make_visits(self.student_participation, + "grp", "pg", view_time=0, answer_time=5) + + # ta_participation does NOT have included_in_grade_statistics. + self._make_visits(self.ta_participation, + "grp", "pg", view_time=0, answer_time=3) + + result_all = analytics.make_page_timing_stats_list( + pctx, self.FLOW_ID, include_all_sessions=True) + result_graded = analytics.make_page_timing_stats_list( + pctx, self.FLOW_ID, include_all_sessions=False) + + # With include_all_sessions=True both sessions are counted. + self.assertEqual(len(result_all), 1) + self.assertEqual(result_all[0].count, 2) + + # With include_all_sessions=False only the student session is counted. + self.assertEqual(len(result_graded), 1) + self.assertEqual(result_graded[0].count, 1) + self.assertAlmostEqual(result_graded[0].avg_time, 5.0, places=3) # type: ignore[arg-type] + # vim: fdm=marker