From 7c45d69d98a48fa3248baeef48962a7d32389ae4 Mon Sep 17 00:00:00 2001 From: lucasbrentano Date: Wed, 8 Apr 2026 19:29:18 -0300 Subject: [PATCH 1/2] =?UTF-8?q?refactor(annotate):=20alterar=20unidade=20d?= =?UTF-8?q?e=20anota=C3=A7=C3=A3o=20de=20coment=C3=A1rio=20para=20usu?= =?UTF-8?q?=C3=A1rio=20(#87)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A classificação agora é por autor (DatasetEntry), não por comentário. Comentários servem apenas como evidências para fundamentar a decisão. - Model: Annotation e AnnotationConflict usam dataset_entry_id (FK → dataset_entries) - Schemas: campos renomeados (entry_id, total_users, annotated_users_by_me) - Services: annotate, review, dashboard e data reescritos para granularidade por entry - Routers: endpoints mantidos, contratos atualizados - Migration: Alembic 0015 altera FKs (destrutiva — limpa anotações existentes) - Testes: 340 testes passando Co-Authored-By: Claude --- ...8_0015_annotation_unit_comment_to_entry.py | 115 +++++ backend/models/annotation.py | 12 +- backend/routers/annotate.py | 4 +- backend/routers/dashboard.py | 8 +- backend/routers/review.py | 4 +- backend/schemas/annotate.py | 27 +- backend/schemas/dashboard.py | 19 +- backend/schemas/data.py | 9 +- backend/schemas/review.py | 24 +- backend/services/annotate.py | 397 +++++++-------- backend/services/dashboard.py | 441 +++++++--------- backend/services/data.py | 167 ++----- backend/services/review.py | 471 ++++++++---------- backend/services/seed.py | 45 +- backend/tests/test_annotate.py | 292 ++++------- backend/tests/test_dashboard.py | 264 +++++----- backend/tests/test_data.py | 111 ++--- backend/tests/test_review.py | 352 +++++-------- backend/tests/test_seed.py | 13 +- 19 files changed, 1228 insertions(+), 1547 deletions(-) create mode 100644 backend/alembic/versions/20260408_0015_annotation_unit_comment_to_entry.py diff --git a/backend/alembic/versions/20260408_0015_annotation_unit_comment_to_entry.py b/backend/alembic/versions/20260408_0015_annotation_unit_comment_to_entry.py new file mode 100644 index 0000000..12abf6e --- /dev/null +++ b/backend/alembic/versions/20260408_0015_annotation_unit_comment_to_entry.py @@ -0,0 +1,115 @@ +"""alterar unidade de anotação de comentário para dataset_entry (usuário) + +Revision ID: 0015 +Revises: 0014 +Create Date: 2026-04-08 00:00:00.000000 + +Migração destrutiva: remove todas as anotações, conflitos e resoluções +existentes antes de alterar as FKs. Necessário porque a mudança de +granularidade (comentário → usuário) invalida os dados anteriores. +""" + +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +revision: str = "0015" +down_revision: str | None = "0014" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + # 1) Limpar dados existentes (mudança de granularidade os invalida) + op.execute("DELETE FROM resolutions") + op.execute("DELETE FROM annotation_conflicts") + op.execute("DELETE FROM annotations") + + # 2) annotations: comment_id → dataset_entry_id + op.drop_constraint("uq_comment_annotator", "annotations", type_="unique") + op.drop_constraint("annotations_comment_id_fkey", "annotations", type_="foreignkey") + op.drop_column("annotations", "comment_id") + + op.add_column( + "annotations", + sa.Column( + "dataset_entry_id", + sa.Uuid(), + sa.ForeignKey("dataset_entries.id"), + nullable=False, + ), + ) + op.create_unique_constraint( + "uq_entry_annotator", + "annotations", + ["dataset_entry_id", "annotator_id"], + ) + + # 3) annotation_conflicts: comment_id → dataset_entry_id + op.drop_constraint( + "annotation_conflicts_comment_id_key", + "annotation_conflicts", + type_="unique", + ) + op.drop_constraint( + "annotation_conflicts_comment_id_fkey", + "annotation_conflicts", + type_="foreignkey", + ) + op.drop_column("annotation_conflicts", "comment_id") + + op.add_column( + "annotation_conflicts", + sa.Column( + "dataset_entry_id", + sa.Uuid(), + sa.ForeignKey("dataset_entries.id"), + nullable=False, + unique=True, + ), + ) + + +def downgrade() -> None: + # Reverter: dataset_entry_id → comment_id + op.execute("DELETE FROM resolutions") + op.execute("DELETE FROM annotation_conflicts") + op.execute("DELETE FROM annotations") + + # annotation_conflicts + op.drop_constraint( + "annotation_conflicts_dataset_entry_id_key", + "annotation_conflicts", + type_="unique", + ) + op.drop_column("annotation_conflicts", "dataset_entry_id") + op.add_column( + "annotation_conflicts", + sa.Column( + "comment_id", + sa.Uuid(), + sa.ForeignKey("comments.id"), + nullable=False, + unique=True, + ), + ) + + # annotations + op.drop_constraint("uq_entry_annotator", "annotations", type_="unique") + op.drop_column("annotations", "dataset_entry_id") + op.add_column( + "annotations", + sa.Column( + "comment_id", + sa.Uuid(), + sa.ForeignKey("comments.id"), + nullable=False, + ), + ) + op.create_unique_constraint( + "uq_comment_annotator", + "annotations", + ["comment_id", "annotator_id"], + ) diff --git a/backend/models/annotation.py b/backend/models/annotation.py index c9839d0..e339425 100644 --- a/backend/models/annotation.py +++ b/backend/models/annotation.py @@ -11,7 +11,9 @@ class Annotation(Base): __tablename__ = "annotations" id: Mapped[uuid.UUID] = mapped_column(primary_key=True, default=uuid.uuid4) - comment_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("comments.id")) + dataset_entry_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("dataset_entries.id") + ) annotator_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("users.id")) label: Mapped[str] = mapped_column(String(8), nullable=False) justificativa: Mapped[str | None] = mapped_column(Text, nullable=True) @@ -20,11 +22,11 @@ class Annotation(Base): default=datetime.utcnow, onupdate=datetime.utcnow ) - comment: Mapped["Comment"] = relationship() # noqa: F821 + dataset_entry: Mapped["DatasetEntry"] = relationship() # noqa: F821 annotator: Mapped["User"] = relationship() # noqa: F821 __table_args__ = ( - UniqueConstraint("comment_id", "annotator_id", name="uq_comment_annotator"), + UniqueConstraint("dataset_entry_id", "annotator_id", name="uq_entry_annotator"), CheckConstraint("label IN ('bot', 'humano')", name="ck_valid_label"), ) @@ -33,8 +35,8 @@ class AnnotationConflict(Base): __tablename__ = "annotation_conflicts" id: Mapped[uuid.UUID] = mapped_column(primary_key=True, default=uuid.uuid4) - comment_id: Mapped[uuid.UUID] = mapped_column( - ForeignKey("comments.id"), unique=True + dataset_entry_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("dataset_entries.id"), unique=True ) annotation_a_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("annotations.id")) annotation_b_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("annotations.id")) diff --git a/backend/routers/annotate.py b/backend/routers/annotate.py index 2b1e497..7c87efc 100644 --- a/backend/routers/annotate.py +++ b/backend/routers/annotate.py @@ -73,7 +73,7 @@ def get_comments_endpoint( ) -# ─── Anotar comentário ────────────────────────────────────────────────────── +# ─── Anotar usuário (entry) ─────────────────────────────────────────────── @router.post("", response_model=AnnotationResult) @@ -91,7 +91,7 @@ def annotate_endpoint( result = upsert_annotation( db, - payload.comment_db_id, + payload.entry_id, current_user.id, payload.label, payload.justificativa, diff --git a/backend/routers/dashboard.py b/backend/routers/dashboard.py index b225b28..6044b71 100644 --- a/backend/routers/dashboard.py +++ b/backend/routers/dashboard.py @@ -6,7 +6,7 @@ from database import get_db from models.user import User from schemas.dashboard import ( - BotCommentsResponse, + BotUsersResponse, CriteriaEffectivenessItem, GlobalDashboardResponse, UserDashboardResponse, @@ -14,7 +14,7 @@ ) from services.auth import get_current_user from services.dashboard import ( - get_bot_comments, + get_bot_users, get_criteria_effectiveness, get_global_dashboard, get_user_dashboard, @@ -69,7 +69,7 @@ def user_endpoint( return get_user_dashboard(db, user_id=current_user.id) -@router.get("/bots", response_model=BotCommentsResponse) +@router.get("/bots", response_model=BotUsersResponse) def bots_endpoint( dataset_id: str | None = Query(default=None), video_id: str | None = Query(default=None), @@ -84,7 +84,7 @@ def bots_endpoint( criteria_list = ( [c.strip() for c in criteria.split(",") if c.strip()] if criteria else None ) - return get_bot_comments( + return get_bot_users( db, dataset_id=dataset_id, video_id=video_id, diff --git a/backend/routers/review.py b/backend/routers/review.py index 5b65483..f4525d8 100644 --- a/backend/routers/review.py +++ b/backend/routers/review.py @@ -157,7 +157,7 @@ def import_endpoint( db: Session = Depends(get_db), admin: User = Depends(require_admin), ): - return import_review(db, admin.id, payload.video_id, payload.comments) + return import_review(db, admin.id, payload.video_id, payload.users) @router.post("/import-chunk", response_model=ImportChunkResponse) @@ -166,5 +166,5 @@ def import_chunk_endpoint( db: Session = Depends(get_db), admin: User = Depends(require_admin), ): - result = import_review_chunk(db, admin.id, payload.comments, payload.done) + result = import_review_chunk(db, admin.id, payload.users, payload.done) return ImportChunkResponse(**result) diff --git a/backend/schemas/annotate.py b/backend/schemas/annotate.py index 53d9174..2dfff30 100644 --- a/backend/schemas/annotate.py +++ b/backend/schemas/annotate.py @@ -8,7 +8,7 @@ class AnnotationCreate(BaseModel): - comment_db_id: uuid.UUID + entry_id: uuid.UUID label: Literal["bot", "humano"] justificativa: str | None = None @@ -20,7 +20,7 @@ def justificativa_required_for_bot(self): class AnnotationImportItem(BaseModel): - comment_db_id: uuid.UUID + entry_id: uuid.UUID label: Literal["bot", "humano"] justificativa: str | None = None @@ -54,7 +54,7 @@ class ImportChunkResponse(BaseModel): class AnnotationResult(BaseModel): annotation_id: uuid.UUID - comment_db_id: uuid.UUID + entry_id: uuid.UUID label: str conflict_created: bool @@ -74,21 +74,23 @@ class AnnotatorAnnotation(BaseModel): annotated_at: datetime -class CommentWithAnnotation(BaseModel): +class CommentItem(BaseModel): + """Comentário exibido como evidência (sem anotação individual).""" + comment_db_id: uuid.UUID text_original: str like_count: int reply_count: int published_at: datetime - my_annotation: MyAnnotation | None - all_annotations: list[AnnotatorAnnotation] | None = None class UserCommentsResponse(BaseModel): entry_id: uuid.UUID author_display_name: str author_channel_id: str - comments: list[CommentWithAnnotation] + comments: list[CommentItem] + my_annotation: MyAnnotation | None + all_annotations: list[AnnotatorAnnotation] | None = None class UserItem(BaseModel): @@ -96,16 +98,15 @@ class UserItem(BaseModel): author_channel_id: str author_display_name: str comment_count: int - my_annotated_count: int - my_pending_count: int + is_annotated_by_me: bool + my_label: str | None = None class DatasetUsersResponse(BaseModel): dataset_id: uuid.UUID dataset_name: str total_users: int - total_comments: int - annotated_comments_by_me: int + annotated_users_by_me: int page: int page_size: int total_pages: int @@ -115,7 +116,7 @@ class DatasetUsersResponse(BaseModel): class DatasetProgress(BaseModel): dataset_id: uuid.UUID dataset_name: str - total_comments: int + total_users: int annotated: int bots: int humans: int @@ -134,7 +135,7 @@ class AnnotatorProgress(BaseModel): annotator_name: str dataset_id: uuid.UUID dataset_name: str - total_comments: int + total_users: int annotated: int bots: int humans: int diff --git a/backend/schemas/dashboard.py b/backend/schemas/dashboard.py index 92c72f7..aed76a0 100644 --- a/backend/schemas/dashboard.py +++ b/backend/schemas/dashboard.py @@ -9,8 +9,8 @@ class GlobalSummary(BaseModel): total_datasets: int - total_comments_annotated: int - total_comments_in_datasets: int + total_users_annotated: int + total_users_in_datasets: int annotation_progress: float total_bots: int total_humans: int @@ -37,7 +37,7 @@ class CriteriaEffectivenessItem(BaseModel): criteria: str group: str total_datasets: int - total_comments_selected: int + total_users_selected: int total_bots: int bot_rate: float @@ -47,7 +47,7 @@ class CriteriaEffectivenessItem(BaseModel): class VideoSummary(BaseModel): total_comments_collected: int - total_comments_in_datasets: int + total_users_in_datasets: int total_annotated: int total_bots: int total_humans: int @@ -91,7 +91,7 @@ class UserDatasetProgress(BaseModel): dataset_id: uuid.UUID dataset_name: str video_id: str - total_comments: int + total_users: int annotated_by_me: int pending: int percent_complete: float @@ -111,16 +111,17 @@ class UserDashboardResponse(BaseModel): # ── Tabela de Bots ───────────────────────────────────────────────── -class BotCommentItem(BaseModel): +class BotUserItem(BaseModel): dataset_name: str author_display_name: str - text_original: str + author_channel_id: str + comment_count: int concordance_pct: int conflict_status: str | None = None annotators_count: int = 0 criteria: list[str] = [] -class BotCommentsResponse(BaseModel): +class BotUsersResponse(BaseModel): total: int - items: list[BotCommentItem] + items: list[BotUserItem] diff --git a/backend/schemas/data.py b/backend/schemas/data.py index 1ebe3cc..b9564a6 100644 --- a/backend/schemas/data.py +++ b/backend/schemas/data.py @@ -47,11 +47,10 @@ class DataDataset(BaseModel): class DataAnnotationProgress(BaseModel): dataset_id: uuid.UUID dataset_name: str - total: int - annotated: int - pending: int + total_users: int + annotated_users: int + pending_users: int conflicts: int conflicts_resolved: int annotators_count: int - bots_users: int - bots_comments: int + bots: int diff --git a/backend/schemas/review.py b/backend/schemas/review.py index 78f1cc6..3676369 100644 --- a/backend/schemas/review.py +++ b/backend/schemas/review.py @@ -12,11 +12,10 @@ class ResolveRequest(BaseModel): resolved_label: Literal["bot", "humano"] -class ReviewImportComment(BaseModel): - comment_db_id: uuid.UUID +class ReviewImportUser(BaseModel): + entry_id: uuid.UUID author_channel_id: str author_display_name: str - text_original: str final_label: Literal["bot", "humano"] annotations: list[dict] = [] resolution: dict | None = None @@ -27,14 +26,14 @@ class ReviewImport(BaseModel): dataset_name: str video_id: str - comments: list[ReviewImportComment] = Field(min_length=1) + users: list[ReviewImportUser] = Field(min_length=1) done: bool = True class ReviewImportChunk(BaseModel): - """Batch adicional de comentários revisados para import paginado.""" + """Batch adicional de usuários revisados para import paginado.""" - comments: list[ReviewImportComment] = Field(min_length=1) + users: list[ReviewImportUser] = Field(min_length=1) done: bool = False @@ -43,11 +42,12 @@ class ReviewImportChunk(BaseModel): class ConflictListItem(BaseModel): conflict_id: uuid.UUID - comment_id: uuid.UUID + entry_id: uuid.UUID dataset_id: uuid.UUID dataset_name: str author_display_name: str - text_original: str + author_channel_id: str + comment_count: int label_a: str annotator_a: str justificativa_a: str | None @@ -101,11 +101,11 @@ class BotAnnotationDetail(BaseModel): justificativa: str | None -class BotCommentItem(BaseModel): - comment_db_id: uuid.UUID - text_original: str +class BotUserItem(BaseModel): + entry_id: uuid.UUID author_display_name: str author_channel_id: str + comment_count: int dataset_id: uuid.UUID dataset_name: str annotations: list[BotAnnotationDetail] @@ -126,7 +126,7 @@ class PaginatedBots(BaseModel): page: int page_size: int total_pages: int - items: list[BotCommentItem] + items: list[BotUserItem] class ReviewStats(BaseModel): diff --git a/backend/services/annotate.py b/backend/services/annotate.py index b1127e8..6f0e2d4 100644 --- a/backend/services/annotate.py +++ b/backend/services/annotate.py @@ -1,4 +1,8 @@ -"""Serviço da US-04 — anotação de comentários (bot/humano).""" +"""Serviço da US-04 — anotação de usuários do YouTube (bot/humano). + +Unidade de anotação: DatasetEntry (autor/canal do YouTube). +Comentários são evidências — a classificação é do autor, não do comentário. +""" import csv import io @@ -49,124 +53,124 @@ def list_dataset_users( .subquery() ) - # Subquery: annotated_count por autor + # Subquery: anotação do pesquisador para cada entry if is_admin: ann_sub = ( db.query( - Comment.author_channel_id, - func.count(func.distinct(Annotation.comment_id)).label("ac"), + Annotation.dataset_entry_id, + func.count(func.distinct(Annotation.annotator_id)).label("ac"), ) - .join(Annotation, Annotation.comment_id == Comment.id) - .filter(Comment.collection_id == collection_id) - .group_by(Comment.author_channel_id) + .group_by(Annotation.dataset_entry_id) .subquery() ) else: ann_sub = ( db.query( - Comment.author_channel_id, - func.count(Annotation.id).label("ac"), - ) - .join(Annotation, Annotation.comment_id == Comment.id) - .filter( - Comment.collection_id == collection_id, - Annotation.annotator_id == annotator_id, + Annotation.dataset_entry_id, + Annotation.label, ) - .group_by(Comment.author_channel_id) + .filter(Annotation.annotator_id == annotator_id) .subquery() ) - # Query base com LEFT JOINs para contagens cc_col = func.coalesce(comment_count_sub.c.cc, 0) - ac_col = func.coalesce(ann_sub.c.ac, 0) - pending_col = (cc_col - ac_col).label("pending") - - base_query = ( - db.query(DatasetEntry, cc_col.label("cc"), ac_col.label("ac"), pending_col) - .outerjoin( - comment_count_sub, - comment_count_sub.c.author_channel_id == DatasetEntry.author_channel_id, + + if is_admin: + ac_col = func.coalesce(ann_sub.c.ac, 0) + base_query = ( + db.query(DatasetEntry, cc_col.label("cc"), ac_col.label("ac")) + .outerjoin( + comment_count_sub, + comment_count_sub.c.author_channel_id == DatasetEntry.author_channel_id, + ) + .outerjoin(ann_sub, ann_sub.c.dataset_entry_id == DatasetEntry.id) + .filter(DatasetEntry.dataset_id == dataset_id) ) - .outerjoin( - ann_sub, - ann_sub.c.author_channel_id == DatasetEntry.author_channel_id, + else: + base_query = ( + db.query( + DatasetEntry, + cc_col.label("cc"), + ann_sub.c.label.label("my_label"), + ) + .outerjoin( + comment_count_sub, + comment_count_sub.c.author_channel_id == DatasetEntry.author_channel_id, + ) + .outerjoin(ann_sub, ann_sub.c.dataset_entry_id == DatasetEntry.id) + .filter(DatasetEntry.dataset_id == dataset_id) ) - .filter(DatasetEntry.dataset_id == dataset_id) - ) if only_pending: - base_query = base_query.filter(pending_col > 0) + if is_admin: + base_query = base_query.filter(ac_col == 0) + else: + base_query = base_query.filter(ann_sub.c.label.is_(None)) - # Total filtrado total_users = base_query.count() - # Ordenação if pending_first: - order = [pending_col.desc(), DatasetEntry.author_display_name] + if is_admin: + order = [ac_col.asc(), DatasetEntry.author_display_name] + else: + order = [ann_sub.c.label.asc(), DatasetEntry.author_display_name] else: order = [DatasetEntry.author_display_name] offset = (page - 1) * page_size rows = base_query.order_by(*order).offset(offset).limit(page_size).all() - # Totais globais (sem filtro only_pending) - all_author_ids_sub = ( - db.query(DatasetEntry.author_channel_id) - .filter(DatasetEntry.dataset_id == dataset_id) - .subquery() - ) - - total_comments = ( - db.query(func.count(Comment.id)) - .filter( - Comment.collection_id == collection_id, - Comment.author_channel_id.in_(all_author_ids_sub.select()), - ) - .scalar() - ) or 0 - + # Total global de anotados if is_admin: total_annotated = ( - db.query(func.count(func.distinct(Annotation.comment_id))) - .join(Comment, Annotation.comment_id == Comment.id) - .filter( - Comment.collection_id == collection_id, - Comment.author_channel_id.in_(all_author_ids_sub.select()), - ) + db.query(func.count(func.distinct(Annotation.dataset_entry_id))) + .join(DatasetEntry, Annotation.dataset_entry_id == DatasetEntry.id) + .filter(DatasetEntry.dataset_id == dataset_id) .scalar() ) or 0 else: total_annotated = ( db.query(func.count(Annotation.id)) - .join(Comment, Annotation.comment_id == Comment.id) + .join(DatasetEntry, Annotation.dataset_entry_id == DatasetEntry.id) .filter( - Comment.collection_id == collection_id, - Comment.author_channel_id.in_(all_author_ids_sub.select()), + DatasetEntry.dataset_id == dataset_id, Annotation.annotator_id == annotator_id, ) .scalar() ) or 0 - # Montar items a partir das rows (contagens já vêm da query) items = [] - for entry, cc, ac, _pending in rows: - items.append( - { - "entry_id": entry.id, - "author_channel_id": entry.author_channel_id, - "author_display_name": entry.author_display_name, - "comment_count": cc, - "my_annotated_count": ac, - "my_pending_count": cc - ac, - } - ) + for row in rows: + if is_admin: + entry, cc, ac = row + items.append( + { + "entry_id": entry.id, + "author_channel_id": entry.author_channel_id, + "author_display_name": entry.author_display_name, + "comment_count": cc, + "is_annotated_by_me": ac > 0, + "my_label": None, + } + ) + else: + entry, cc, my_label = row + items.append( + { + "entry_id": entry.id, + "author_channel_id": entry.author_channel_id, + "author_display_name": entry.author_display_name, + "comment_count": cc, + "is_annotated_by_me": my_label is not None, + "my_label": my_label, + } + ) return { "dataset_id": dataset.id, "dataset_name": dataset.name, "total_users": total_users, - "total_comments": total_comments, - "annotated_comments_by_me": total_annotated, + "annotated_users_by_me": total_annotated, "page": page, "page_size": page_size, "total_pages": _total_pages(total_users, page_size), @@ -178,7 +182,7 @@ def _total_pages(total: int, page_size: int) -> int: return max(1, (total + page_size - 1) // page_size) -# ─── Comentários de um usuário (entry) ────────────────────────────────────── +# ─── Comentários de um usuário (entry) — evidências ────────────────────── def get_entry_comments( @@ -206,60 +210,59 @@ def get_entry_comments( .all() ) - result_comments = [] - for c in comments: - # Anotação do pesquisador logado - my_ann = None - if not is_admin: - annotation = ( - db.query(Annotation) - .filter( - Annotation.comment_id == c.id, - Annotation.annotator_id == annotator_id, - ) - .first() - ) - if annotation: - my_ann = { - "label": annotation.label, - "justificativa": annotation.justificativa, - "annotated_at": annotation.annotated_at, - } + result_comments = [ + { + "comment_db_id": c.id, + "text_original": c.text_original, + "like_count": c.like_count, + "reply_count": c.reply_count, + "published_at": c.published_at, + } + for c in comments + ] - # Admin vê todas as anotações de todos os pesquisadores - all_anns = None - if is_admin: - annotations = ( - db.query(Annotation).filter(Annotation.comment_id == c.id).all() + # Anotação do pesquisador logado (por entry, não por comment) + my_ann = None + if not is_admin: + annotation = ( + db.query(Annotation) + .filter( + Annotation.dataset_entry_id == entry.id, + Annotation.annotator_id == annotator_id, ) - if annotations: - all_anns = [ - { - "annotator_name": a.annotator.name, - "label": a.label, - "justificativa": a.justificativa, - "annotated_at": a.annotated_at, - } - for a in annotations - ] - - result_comments.append( - { - "comment_db_id": c.id, - "text_original": c.text_original, - "like_count": c.like_count, - "reply_count": c.reply_count, - "published_at": c.published_at, - "my_annotation": my_ann, - "all_annotations": all_anns, + .first() + ) + if annotation: + my_ann = { + "label": annotation.label, + "justificativa": annotation.justificativa, + "annotated_at": annotation.annotated_at, } + + # Admin vê todas as anotações de todos os pesquisadores (por entry) + all_anns = None + if is_admin: + annotations = ( + db.query(Annotation).filter(Annotation.dataset_entry_id == entry.id).all() ) + if annotations: + all_anns = [ + { + "annotator_name": a.annotator.name, + "label": a.label, + "justificativa": a.justificativa, + "annotated_at": a.annotated_at, + } + for a in annotations + ] return { "entry_id": entry.id, "author_display_name": entry.author_display_name, "author_channel_id": entry.author_channel_id, "comments": result_comments, + "my_annotation": my_ann, + "all_annotations": all_anns, } @@ -268,22 +271,21 @@ def get_entry_comments( def upsert_annotation( db: Session, - comment_id: uuid.UUID, + entry_id: uuid.UUID, annotator_id: uuid.UUID, label: str, justificativa: str | None, ) -> dict: - # Verificar que o comentário existe - comment = db.query(Comment).filter(Comment.id == comment_id).first() - if comment is None: + entry = db.query(DatasetEntry).filter(DatasetEntry.id == entry_id).first() + if entry is None: raise HTTPException( - status.HTTP_404_NOT_FOUND, detail="Comentário não encontrado." + status.HTTP_404_NOT_FOUND, detail="Entrada de dataset não encontrada." ) # Upsert: cria ou atualiza existing = ( db.query(Annotation) - .filter_by(comment_id=comment_id, annotator_id=annotator_id) + .filter_by(dataset_entry_id=entry_id, annotator_id=annotator_id) .first() ) @@ -294,7 +296,7 @@ def upsert_annotation( annotation = existing else: annotation = Annotation( - comment_id=comment_id, + dataset_entry_id=entry_id, annotator_id=annotator_id, label=label, justificativa=justificativa, @@ -307,7 +309,7 @@ def upsert_annotation( other = ( db.query(Annotation) .filter( - Annotation.comment_id == comment_id, + Annotation.dataset_entry_id == entry_id, Annotation.annotator_id != annotator_id, ) .first() @@ -316,10 +318,12 @@ def upsert_annotation( conflict_created = False if other and other.label != label: - conflict = db.query(AnnotationConflict).filter_by(comment_id=comment_id).first() + conflict = ( + db.query(AnnotationConflict).filter_by(dataset_entry_id=entry_id).first() + ) if not conflict: conflict = AnnotationConflict( - comment_id=comment_id, + dataset_entry_id=entry_id, annotation_a_id=other.id, annotation_b_id=annotation.id, ) @@ -334,7 +338,9 @@ def upsert_annotation( conflict_created = True elif other and other.label == label: # Labels agora concordam — resolver conflito se existir - conflict = db.query(AnnotationConflict).filter_by(comment_id=comment_id).first() + conflict = ( + db.query(AnnotationConflict).filter_by(dataset_entry_id=entry_id).first() + ) if conflict and conflict.status == "pending": db.delete(conflict) @@ -342,7 +348,7 @@ def upsert_annotation( return { "annotation_id": annotation.id, - "comment_db_id": comment_id, + "entry_id": entry_id, "label": label, "conflict_created": conflict_created, } @@ -359,29 +365,21 @@ def get_my_progress( result = [] for ds in datasets: - # Total de comentários dos usuários selecionados neste dataset - total_comments = ( - db.query(func.count(Comment.id)) - .join( - DatasetEntry, - (DatasetEntry.author_channel_id == Comment.author_channel_id) - & (DatasetEntry.dataset_id == ds.id), - ) - .filter(Comment.collection_id == ds.collection_id) + total_users = ( + db.query(func.count(DatasetEntry.id)) + .filter(DatasetEntry.dataset_id == ds.id) .scalar() ) - # Minhas anotações para comentários neste dataset + if total_users == 0: + continue + + # Minhas anotações para entries neste dataset my_annotations = ( db.query(Annotation) - .join(Comment, Annotation.comment_id == Comment.id) - .join( - DatasetEntry, - (DatasetEntry.author_channel_id == Comment.author_channel_id) - & (DatasetEntry.dataset_id == ds.id), - ) + .join(DatasetEntry, Annotation.dataset_entry_id == DatasetEntry.id) .filter( - Comment.collection_id == ds.collection_id, + DatasetEntry.dataset_id == ds.id, Annotation.annotator_id == annotator_id, ) .all() @@ -391,18 +389,17 @@ def get_my_progress( bots = sum(1 for a in my_annotations if a.label == "bot") humans = sum(1 for a in my_annotations if a.label == "humano") - if total_comments > 0: - result.append( - { - "dataset_id": ds.id, - "dataset_name": ds.name, - "total_comments": total_comments, - "annotated": annotated, - "bots": bots, - "humans": humans, - "percent_complete": round(annotated / total_comments * 100, 1), - } - ) + result.append( + { + "dataset_id": ds.id, + "dataset_name": ds.name, + "total_users": total_users, + "annotated": annotated, + "bots": bots, + "humans": humans, + "percent_complete": round(annotated / total_users * 100, 1), + } + ) return result @@ -418,17 +415,12 @@ def get_all_progress(db: Session) -> list[dict]: result = [] for ds in datasets: - total_comments = ( - db.query(func.count(Comment.id)) - .join( - DatasetEntry, - (DatasetEntry.author_channel_id == Comment.author_channel_id) - & (DatasetEntry.dataset_id == ds.id), - ) - .filter(Comment.collection_id == ds.collection_id) + total_users = ( + db.query(func.count(DatasetEntry.id)) + .filter(DatasetEntry.dataset_id == ds.id) .scalar() ) - if total_comments == 0: + if total_users == 0: continue for annotator in annotators: @@ -437,14 +429,9 @@ def get_all_progress(db: Session) -> list[dict]: annotations = ( db.query(Annotation) - .join(Comment, Annotation.comment_id == Comment.id) - .join( - DatasetEntry, - (DatasetEntry.author_channel_id == Comment.author_channel_id) - & (DatasetEntry.dataset_id == ds.id), - ) + .join(DatasetEntry, Annotation.dataset_entry_id == DatasetEntry.id) .filter( - Comment.collection_id == ds.collection_id, + DatasetEntry.dataset_id == ds.id, Annotation.annotator_id == annotator.id, ) .all() @@ -460,11 +447,11 @@ def get_all_progress(db: Session) -> list[dict]: "annotator_name": annotator.name, "dataset_id": ds.id, "dataset_name": ds.name, - "total_comments": total_comments, + "total_users": total_users, "annotated": annotated, "bots": bots, "humans": humans, - "percent_complete": round(annotated / total_comments * 100, 1), + "percent_complete": round(annotated / total_users * 100, 1), } ) @@ -485,23 +472,22 @@ def import_annotations( errors = [] for item in annotations: - comment = db.query(Comment).filter(Comment.id == item.comment_db_id).first() - if comment is None: + entry = db.query(DatasetEntry).filter(DatasetEntry.id == item.entry_id).first() + if entry is None: skipped += 1 - errors.append(f"Comentário {item.comment_db_id} não encontrado.") + errors.append(f"Entrada {item.entry_id} não encontrada.") continue if item.label == "bot" and not (item.justificativa or "").strip(): skipped += 1 errors.append( - f"Comentário {item.comment_db_id}: " - "justificativa obrigatória para 'bot'." + f"Entrada {item.entry_id}: " "justificativa obrigatória para 'bot'." ) continue existing = ( db.query(Annotation) - .filter_by(comment_id=item.comment_db_id, annotator_id=annotator_id) + .filter_by(dataset_entry_id=item.entry_id, annotator_id=annotator_id) .first() ) @@ -512,7 +498,7 @@ def import_annotations( updated += 1 else: annotation = Annotation( - comment_id=item.comment_db_id, + dataset_entry_id=item.entry_id, annotator_id=annotator_id, label=item.label, justificativa=item.justificativa, @@ -563,22 +549,12 @@ def export_annotations_json( """Gerador de JSON streaming com anotações do pesquisador.""" query = ( db.query(Annotation) - .join(Comment, Annotation.comment_id == Comment.id) + .join(DatasetEntry, Annotation.dataset_entry_id == DatasetEntry.id) .filter(Annotation.annotator_id == annotator_id) ) if dataset_id: - query = ( - query.join( - DatasetEntry, - (DatasetEntry.author_channel_id == Comment.author_channel_id) - & (DatasetEntry.dataset_id == dataset_id), - ) - .join(Dataset, Dataset.id == DatasetEntry.dataset_id) - .filter( - Comment.collection_id == Dataset.collection_id, - ) - ) + query = query.filter(DatasetEntry.dataset_id == dataset_id) # Metadados do dataset se filtrado meta = {} @@ -606,9 +582,9 @@ def export_annotations_json( prefix = " " if first else ",\n " first = False item = { - "comment_db_id": str(ann.comment_id), - "author_channel_id": ann.comment.author_channel_id, - "text_original": ann.comment.text_original, + "entry_id": str(ann.dataset_entry_id), + "author_channel_id": ann.dataset_entry.author_channel_id, + "author_display_name": ann.dataset_entry.author_display_name, "label": ann.label, "justificativa": ann.justificativa, "annotated_at": ann.annotated_at.isoformat() + "Z" @@ -628,27 +604,24 @@ def export_annotations_csv( """Gerador de CSV streaming com anotações do pesquisador.""" query = ( db.query(Annotation) - .join(Comment, Annotation.comment_id == Comment.id) + .join(DatasetEntry, Annotation.dataset_entry_id == DatasetEntry.id) .filter(Annotation.annotator_id == annotator_id) ) if dataset_id: - query = ( - query.join( - DatasetEntry, - (DatasetEntry.author_channel_id == Comment.author_channel_id) - & (DatasetEntry.dataset_id == dataset_id), - ) - .join(Dataset, Dataset.id == DatasetEntry.dataset_id) - .filter( - Comment.collection_id == Dataset.collection_id, - ) - ) + query = query.filter(DatasetEntry.dataset_id == dataset_id) - yield "comment_db_id,label,justificativa\n" + yield "entry_id,author_channel_id,label,justificativa\n" for ann in query.yield_per(500): buf = io.StringIO() writer = csv.writer(buf) - writer.writerow([str(ann.comment_id), ann.label, ann.justificativa or ""]) + writer.writerow( + [ + str(ann.dataset_entry_id), + ann.dataset_entry.author_channel_id, + ann.label, + ann.justificativa or "", + ] + ) yield buf.getvalue() diff --git a/backend/services/dashboard.py b/backend/services/dashboard.py index b914611..3c3ab9f 100644 --- a/backend/services/dashboard.py +++ b/backend/services/dashboard.py @@ -1,8 +1,7 @@ """Serviço da US-06 — Dashboard de Análise. Agregações SQL + geração de gráficos Plotly (JSON). -Regra: nunca carregar registros em Python para calcular — -usar func.count, func.avg, GROUP BY no SQLAlchemy. +Unidade de análise: DatasetEntry (autor/canal do YouTube). """ import logging @@ -30,7 +29,6 @@ "sky": "#0ea5e9", } -# Layout base compartilhado por todos os gráficos _BASE_LAYOUT = { "font": {"family": "Inter, system-ui, sans-serif", "size": 12}, "paper_bgcolor": "rgba(0,0,0,0)", @@ -60,14 +58,13 @@ # ═══════════════════════════════════════════════════════════════════ -# Helpers — batch loading reutilizável +# Helpers — batch loading por entry (usuário) # ═══════════════════════════════════════════════════════════════════ def _get_datasets_filtered( db: Session, criteria: list[str] | None, video_id: str | None = None ) -> list[Dataset]: - """Retorna datasets, opcionalmente filtrados por critério e/ou vídeo.""" q = db.query(Dataset) if video_id: q = q.join(Collection, Dataset.collection_id == Collection.id).filter( @@ -84,119 +81,76 @@ def _get_datasets_filtered( return datasets -def _get_comment_ids_for_datasets( +def _get_entry_ids_for_datasets( db: Session, datasets: list[Dataset] -) -> tuple[ - dict[uuid.UUID, list[uuid.UUID]], - dict[uuid.UUID, tuple[uuid.UUID, str]], -]: - """Retorna (ds_id → [comment_ids], comment_id → (collection_id, author_channel_id)). - - Batch loading sem N+1. - """ +) -> dict[uuid.UUID, list[uuid.UUID]]: + """Retorna ds_id → [entry_ids].""" if not datasets: - return {}, {} + return {} ds_ids = [ds.id for ds in datasets] - - # entries → authors por dataset all_entries = ( - db.query(DatasetEntry.dataset_id, DatasetEntry.author_channel_id) + db.query(DatasetEntry.dataset_id, DatasetEntry.id) .filter(DatasetEntry.dataset_id.in_(ds_ids)) .all() ) - authors_by_ds: dict[uuid.UUID, list[str]] = defaultdict(list) - for dataset_id, author_channel_id in all_entries: - authors_by_ds[dataset_id].append(author_channel_id) - - # pares (collection_id, author_ids) para buscar comentários - ds_col_map = {ds.id: ds.collection_id for ds in datasets} - col_authors: dict[uuid.UUID, set[str]] = defaultdict(set) - for ds in datasets: - for author_id in authors_by_ds.get(ds.id, []): - col_authors[ds.collection_id].add(author_id) - - # batch: todos os comentários relevantes - comment_info: dict[uuid.UUID, tuple[uuid.UUID, str]] = {} - comments_by_col_author: dict[tuple[uuid.UUID, str], list[uuid.UUID]] = defaultdict( - list - ) - for col_id, author_set in col_authors.items(): - if not author_set: - continue - rows = ( - db.query(Comment.id, Comment.collection_id, Comment.author_channel_id) - .filter( - Comment.collection_id == col_id, - Comment.author_channel_id.in_(list(author_set)), - ) - .all() - ) - for cid, c_col_id, c_author_id in rows: - comment_info[cid] = (c_col_id, c_author_id) - comments_by_col_author[(c_col_id, c_author_id)].append(cid) + ds_entry_ids: dict[uuid.UUID, list[uuid.UUID]] = defaultdict(list) + for dataset_id, entry_id in all_entries: + ds_entry_ids[dataset_id].append(entry_id) - # montar comment_ids por dataset - ds_comment_ids: dict[uuid.UUID, list[uuid.UUID]] = {} - for ds in datasets: - ids: list[uuid.UUID] = [] - for author_id in authors_by_ds.get(ds.id, []): - ids.extend(comments_by_col_author.get((ds_col_map[ds.id], author_id), [])) - ds_comment_ids[ds.id] = ids - - return ds_comment_ids, comment_info + return ds_entry_ids def _get_annotations_and_conflicts( - db: Session, all_comment_ids: list[uuid.UUID] + db: Session, all_entry_ids: list[uuid.UUID] ) -> tuple[ dict[uuid.UUID, list[tuple[uuid.UUID, str]]], dict[uuid.UUID, tuple[str, str | None]], ]: - """Retorna anotações e conflitos por comment_id.""" - anns_by_comment: dict[uuid.UUID, list[tuple[uuid.UUID, str]]] = defaultdict(list) + """Retorna anotações e conflitos por entry_id.""" + anns_by_entry: dict[uuid.UUID, list[tuple[uuid.UUID, str]]] = defaultdict(list) conflict_map: dict[uuid.UUID, tuple[str, str | None]] = {} - if not all_comment_ids: - return anns_by_comment, conflict_map + if not all_entry_ids: + return anns_by_entry, conflict_map all_annotations = ( - db.query(Annotation.comment_id, Annotation.annotator_id, Annotation.label) - .filter(Annotation.comment_id.in_(all_comment_ids)) + db.query(Annotation.dataset_entry_id, Annotation.annotator_id, Annotation.label) + .filter(Annotation.dataset_entry_id.in_(all_entry_ids)) .all() ) - for comment_id, annotator_id, label in all_annotations: - anns_by_comment[comment_id].append((annotator_id, label)) + for entry_id, annotator_id, label in all_annotations: + anns_by_entry[entry_id].append((annotator_id, label)) all_conflicts = ( db.query( - AnnotationConflict.comment_id, + AnnotationConflict.dataset_entry_id, AnnotationConflict.status, AnnotationConflict.resolved_label, ) - .filter(AnnotationConflict.comment_id.in_(all_comment_ids)) + .filter(AnnotationConflict.dataset_entry_id.in_(all_entry_ids)) .all() ) - for comment_id, status, resolved_label in all_conflicts: - conflict_map[comment_id] = (status, resolved_label) + for entry_id, conflict_status, resolved_label in all_conflicts: + conflict_map[entry_id] = (conflict_status, resolved_label) - return anns_by_comment, conflict_map + return anns_by_entry, conflict_map -def _classify_comment( - cid: uuid.UUID, - anns_by_comment: dict[uuid.UUID, list[tuple[uuid.UUID, str]]], +def _classify_entry( + eid: uuid.UUID, + anns_by_entry: dict[uuid.UUID, list[tuple[uuid.UUID, str]]], conflict_map: dict[uuid.UUID, tuple[str, str | None]], ) -> str | None: - """Classifica um comentário: 'bot', 'humano', 'conflito' ou None (sem anotação).""" - anns = anns_by_comment.get(cid, []) + """Classifica um entry: 'bot', 'humano', 'conflito' ou None.""" + anns = anns_by_entry.get(eid, []) if not anns: return None - if cid in conflict_map: - status, resolved_label = conflict_map[cid] - if status == "resolved" and resolved_label: + if eid in conflict_map: + entry_status, resolved_label = conflict_map[eid] + if entry_status == "resolved" and resolved_label: return resolved_label return "conflito" @@ -207,14 +161,14 @@ def _classify_comment( def _compute_agreement_rate( - comment_ids: list[uuid.UUID], - anns_by_comment: dict[uuid.UUID, list[tuple[uuid.UUID, str]]], + entry_ids: list[uuid.UUID], + anns_by_entry: dict[uuid.UUID, list[tuple[uuid.UUID, str]]], ) -> float: """Agreement rate = consenso / total com 2+ anotações.""" with_two = 0 consensus = 0 - for cid in comment_ids: - anns = anns_by_comment.get(cid, []) + for eid in entry_ids: + anns = anns_by_entry.get(eid, []) if len(anns) >= 2: with_two += 1 labels = {label for _, label in anns} @@ -231,7 +185,6 @@ def _compute_agreement_rate( def _layout(**overrides) -> dict: - """Mescla layout base com overrides específicos do gráfico.""" layout = {**_BASE_LAYOUT} for key, val in overrides.items(): if isinstance(val, dict) and key in layout and isinstance(layout[key], dict): @@ -256,9 +209,7 @@ def _make_donut_chart(bots: int, humans: int, conflicts: int) -> str: textinfo="label+percent", textfont_size=12, hovertemplate=( - "%{label}
" - "%{value} comentários (%{percent})" - "" + "%{label}
" "%{value} usuários (%{percent})" "" ), pull=[0, 0.03, 0], ) @@ -320,7 +271,7 @@ def _make_comparativo_chart(datasets_data: list[dict]) -> str: }, yaxis={ "gridcolor": "#f1f5f9", - "title": {"text": "Comentários", "font": {"size": 11}}, + "title": {"text": "Usuários", "font": {"size": 11}}, }, ) ) @@ -411,7 +362,6 @@ def _make_bot_rate_chart(datasets_data: list[dict], orientation: str = "h") -> s def _make_criteria_effectiveness_chart(data: list[dict]) -> str: - """Bar horizontal simples: taxa de bots (%) por critério.""" criterios = [d["criteria"].capitalize() for d in data] rates = [round(d["bot_rate"] * 100, 1) for d in data] colors = [COLORS["bot"] if r > 10 else COLORS["teal"] for r in rates] @@ -446,10 +396,7 @@ def _make_criteria_effectiveness_chart(data: list[dict]) -> str: return pio.to_json(fig, validate=False) -def _make_agreement_by_dataset_chart( - datasets_data: list[dict], -) -> str: - """Bar horizontal: concordância (%) por dataset.""" +def _make_agreement_by_dataset_chart(datasets_data: list[dict]) -> str: names = [d["name"] for d in datasets_data] rates = [d["agreement_rate"] for d in datasets_data] colors = [ @@ -560,24 +507,19 @@ def _make_user_progress_chart(datasets_data: list[dict]) -> str: def get_global_dashboard(db: Session, criteria: list[str] | None = None) -> dict: """Seção 1 — Visão Geral.""" datasets = _get_datasets_filtered(db, criteria) - ds_comment_ids, comment_info = _get_comment_ids_for_datasets(db, datasets) - - all_cids = [] - for cids in ds_comment_ids.values(): - all_cids.extend(cids) - all_cids = list(set(all_cids)) + ds_entry_ids = _get_entry_ids_for_datasets(db, datasets) - anns_by_comment, conflict_map = _get_annotations_and_conflicts(db, all_cids) + all_eids = list({eid for eids in ds_entry_ids.values() for eid in eids}) + anns_by_entry, conflict_map = _get_annotations_and_conflicts(db, all_eids) - # KPIs total_bots = 0 total_humans = 0 total_conflicts = 0 pending_conflicts = 0 - annotated_cids: set[uuid.UUID] = set() + annotated_eids: set[uuid.UUID] = set() - for cid in all_cids: - classification = _classify_comment(cid, anns_by_comment, conflict_map) + for eid in all_eids: + classification = _classify_entry(eid, anns_by_entry, conflict_map) if classification == "bot": total_bots += 1 elif classification == "humano": @@ -585,37 +527,34 @@ def get_global_dashboard(db: Session, criteria: list[str] | None = None) -> dict elif classification == "conflito": total_conflicts += 1 - if anns_by_comment.get(cid): - annotated_cids.add(cid) + if anns_by_entry.get(eid): + annotated_eids.add(eid) - # conflitos totais e pendentes - for cid in all_cids: - if cid in conflict_map: - status, _ = conflict_map[cid] - if status == "pending": + for eid in all_eids: + if eid in conflict_map: + entry_status, _ = conflict_map[eid] + if entry_status == "pending": pending_conflicts += 1 - total_all_conflicts = sum(1 for cid in all_cids if cid in conflict_map) - agreement_rate = _compute_agreement_rate(all_cids, anns_by_comment) + total_all_conflicts = sum(1 for eid in all_eids if eid in conflict_map) + agreement_rate = _compute_agreement_rate(all_eids, anns_by_entry) - # Dados por dataset para gráficos datasets_chart_data = [] for ds in datasets: - cids = ds_comment_ids.get(ds.id, []) - bots = humans = conflicts = 0 - annotated = 0 - for cid in cids: - cl = _classify_comment(cid, anns_by_comment, conflict_map) + eids = ds_entry_ids.get(ds.id, []) + bots = humans = conflicts = annotated = 0 + for eid in eids: + cl = _classify_entry(eid, anns_by_entry, conflict_map) if cl == "bot": bots += 1 elif cl == "humano": humans += 1 elif cl == "conflito": conflicts += 1 - if anns_by_comment.get(cid): + if anns_by_entry.get(eid): annotated += 1 bot_rate = (bots / annotated * 100) if annotated > 0 else 0.0 - ds_agreement = _compute_agreement_rate(cids, anns_by_comment) + ds_agreement = _compute_agreement_rate(eids, anns_by_entry) datasets_chart_data.append( { "name": ds.name, @@ -627,17 +566,14 @@ def get_global_dashboard(db: Session, criteria: list[str] | None = None) -> dict } ) - # Timeline de anotações (agrupado por dia) - annotation_buckets = _get_annotation_timeline(db, all_cids) + annotation_buckets = _get_annotation_timeline(db, all_eids) - # Eficácia por critério criteria_data = _compute_criteria_effectiveness( - db, datasets, ds_comment_ids, anns_by_comment, conflict_map + db, datasets, ds_entry_ids, anns_by_entry, conflict_map ) - # Progresso geral - total_in_datasets = len(all_cids) - total_annotated_count = len(annotated_cids) + total_in_datasets = len(all_eids) + total_annotated_count = len(annotated_eids) annotation_progress = ( round(total_annotated_count / total_in_datasets * 100, 1) if total_in_datasets > 0 @@ -647,8 +583,8 @@ def get_global_dashboard(db: Session, criteria: list[str] | None = None) -> dict return { "summary": { "total_datasets": len(datasets), - "total_comments_annotated": total_annotated_count, - "total_comments_in_datasets": total_in_datasets, + "total_users_annotated": total_annotated_count, + "total_users_in_datasets": total_in_datasets, "annotation_progress": annotation_progress, "total_bots": total_bots, "total_humans": total_humans, @@ -676,7 +612,6 @@ def get_video_dashboard( db: Session, video_id: str, criteria: list[str] | None = None ) -> dict: """Seção 2 — Por Vídeo.""" - # Total de comentários coletados para este vídeo total_collected = ( db.query(func.count(Comment.id)) .join(Collection) @@ -685,10 +620,10 @@ def get_video_dashboard( ) or 0 datasets = _get_datasets_filtered(db, criteria, video_id=video_id) - ds_comment_ids, comment_info = _get_comment_ids_for_datasets(db, datasets) + ds_entry_ids = _get_entry_ids_for_datasets(db, datasets) - all_cids = list({cid for cids in ds_comment_ids.values() for cid in cids}) - anns_by_comment, conflict_map = _get_annotations_and_conflicts(db, all_cids) + all_eids = list({eid for eids in ds_entry_ids.values() for eid in eids}) + anns_by_entry, conflict_map = _get_annotations_and_conflicts(db, all_eids) total_bots = 0 total_humans = 0 @@ -696,36 +631,35 @@ def get_video_dashboard( pending_conflicts = 0 annotated_count = 0 - for cid in all_cids: - cl = _classify_comment(cid, anns_by_comment, conflict_map) + for eid in all_eids: + cl = _classify_entry(eid, anns_by_entry, conflict_map) if cl == "bot": total_bots += 1 elif cl == "humano": total_humans += 1 elif cl == "conflito": total_conflicts += 1 - if anns_by_comment.get(cid): + if anns_by_entry.get(eid): annotated_count += 1 - if cid in conflict_map and conflict_map[cid][0] == "pending": + if eid in conflict_map and conflict_map[eid][0] == "pending": pending_conflicts += 1 - all_conflicts_count = sum(1 for cid in all_cids if cid in conflict_map) - agreement_rate = _compute_agreement_rate(all_cids, anns_by_comment) + all_conflicts_count = sum(1 for eid in all_eids if eid in conflict_map) + agreement_rate = _compute_agreement_rate(all_eids, anns_by_entry) - # Dados por dataset datasets_chart_data = [] for ds in datasets: - cids = ds_comment_ids.get(ds.id, []) + eids = ds_entry_ids.get(ds.id, []) bots = humans = conflicts = annotated = 0 - for cid in cids: - cl = _classify_comment(cid, anns_by_comment, conflict_map) + for eid in eids: + cl = _classify_entry(eid, anns_by_entry, conflict_map) if cl == "bot": bots += 1 elif cl == "humano": humans += 1 elif cl == "conflito": conflicts += 1 - if anns_by_comment.get(cid): + if anns_by_entry.get(eid): annotated += 1 bot_rate = (bots / annotated * 100) if annotated > 0 else 0.0 datasets_chart_data.append( @@ -738,22 +672,18 @@ def get_video_dashboard( } ) - # Taxa de bots por critério neste vídeo criteria_rates = _compute_bot_rate_by_criteria( - datasets, ds_comment_ids, anns_by_comment, conflict_map + datasets, ds_entry_ids, anns_by_entry, conflict_map ) - # Timeline de comentários postados comment_timeline = _get_comment_published_timeline(db, video_id) - - # Destaques do vídeo highlights = _compute_video_highlights(db, video_id) return { "video_id": video_id, "summary": { "total_comments_collected": total_collected, - "total_comments_in_datasets": len(all_cids), + "total_users_in_datasets": len(all_eids), "total_annotated": annotated_count, "total_bots": total_bots, "total_humans": total_humans, @@ -776,50 +706,49 @@ def get_video_dashboard( def get_user_dashboard(db: Session, user_id: uuid.UUID) -> dict: """Seção 3 — Meu Progresso.""" - # Todos os datasets (sem filtro — o usuário pode anotar em qualquer um) datasets = db.query(Dataset).order_by(Dataset.created_at.desc()).all() if not datasets: return _empty_user_response() - ds_comment_ids, comment_info = _get_comment_ids_for_datasets(db, datasets) - all_cids = list({cid for cids in ds_comment_ids.values() for cid in cids}) + ds_entry_ids = _get_entry_ids_for_datasets(db, datasets) + all_eids = list({eid for eids in ds_entry_ids.values() for eid in eids}) - # Anotações do usuário autenticado + # Anotações do usuário autenticado (por entry) my_annotations: dict[uuid.UUID, str] = {} - if all_cids: + if all_eids: rows = ( - db.query(Annotation.comment_id, Annotation.label) + db.query(Annotation.dataset_entry_id, Annotation.label) .filter( - Annotation.comment_id.in_(all_cids), + Annotation.dataset_entry_id.in_(all_eids), Annotation.annotator_id == user_id, ) .all() ) - my_annotations = {cid: label for cid, label in rows} + my_annotations = {eid: label for eid, label in rows} # Conflitos gerados pelo usuário - my_conflict_cids: set[uuid.UUID] = set() - if all_cids: + my_conflict_eids: set[uuid.UUID] = set() + if all_eids: conflict_rows = ( - db.query(AnnotationConflict.comment_id) + db.query(AnnotationConflict.dataset_entry_id) .join(Annotation, AnnotationConflict.annotation_a_id == Annotation.id) .filter( - AnnotationConflict.comment_id.in_(all_cids), + AnnotationConflict.dataset_entry_id.in_(all_eids), Annotation.annotator_id == user_id, ) .all() ) - my_conflict_cids.update(r[0] for r in conflict_rows) + my_conflict_eids.update(r[0] for r in conflict_rows) conflict_rows_b = ( - db.query(AnnotationConflict.comment_id) + db.query(AnnotationConflict.dataset_entry_id) .join(Annotation, AnnotationConflict.annotation_b_id == Annotation.id) .filter( - AnnotationConflict.comment_id.in_(all_cids), + AnnotationConflict.dataset_entry_id.in_(all_eids), Annotation.annotator_id == user_id, ) .all() ) - my_conflict_cids.update(r[0] for r in conflict_rows_b) + my_conflict_eids.update(r[0] for r in conflict_rows_b) # collection_id por dataset ds_col_map = {ds.id: ds.collection_id for ds in datasets} @@ -833,7 +762,6 @@ def get_user_dashboard(db: Session, user_id: uuid.UUID) -> dict: ) col_video_map = {c_id: vid for c_id, vid in cols} - # Progresso por dataset ds_progress = [] total_annotated = 0 total_pending = 0 @@ -844,30 +772,28 @@ def get_user_dashboard(db: Session, user_id: uuid.UUID) -> dict: datasets_with_data = 0 for ds in datasets: - cids = ds_comment_ids.get(ds.id, []) - if not cids: + eids = ds_entry_ids.get(ds.id, []) + if not eids: continue - annotated_by_me = sum(1 for cid in cids if cid in my_annotations) - pending = len(cids) - annotated_by_me + annotated_by_me = sum(1 for eid in eids if eid in my_annotations) + pending = len(eids) - annotated_by_me - # Apenas contar o dataset se o usuário tem algo para anotar - # (todos os datasets são atribuídos a todos os anotadores) datasets_with_data += 1 - my_bots = sum(1 for cid in cids if my_annotations.get(cid) == "bot") - my_humans = sum(1 for cid in cids if my_annotations.get(cid) == "humano") - my_conflicts = sum(1 for cid in cids if cid in my_conflict_cids) + my_bots = sum(1 for eid in eids if my_annotations.get(eid) == "bot") + my_humans = sum(1 for eid in eids if my_annotations.get(eid) == "humano") + my_conflicts = sum(1 for eid in eids if eid in my_conflict_eids) - percent = round(annotated_by_me / len(cids) * 100, 1) if cids else 0.0 + percent = round(annotated_by_me / len(eids) * 100, 1) if eids else 0.0 - if annotated_by_me == len(cids): - status = "completed" + if annotated_by_me == len(eids): + ds_status = "completed" datasets_completed += 1 elif annotated_by_me > 0: - status = "in_progress" + ds_status = "in_progress" else: - status = "not_started" + ds_status = "not_started" total_annotated += annotated_by_me total_pending += pending @@ -880,22 +806,20 @@ def get_user_dashboard(db: Session, user_id: uuid.UUID) -> dict: "dataset_id": ds.id, "dataset_name": ds.name, "video_id": col_video_map.get(ds_col_map[ds.id], ""), - "total_comments": len(cids), + "total_users": len(eids), "annotated_by_me": annotated_by_me, "pending": pending, "percent_complete": percent, "my_bots": my_bots, "my_conflicts": my_conflicts, - "status": status, + "status": ds_status, } ) datasets_pending = datasets_with_data - datasets_completed - # Timeline de minhas anotações my_timeline = _get_user_annotation_timeline(db, user_id) - # Gráficos progress_chart_data = [ {"name": d["dataset_name"], "percent": d["percent_complete"]} for d in ds_progress @@ -921,7 +845,7 @@ def get_user_dashboard(db: Session, user_id: uuid.UUID) -> dict: } -def get_bot_comments( +def get_bot_users( db: Session, dataset_id: str | None = None, video_id: str | None = None, @@ -931,23 +855,17 @@ def get_bot_comments( page: int = 1, page_size: int = 20, ) -> dict: - """Tabela de comentários classificados como bot.""" + """Tabela de usuários classificados como bot.""" q = ( db.query( Dataset.name.label("dataset_name"), Dataset.id.label("dataset_id"), - Comment.author_display_name, - Comment.author_channel_id, - Comment.text_original, - Annotation.comment_id, + DatasetEntry.id.label("entry_id"), + DatasetEntry.author_display_name, + DatasetEntry.author_channel_id, ) .join(DatasetEntry, DatasetEntry.dataset_id == Dataset.id) - .join( - Comment, - (Comment.collection_id == Dataset.collection_id) - & (Comment.author_channel_id == DatasetEntry.author_channel_id), - ) - .join(Annotation, Annotation.comment_id == Comment.id) + .join(Annotation, Annotation.dataset_entry_id == DatasetEntry.id) .filter(Annotation.label == "bot") ) @@ -958,42 +876,40 @@ def get_bot_comments( Collection.video_id == video_id ) if author: - q = q.filter(Comment.author_display_name.ilike(f"%{author}%")) - if search: - q = q.filter(Comment.text_original.ilike(f"%{search}%")) + q = q.filter(DatasetEntry.author_display_name.ilike(f"%{author}%")) if criteria_filter: for crit in criteria_filter: q = q.filter(Dataset.criteria_applied.any(crit)) - q = q.distinct(Annotation.comment_id) + q = q.distinct(DatasetEntry.id) total = q.count() rows = ( - q.order_by(Annotation.comment_id) + q.order_by(DatasetEntry.id) .offset((page - 1) * page_size) .limit(page_size) .all() ) - comment_ids = [r.comment_id for r in rows] + entry_ids = [r.entry_id for r in rows] # Batch: conflitos conflict_status_map: dict[uuid.UUID, str] = {} - if comment_ids: + if entry_ids: conflicts = ( - db.query(AnnotationConflict.comment_id, AnnotationConflict.status) - .filter(AnnotationConflict.comment_id.in_(comment_ids)) + db.query(AnnotationConflict.dataset_entry_id, AnnotationConflict.status) + .filter(AnnotationConflict.dataset_entry_id.in_(entry_ids)) .all() ) - conflict_status_map = {cid: st for cid, st in conflicts} + conflict_status_map = {eid: st for eid, st in conflicts} # Batch: concordância + nº de anotadores concordance_map: dict[uuid.UUID, int] = {} annotators_map: dict[uuid.UUID, int] = {} - if comment_ids: + if entry_ids: ann_counts = ( db.query( - Annotation.comment_id, + Annotation.dataset_entry_id, func.count(Annotation.id).label("total"), func.count( case( @@ -1001,16 +917,16 @@ def get_bot_comments( ) ).label("bot_count"), ) - .filter(Annotation.comment_id.in_(comment_ids)) - .group_by(Annotation.comment_id) + .filter(Annotation.dataset_entry_id.in_(entry_ids)) + .group_by(Annotation.dataset_entry_id) .all() ) - for cid, total_anns, bot_count in ann_counts: - annotators_map[cid] = total_anns + for eid, total_anns, bot_count in ann_counts: + annotators_map[eid] = total_anns if total_anns > 0: - concordance_map[cid] = round(bot_count / total_anns * 100) + concordance_map[eid] = round(bot_count / total_anns * 100) - # Batch: critérios que flagaram cada autor (via DatasetEntry) + # Batch: critérios + comment_count ds_ids = list({r.dataset_id for r in rows}) author_ids = list({r.author_channel_id for r in rows if r.author_channel_id}) criteria_map: dict[str, list[str]] = {} @@ -1033,16 +949,37 @@ def get_bot_comments( if c not in criteria_map[aid]: criteria_map[aid].append(c) + # Comment counts + cc_map: dict[uuid.UUID, int] = {} + if entry_ids: + comment_counts = ( + db.query( + DatasetEntry.id, + func.count(Comment.id).label("cc"), + ) + .join(Dataset, Dataset.id == DatasetEntry.dataset_id) + .join( + Comment, + (Comment.collection_id == Dataset.collection_id) + & (Comment.author_channel_id == DatasetEntry.author_channel_id), + ) + .filter(DatasetEntry.id.in_(entry_ids)) + .group_by(DatasetEntry.id) + .all() + ) + cc_map = {eid: cc for eid, cc in comment_counts} + items = [] for row in rows: items.append( { "dataset_name": row.dataset_name, "author_display_name": row.author_display_name, - "text_original": row.text_original, - "concordance_pct": concordance_map.get(row.comment_id, 0), - "conflict_status": conflict_status_map.get(row.comment_id), - "annotators_count": annotators_map.get(row.comment_id, 0), + "author_channel_id": row.author_channel_id, + "comment_count": cc_map.get(row.entry_id, 0), + "concordance_pct": concordance_map.get(row.entry_id, 0), + "conflict_status": conflict_status_map.get(row.entry_id), + "annotators_count": annotators_map.get(row.entry_id, 0), "criteria": criteria_map.get(row.author_channel_id, []), } ) @@ -1051,17 +988,16 @@ def get_bot_comments( def get_criteria_effectiveness(db: Session, video_id: str | None = None) -> list[dict]: - """Eficácia de cada critério de limpeza.""" datasets = _get_datasets_filtered(db, criteria=None, video_id=video_id) if not datasets: return [] - ds_comment_ids, _ = _get_comment_ids_for_datasets(db, datasets) - all_cids = list({cid for cids in ds_comment_ids.values() for cid in cids}) - anns_by_comment, conflict_map = _get_annotations_and_conflicts(db, all_cids) + ds_entry_ids = _get_entry_ids_for_datasets(db, datasets) + all_eids = list({eid for eids in ds_entry_ids.values() for eid in eids}) + anns_by_entry, conflict_map = _get_annotations_and_conflicts(db, all_eids) return _compute_criteria_effectiveness( - db, datasets, ds_comment_ids, anns_by_comment, conflict_map + db, datasets, ds_entry_ids, anns_by_entry, conflict_map ) @@ -1070,16 +1006,15 @@ def get_criteria_effectiveness(db: Session, video_id: str | None = None) -> list # ═══════════════════════════════════════════════════════════════════ -def _get_annotation_timeline(db: Session, comment_ids: list[uuid.UUID]) -> list[dict]: - """Anotações agrupadas por dia.""" - if not comment_ids: +def _get_annotation_timeline(db: Session, entry_ids: list[uuid.UUID]) -> list[dict]: + if not entry_ids: return [] rows = ( db.query( cast(Annotation.annotated_at, Date).label("day"), func.count(Annotation.id), ) - .filter(Annotation.comment_id.in_(comment_ids)) + .filter(Annotation.dataset_entry_id.in_(entry_ids)) .group_by("day") .order_by("day") .all() @@ -1088,7 +1023,6 @@ def _get_annotation_timeline(db: Session, comment_ids: list[uuid.UUID]) -> list[ def _get_user_annotation_timeline(db: Session, user_id: uuid.UUID) -> list[dict]: - """Anotações do usuário agrupadas por dia.""" rows = ( db.query( cast(Annotation.annotated_at, Date).label("day"), @@ -1103,7 +1037,6 @@ def _get_user_annotation_timeline(db: Session, user_id: uuid.UUID) -> list[dict] def _get_comment_published_timeline(db: Session, video_id: str) -> list[dict]: - """Comentários publicados agrupados por dia para um vídeo.""" rows = ( db.query( cast(Comment.published_at, Date).label("day"), @@ -1120,24 +1053,23 @@ def _get_comment_published_timeline(db: Session, video_id: str) -> list[dict]: def _compute_bot_rate_by_criteria( datasets: list[Dataset], - ds_comment_ids: dict[uuid.UUID, list[uuid.UUID]], - anns_by_comment: dict[uuid.UUID, list[tuple[uuid.UUID, str]]], + ds_entry_ids: dict[uuid.UUID, list[uuid.UUID]], + anns_by_entry: dict[uuid.UUID, list[tuple[uuid.UUID, str]]], conflict_map: dict[uuid.UUID, tuple[str, str | None]], ) -> list[dict]: - """Taxa de bots agrupada por critério para gráfico horizontal.""" criteria_stats: dict[str, dict] = {} for ds in datasets: - cids = ds_comment_ids.get(ds.id, []) - if not cids: + eids = ds_entry_ids.get(ds.id, []) + if not eids: continue bots = sum( 1 - for cid in cids - if _classify_comment(cid, anns_by_comment, conflict_map) == "bot" + for eid in eids + if _classify_entry(eid, anns_by_entry, conflict_map) == "bot" ) - annotated = sum(1 for cid in cids if anns_by_comment.get(cid)) + annotated = sum(1 for eid in eids if anns_by_entry.get(eid)) for crit in ds.criteria_applied or []: if crit not in criteria_stats: @@ -1157,32 +1089,30 @@ def _compute_bot_rate_by_criteria( def _compute_criteria_effectiveness( db: Session, datasets: list[Dataset], - ds_comment_ids: dict[uuid.UUID, list[uuid.UUID]], - anns_by_comment: dict[uuid.UUID, list[tuple[uuid.UUID, str]]], + ds_entry_ids: dict[uuid.UUID, list[uuid.UUID]], + anns_by_entry: dict[uuid.UUID, list[tuple[uuid.UUID, str]]], conflict_map: dict[uuid.UUID, tuple[str, str | None]], ) -> list[dict]: - """Calcula eficácia de cada critério: datasets, comentários, bots, taxa.""" criteria_stats: dict[str, dict] = {} for ds in datasets: - cids = ds_comment_ids.get(ds.id, []) + eids = ds_entry_ids.get(ds.id, []) bots = sum( 1 - for cid in cids - if _classify_comment(cid, anns_by_comment, conflict_map) == "bot" + for eid in eids + if _classify_entry(eid, anns_by_entry, conflict_map) == "bot" ) for crit in ds.criteria_applied or []: if crit not in criteria_stats: criteria_stats[crit] = { "total_datasets": 0, - "total_comments_selected": 0, + "total_users_selected": 0, "total_bots": 0, } criteria_stats[crit]["total_datasets"] += 1 - criteria_stats[crit]["total_comments_selected"] += len(cids) + criteria_stats[crit]["total_users_selected"] += len(eids) criteria_stats[crit]["total_bots"] += bots - # Ordenar: numéricos primeiro, depois comportamentais ordered_criteria = [ "percentil", "media", @@ -1200,8 +1130,8 @@ def _compute_criteria_effectiveness( continue stats = criteria_stats[crit] bot_rate = ( - stats["total_bots"] / stats["total_comments_selected"] - if stats["total_comments_selected"] > 0 + stats["total_bots"] / stats["total_users_selected"] + if stats["total_users_selected"] > 0 else 0.0 ) result.append( @@ -1209,7 +1139,7 @@ def _compute_criteria_effectiveness( "criteria": crit, "group": CRITERIA_GROUPS.get(crit, "outro"), "total_datasets": stats["total_datasets"], - "total_comments_selected": stats["total_comments_selected"], + "total_users_selected": stats["total_users_selected"], "total_bots": stats["total_bots"], "bot_rate": round(bot_rate, 4), } @@ -1218,12 +1148,9 @@ def _compute_criteria_effectiveness( def _compute_video_highlights(db: Session, video_id: str) -> list[dict]: - """Destaques estatísticos do vídeo baseados nos comentários coletados.""" base = db.query(Comment).join(Collection).filter(Collection.video_id == video_id) - highlights: list[dict] = [] - # 1. Autor com mais comentários top_author = ( db.query( Comment.author_display_name, @@ -1244,7 +1171,6 @@ def _compute_video_highlights(db: Session, video_id: str) -> list[dict]: } ) - # 2. Comentário com mais respostas top_replies = base.order_by(Comment.reply_count.desc()).first() if top_replies and top_replies.reply_count > 0: text = top_replies.text_original @@ -1257,7 +1183,6 @@ def _compute_video_highlights(db: Session, video_id: str) -> list[dict]: } ) - # 3. Comentário com mais likes top_likes = base.order_by(Comment.like_count.desc()).first() if top_likes and top_likes.like_count > 0: text = top_likes.text_original @@ -1270,7 +1195,6 @@ def _compute_video_highlights(db: Session, video_id: str) -> list[dict]: } ) - # 4. Conta mais nova (canal criado mais recentemente) newest = ( base.filter(Comment.author_channel_published_at.isnot(None)) .order_by(Comment.author_channel_published_at.desc()) @@ -1286,7 +1210,6 @@ def _compute_video_highlights(db: Session, video_id: str) -> list[dict]: } ) - # 5. Conta mais antiga oldest = ( base.filter(Comment.author_channel_published_at.isnot(None)) .order_by(Comment.author_channel_published_at.asc()) @@ -1306,7 +1229,6 @@ def _compute_video_highlights(db: Session, video_id: str) -> list[dict]: } ) - # 6. Média de likes por comentário avg_likes = ( db.query(func.avg(Comment.like_count)) .join(Collection) @@ -1326,7 +1248,6 @@ def _compute_video_highlights(db: Session, video_id: str) -> list[dict]: def _empty_user_response() -> dict: - """Resposta vazia para quando não há datasets.""" return { "summary": { "total_datasets_assigned": 0, diff --git a/backend/services/data.py b/backend/services/data.py index 3c34f89..912d34f 100644 --- a/backend/services/data.py +++ b/backend/services/data.py @@ -198,10 +198,9 @@ def list_all_datasets(db: Session) -> list[dict]: def get_annotation_progress(db: Session) -> list[dict]: - """Progresso de anotação por dataset. + """Progresso de anotação por dataset (unidade: entry/usuário). - Otimizado: número constante de queries independente do número de datasets - e comentários. Sem loops N+1. + Otimizado: número constante de queries independente do número de datasets. """ datasets = db.query(Dataset).order_by(Dataset.created_at.desc()).all() @@ -210,172 +209,104 @@ def get_annotation_progress(db: Session) -> list[dict]: ds_ids = [ds.id for ds in datasets] - # 1) Batch: entries → authors por dataset + # 1) Batch: entries por dataset all_entries = ( - db.query(DatasetEntry.dataset_id, DatasetEntry.author_channel_id) + db.query(DatasetEntry.dataset_id, DatasetEntry.id) .filter(DatasetEntry.dataset_id.in_(ds_ids)) .all() ) - authors_by_ds: dict[uuid.UUID, list[str]] = defaultdict(list) - for dataset_id, author_channel_id in all_entries: - authors_by_ds[dataset_id].append(author_channel_id) - - # 2) Batch: todos os comentários relevantes (id, collection_id, author_channel_id) - # Monta pares (collection_id, [author_ids]) para buscar de uma vez - col_authors: dict[uuid.UUID, set[str]] = defaultdict(set) - ds_col_map = {ds.id: ds.collection_id for ds in datasets} - for ds in datasets: - for author_id in authors_by_ds.get(ds.id, []): - col_authors[ds.collection_id].add(author_id) - - # Buscar todos os comentários necessários em batch por collection - # comment_id → (collection_id, author_channel_id) - comment_info: dict[uuid.UUID, tuple[uuid.UUID, str]] = {} - # (collection_id, author_channel_id) → [comment_ids] - comments_by_col_author: dict[tuple[uuid.UUID, str], list[uuid.UUID]] = defaultdict( - list - ) + entries_by_ds: dict[uuid.UUID, list[uuid.UUID]] = defaultdict(list) + for dataset_id, entry_id in all_entries: + entries_by_ds[dataset_id].append(entry_id) - for col_id, author_set in col_authors.items(): - if not author_set: - continue - rows = ( - db.query(Comment.id, Comment.collection_id, Comment.author_channel_id) - .filter( - Comment.collection_id == col_id, - Comment.author_channel_id.in_(list(author_set)), - ) - .all() - ) - for cid, c_col_id, c_author_id in rows: - comment_info[cid] = (c_col_id, c_author_id) - comments_by_col_author[(c_col_id, c_author_id)].append(cid) + all_entry_ids = [eid for eids in entries_by_ds.values() for eid in eids] - # Coletar todos os comment_ids relevantes - all_comment_ids = list(comment_info.keys()) - - # 3) Batch: anotações (comment_id, annotator_id, label) - all_annotations: list[tuple[uuid.UUID, uuid.UUID, str]] = [] - if all_comment_ids: + # 2) Batch: anotações (entry_id, annotator_id, label) + anns_by_entry: dict[uuid.UUID, list[tuple[uuid.UUID, str]]] = defaultdict(list) + if all_entry_ids: all_annotations = ( db.query( - Annotation.comment_id, + Annotation.dataset_entry_id, Annotation.annotator_id, Annotation.label, ) - .filter(Annotation.comment_id.in_(all_comment_ids)) + .filter(Annotation.dataset_entry_id.in_(all_entry_ids)) .all() ) + for entry_id, annotator_id, label in all_annotations: + anns_by_entry[entry_id].append((annotator_id, label)) - # Agrupar: comment_id → [(annotator_id, label)] - anns_by_comment: dict[uuid.UUID, list[tuple[uuid.UUID, str]]] = defaultdict(list) - for comment_id, annotator_id, label in all_annotations: - anns_by_comment[comment_id].append((annotator_id, label)) - - # 4) Batch: conflitos (comment_id, status, resolved_label) - all_conflicts: list[tuple[uuid.UUID, str, str | None]] = [] - if all_comment_ids: + # 3) Batch: conflitos (entry_id, status, resolved_label) + conflict_map: dict[uuid.UUID, tuple[str, str | None]] = {} + if all_entry_ids: all_conflicts = ( db.query( - AnnotationConflict.comment_id, + AnnotationConflict.dataset_entry_id, AnnotationConflict.status, AnnotationConflict.resolved_label, ) - .filter(AnnotationConflict.comment_id.in_(all_comment_ids)) + .filter(AnnotationConflict.dataset_entry_id.in_(all_entry_ids)) .all() ) + for entry_id, conflict_status, resolved_label in all_conflicts: + conflict_map[entry_id] = (conflict_status, resolved_label) - # conflict_comment_id → (status, resolved_label) - conflict_map: dict[uuid.UUID, tuple[str, str | None]] = {} - for comment_id, status, resolved_label in all_conflicts: - conflict_map[comment_id] = (status, resolved_label) - - # 5) Montar resultados por dataset — tudo em Python, sem mais queries + # 4) Montar resultados por dataset results = [] for ds in datasets: - ds_authors = authors_by_ds.get(ds.id, []) + ds_eids = entries_by_ds.get(ds.id, []) + total_users = len(ds_eids) - if not ds_authors: + if not ds_eids: results.append( { "dataset_id": ds.id, "dataset_name": ds.name, - "total": 0, - "annotated": 0, - "pending": 0, + "total_users": 0, + "annotated_users": 0, + "pending_users": 0, "conflicts": 0, "conflicts_resolved": 0, "annotators_count": 0, - "bots_users": 0, - "bots_comments": 0, + "bots": 0, } ) continue - # Comentários deste dataset - ds_comment_ids: list[uuid.UUID] = [] - for author_id in ds_authors: - ds_comment_ids.extend( - comments_by_col_author.get((ds_col_map[ds.id], author_id), []) - ) - - total = len(ds_comment_ids) - - # Anotados: comentários com pelo menos uma anotação - annotated_set: set[uuid.UUID] = set() + annotated_users = 0 annotator_ids: set[uuid.UUID] = set() - for cid in ds_comment_ids: - anns = anns_by_comment.get(cid) + conflicts = 0 + conflicts_resolved = 0 + bots = 0 + + for eid in ds_eids: + anns = anns_by_entry.get(eid, []) if anns: - annotated_set.add(cid) + annotated_users += 1 for annotator_id, _ in anns: annotator_ids.add(annotator_id) - annotated = len(annotated_set) - - # Conflitos - conflicts = 0 - conflicts_resolved = 0 - for cid in ds_comment_ids: - if cid in conflict_map: + if eid in conflict_map: conflicts += 1 - if conflict_map[cid][0] == "resolved": + c_status, c_resolved = conflict_map[eid] + if c_status == "resolved": conflicts_resolved += 1 - - # Bots: consenso "bot" (sem conflito) OU conflito resolvido como "bot" - bots_comments = 0 - bot_author_ids: set[str] = set() - for cid in ds_comment_ids: - anns = anns_by_comment.get(cid, []) - if not anns: - continue - - if cid in conflict_map: - # Tem conflito — só conta se resolvido como bot - status, resolved_label = conflict_map[cid] - if status == "resolved" and resolved_label == "bot": - bots_comments += 1 - _, author_id = comment_info[cid] - bot_author_ids.add(author_id) - else: - # Sem conflito — consenso - if all(label == "bot" for _, label in anns): - bots_comments += 1 - _, author_id = comment_info[cid] - bot_author_ids.add(author_id) + if c_resolved == "bot": + bots += 1 + elif anns and all(label == "bot" for _, label in anns): + bots += 1 results.append( { "dataset_id": ds.id, "dataset_name": ds.name, - "total": total, - "annotated": annotated, - "pending": total - annotated, + "total_users": total_users, + "annotated_users": annotated_users, + "pending_users": total_users - annotated_users, "conflicts": conflicts, "conflicts_resolved": conflicts_resolved, "annotators_count": len(annotator_ids), - "bots_users": len(bot_author_ids), - "bots_comments": bots_comments, + "bots": bots, } ) diff --git a/backend/services/review.py b/backend/services/review.py index 0a53080..1709e52 100644 --- a/backend/services/review.py +++ b/backend/services/review.py @@ -1,4 +1,4 @@ -"""Serviço da US-05 — revisão de conflitos e desempate.""" +"""Serviço da US-05 — revisão de conflitos e desempate por usuário.""" import json import logging @@ -30,19 +30,10 @@ def list_conflicts( page: int = 1, page_size: int = 20, ) -> dict: - # Query filtrada com JOINs no SQL (sem carregar tudo em Python) query = ( db.query(AnnotationConflict) - .join(Comment, AnnotationConflict.comment_id == Comment.id) - .join( - DatasetEntry, - DatasetEntry.author_channel_id == Comment.author_channel_id, - ) - .join( - Dataset, - (Dataset.id == DatasetEntry.dataset_id) - & (Dataset.collection_id == Comment.collection_id), - ) + .join(DatasetEntry, AnnotationConflict.dataset_entry_id == DatasetEntry.id) + .join(Dataset, Dataset.id == DatasetEntry.dataset_id) ) if conflict_status: @@ -66,39 +57,44 @@ def list_conflicts( if not conflicts: return _empty_page(page, page_size, total) - # Batch load apenas para a página + # Batch load ann_ids = set() - comment_ids = set() + entry_ids = set() for c in conflicts: ann_ids.update([c.annotation_a_id, c.annotation_b_id]) - comment_ids.add(c.comment_id) + entry_ids.add(c.dataset_entry_id) annotations = db.query(Annotation).filter(Annotation.id.in_(ann_ids)).all() ann_map = {a.id: a for a in annotations} - comments = db.query(Comment).filter(Comment.id.in_(comment_ids)).all() - comment_map = {c.id: c for c in comments} + entries = db.query(DatasetEntry).filter(DatasetEntry.id.in_(entry_ids)).all() + entry_map = {e.id: e for e in entries} user_ids = {a.annotator_id for a in annotations} users = db.query(User).filter(User.id.in_(user_ids)).all() user_map = {u.id: u for u in users} - author_ids = {c.author_channel_id for c in comments} - collection_ids = {c.collection_id for c in comments} - entries_with_ds = ( - db.query(DatasetEntry, Dataset) + ds_ids = {e.dataset_id for e in entries} + datasets = db.query(Dataset).filter(Dataset.id.in_(ds_ids)).all() + ds_map = {d.id: d for d in datasets} + + # Comment counts por entry + comment_counts = ( + db.query( + DatasetEntry.id, + func.count(Comment.id).label("cc"), + ) .join(Dataset, Dataset.id == DatasetEntry.dataset_id) - .filter( - DatasetEntry.author_channel_id.in_(author_ids), - Dataset.collection_id.in_(collection_ids), + .join( + Comment, + (Comment.collection_id == Dataset.collection_id) + & (Comment.author_channel_id == DatasetEntry.author_channel_id), ) + .filter(DatasetEntry.id.in_(entry_ids)) + .group_by(DatasetEntry.id) .all() ) - ds_map: dict[tuple, Dataset] = {} - for entry, ds in entries_with_ds: - key = (entry.author_channel_id, ds.collection_id) - if key not in ds_map: - ds_map[key] = ds + cc_map = {eid: cc for eid, cc in comment_counts} items = [] for c in conflicts: @@ -107,23 +103,23 @@ def list_conflicts( if not ann_a or not ann_b: continue - comment = comment_map.get(c.comment_id) - if not comment: + entry = entry_map.get(c.dataset_entry_id) + if not entry: continue - ds = ds_map.get((comment.author_channel_id, comment.collection_id)) - + ds = ds_map.get(entry.dataset_id) annotator_a = user_map.get(ann_a.annotator_id) annotator_b = user_map.get(ann_b.annotator_id) items.append( { "conflict_id": c.id, - "comment_id": c.comment_id, + "entry_id": c.dataset_entry_id, "dataset_id": ds.id if ds else None, "dataset_name": ds.name if ds else "", - "author_display_name": comment.author_display_name, - "text_original": comment.text_original, + "author_display_name": entry.author_display_name, + "author_channel_id": entry.author_channel_id, + "comment_count": cc_map.get(entry.id, 0), "label_a": ann_a.label, "annotator_a": annotator_a.name if annotator_a else "", "justificativa_a": ann_a.justificativa, @@ -156,22 +152,6 @@ def _empty_page(page: int, page_size: int, total: int = 0) -> dict: } -def _find_dataset_for_comment(db: Session, comment: Comment): - """Encontra o dataset que contém o autor deste comentário.""" - entry = ( - db.query(DatasetEntry) - .join(Dataset, Dataset.id == DatasetEntry.dataset_id) - .filter( - DatasetEntry.author_channel_id == comment.author_channel_id, - Dataset.collection_id == comment.collection_id, - ) - .first() - ) - if not entry: - return None - return db.query(Dataset).filter(Dataset.id == entry.dataset_id).first() - - # ─── Detalhe de um conflito ────────────────────────────────────────────────── @@ -192,35 +172,24 @@ def get_conflict_detail(db: Session, conflict_id: uuid.UUID) -> dict: ann_b = ( db.query(Annotation).filter(Annotation.id == conflict.annotation_b_id).first() ) - comment = db.query(Comment).filter(Comment.id == conflict.comment_id).first() + entry = ( + db.query(DatasetEntry) + .filter(DatasetEntry.id == conflict.dataset_entry_id) + .first() + ) + dataset = db.query(Dataset).filter(Dataset.id == entry.dataset_id).first() - # Find the author's other comments in the same collection + # Todos os comentários do autor como evidências all_comments = ( db.query(Comment) .filter( - Comment.collection_id == comment.collection_id, - Comment.author_channel_id == comment.author_channel_id, + Comment.collection_id == dataset.collection_id, + Comment.author_channel_id == entry.author_channel_id, ) .order_by(Comment.published_at.asc()) .all() ) - # Find dataset name - entry = ( - db.query(DatasetEntry) - .join(Dataset, Dataset.id == DatasetEntry.dataset_id) - .filter( - DatasetEntry.author_channel_id == comment.author_channel_id, - Dataset.collection_id == comment.collection_id, - ) - .first() - ) - dataset = ( - db.query(Dataset).filter(Dataset.id == entry.dataset_id).first() - if entry - else None - ) - annotator_a = db.query(User).filter(User.id == ann_a.annotator_id).first() annotator_b = db.query(User).filter(User.id == ann_b.annotator_id).first() @@ -233,8 +202,8 @@ def get_conflict_detail(db: Session, conflict_id: uuid.UUID) -> dict: "conflict_id": conflict.id, "status": conflict.status, "dataset_name": dataset.name if dataset else "", - "author_channel_id": comment.author_channel_id, - "author_display_name": comment.author_display_name, + "author_channel_id": entry.author_channel_id, + "author_display_name": entry.author_display_name, "comments": [ { "comment_db_id": c.id, @@ -290,13 +259,11 @@ def resolve_conflict( now = datetime.utcnow() - # Update conflict record conflict.status = "resolved" conflict.resolved_by = admin_id conflict.resolved_label = resolved_label conflict.resolved_at = now - # Insert immutable resolution record resolution = Resolution( conflict_id=conflict_id, resolved_label=resolved_label, @@ -324,7 +291,7 @@ def resolve_conflict( } -# ─── Listar bots ───────────────────────────────────────────────────────────── +# ─── Listar bots (por usuário) ────────────────────────────────────────────── def list_bots( @@ -335,27 +302,18 @@ def list_bots( page: int = 1, page_size: int = 20, ) -> dict: - """Lista comentários com pelo menos uma anotação 'bot'.""" - bot_comment_ids = ( - db.query(Annotation.comment_id) + """Lista entries (usuários do YouTube) com pelo menos uma anotação 'bot'.""" + bot_entry_ids = ( + db.query(Annotation.dataset_entry_id) .filter(Annotation.label == "bot") .distinct() .subquery() ) - # Query filtrada com JOINs no SQL query = ( - db.query(Comment) - .filter(Comment.id.in_(bot_comment_ids.select())) - .join( - DatasetEntry, - DatasetEntry.author_channel_id == Comment.author_channel_id, - ) - .join( - Dataset, - (Dataset.id == DatasetEntry.dataset_id) - & (Dataset.collection_id == Comment.collection_id), - ) + db.query(DatasetEntry) + .filter(DatasetEntry.id.in_(bot_entry_ids.select())) + .join(Dataset, Dataset.id == DatasetEntry.dataset_id) ) if dataset_id: @@ -367,42 +325,29 @@ def list_bots( total = query.count() offset = (page - 1) * page_size - comments = ( - query.order_by(Comment.published_at.desc()) + entries = ( + query.order_by(DatasetEntry.author_display_name) .offset(offset) .limit(page_size) .all() ) - if not comments: + if not entries: return _empty_page(page, page_size, total) - # Batch load apenas para a página - comment_ids = [c.id for c in comments] - author_ids = {c.author_channel_id for c in comments} - collection_ids = {c.collection_id for c in comments} + # Batch load + entry_ids = [e.id for e in entries] + ds_ids = {e.dataset_id for e in entries} - entries_with_ds = ( - db.query(DatasetEntry, Dataset) - .join(Dataset, Dataset.id == DatasetEntry.dataset_id) - .filter( - DatasetEntry.author_channel_id.in_(author_ids), - Dataset.collection_id.in_(collection_ids), - ) - .all() - ) - ds_map: dict[tuple, Dataset] = {} - for entry, ds in entries_with_ds: - key = (entry.author_channel_id, ds.collection_id) - if key not in ds_map: - ds_map[key] = ds + datasets = db.query(Dataset).filter(Dataset.id.in_(ds_ids)).all() + ds_map = {d.id: d for d in datasets} all_annotations = ( - db.query(Annotation).filter(Annotation.comment_id.in_(comment_ids)).all() + db.query(Annotation).filter(Annotation.dataset_entry_id.in_(entry_ids)).all() ) - ann_by_comment: dict[uuid.UUID, list[Annotation]] = {} + ann_by_entry: dict[uuid.UUID, list[Annotation]] = {} for a in all_annotations: - ann_by_comment.setdefault(a.comment_id, []).append(a) + ann_by_entry.setdefault(a.dataset_entry_id, []).append(a) user_ids = {a.annotator_id for a in all_annotations} users = db.query(User).filter(User.id.in_(user_ids)).all() @@ -410,35 +355,52 @@ def list_bots( all_conflicts = ( db.query(AnnotationConflict) - .filter(AnnotationConflict.comment_id.in_(comment_ids)) + .filter(AnnotationConflict.dataset_entry_id.in_(entry_ids)) .all() ) - conflict_map = {c.comment_id: c for c in all_conflicts} + conflict_map = {c.dataset_entry_id: c for c in all_conflicts} - items = [] - for comment in comments: - ds = ds_map.get((comment.author_channel_id, comment.collection_id)) + # Comment counts + comment_counts = ( + db.query( + DatasetEntry.id, + func.count(Comment.id).label("cc"), + ) + .join(Dataset, Dataset.id == DatasetEntry.dataset_id) + .join( + Comment, + (Comment.collection_id == Dataset.collection_id) + & (Comment.author_channel_id == DatasetEntry.author_channel_id), + ) + .filter(DatasetEntry.id.in_(entry_ids)) + .group_by(DatasetEntry.id) + .all() + ) + cc_map = {eid: cc for eid, cc in comment_counts} - annotations = ann_by_comment.get(comment.id, []) - ann_list = [] - for a in annotations: - annotator = user_map.get(a.annotator_id) - ann_list.append( - { - "annotator_name": annotator.name if annotator else "", - "label": a.label, - "justificativa": a.justificativa, - } - ) + items = [] + for entry in entries: + ds = ds_map.get(entry.dataset_id) + annotations = ann_by_entry.get(entry.id, []) + ann_list = [ + { + "annotator_name": user_map.get(a.annotator_id, User()).name + if user_map.get(a.annotator_id) + else "", + "label": a.label, + "justificativa": a.justificativa, + } + for a in annotations + ] - conflict = conflict_map.get(comment.id) + conflict = conflict_map.get(entry.id) items.append( { - "comment_db_id": comment.id, - "text_original": comment.text_original, - "author_display_name": comment.author_display_name, - "author_channel_id": comment.author_channel_id, + "entry_id": entry.id, + "author_display_name": entry.author_display_name, + "author_channel_id": entry.author_channel_id, + "comment_count": cc_map.get(entry.id, 0), "dataset_id": ds.id if ds else None, "dataset_name": ds.name if ds else "", "annotations": ann_list, @@ -473,10 +435,9 @@ def get_stats(db: Session) -> dict: .scalar() ) - # Total users flagged as bot by at least one annotator + # Total de usuários flagados como bot por pelo menos um anotador bots_flagged = ( - db.query(func.count(func.distinct(Comment.author_channel_id))) - .join(Annotation, Annotation.comment_id == Comment.id) + db.query(func.count(func.distinct(Annotation.dataset_entry_id))) .filter(Annotation.label == "bot") .scalar() ) @@ -497,10 +458,7 @@ def export_review_json( dataset_id: uuid.UUID, ): """Gerador de JSON streaming com dataset final (anotado + desempatado).""" - ds_query = db.query(Dataset) - if dataset_id: - ds_query = ds_query.filter(Dataset.id == dataset_id) - dataset = ds_query.first() + dataset = db.query(Dataset).filter(Dataset.id == dataset_id).first() if not dataset: yield '{"error": "Dataset não encontrado."}\n' @@ -520,82 +478,67 @@ def export_review_json( yield f' "dataset_name": {json.dumps(meta["dataset_name"])},\n' yield f' "video_id": {json.dumps(meta["video_id"])},\n' yield f' "exported_at": {json.dumps(meta["exported_at"])},\n' - yield ' "comments": [\n' + yield ' "users": [\n' entries = db.query(DatasetEntry).filter(DatasetEntry.dataset_id == dataset.id).all() - first_comment = True + first_entry = True for entry in entries: - comments = ( - db.query(Comment) - .filter( - Comment.collection_id == dataset.collection_id, - Comment.author_channel_id == entry.author_channel_id, - ) - .yield_per(500) + # Anotações para este entry (usuário) + annotations = ( + db.query(Annotation).filter(Annotation.dataset_entry_id == entry.id).all() ) + if not annotations: + continue - for comment in comments: - # Get all annotations for this comment - annotations = ( - db.query(Annotation).filter(Annotation.comment_id == comment.id).all() - ) - if not annotations: - continue - - # Check for conflict/resolution - conflict = ( - db.query(AnnotationConflict) - .filter(AnnotationConflict.comment_id == comment.id) - .first() - ) + # Conflito/resolução + conflict = ( + db.query(AnnotationConflict) + .filter(AnnotationConflict.dataset_entry_id == entry.id) + .first() + ) - # Determine final_label - resolution_data = None - if conflict and conflict.status == "resolved": - resolver = ( - db.query(User).filter(User.id == conflict.resolved_by).first() - ) - resolution_data = { - "resolved_by": resolver.name if resolver else "", - "resolved_label": conflict.resolved_label, - "resolved_at": conflict.resolved_at.isoformat() + "Z" - if conflict.resolved_at - else None, - } - final_label = conflict.resolved_label - else: - # No conflict or pending — use consensus label - labels = {a.label for a in annotations} - if len(labels) == 1: - final_label = labels.pop() - else: - final_label = "pending" - - ann_list = [] - for a in annotations: - annotator = db.query(User).filter(User.id == a.annotator_id).first() - ann_list.append( - { - "annotator": annotator.name if annotator else "", - "label": a.label, - "justificativa": a.justificativa, - } - ) - - item = { - "comment_db_id": str(comment.id), - "author_channel_id": comment.author_channel_id, - "author_display_name": comment.author_display_name, - "text_original": comment.text_original, - "final_label": final_label, - "annotations": ann_list, - "resolution": resolution_data, + resolution_data = None + if conflict and conflict.status == "resolved": + resolver = db.query(User).filter(User.id == conflict.resolved_by).first() + resolution_data = { + "resolved_by": resolver.name if resolver else "", + "resolved_label": conflict.resolved_label, + "resolved_at": conflict.resolved_at.isoformat() + "Z" + if conflict.resolved_at + else None, } + final_label = conflict.resolved_label + else: + labels = {a.label for a in annotations} + if len(labels) == 1: + final_label = labels.pop() + else: + final_label = "pending" - prefix = " " if first_comment else ",\n " - first_comment = False - yield prefix + json.dumps(item, ensure_ascii=False) + ann_list = [] + for a in annotations: + annotator = db.query(User).filter(User.id == a.annotator_id).first() + ann_list.append( + { + "annotator": annotator.name if annotator else "", + "label": a.label, + "justificativa": a.justificativa, + } + ) + + item = { + "entry_id": str(entry.id), + "author_channel_id": entry.author_channel_id, + "author_display_name": entry.author_display_name, + "final_label": final_label, + "annotations": ann_list, + "resolution": resolution_data, + } + + prefix = " " if first_entry else ",\n " + first_entry = False + yield prefix + json.dumps(item, ensure_ascii=False) yield "\n ]\n}\n" @@ -611,82 +554,66 @@ def export_review_csv( import csv import io - ds_query = db.query(Dataset) - if dataset_id: - ds_query = ds_query.filter(Dataset.id == dataset_id) - dataset = ds_query.first() + dataset = db.query(Dataset).filter(Dataset.id == dataset_id).first() if not dataset: yield "error\nDataset não encontrado.\n" return - header = "comment_db_id,author_channel_id,author_display_name," - header += "text_original,final_label\n" - yield header + yield "entry_id,author_channel_id,author_display_name,final_label\n" entries = db.query(DatasetEntry).filter(DatasetEntry.dataset_id == dataset.id).all() for entry in entries: - comments = ( - db.query(Comment) - .filter( - Comment.collection_id == dataset.collection_id, - Comment.author_channel_id == entry.author_channel_id, - ) - .yield_per(500) + annotations = ( + db.query(Annotation).filter(Annotation.dataset_entry_id == entry.id).all() ) + if not annotations: + continue - for comment in comments: - annotations = ( - db.query(Annotation).filter(Annotation.comment_id == comment.id).all() - ) - if not annotations: - continue - - conflict = ( - db.query(AnnotationConflict) - .filter(AnnotationConflict.comment_id == comment.id) - .first() - ) + conflict = ( + db.query(AnnotationConflict) + .filter(AnnotationConflict.dataset_entry_id == entry.id) + .first() + ) - if conflict and conflict.status == "resolved": - final_label = conflict.resolved_label - else: - labels = {a.label for a in annotations} - final_label = labels.pop() if len(labels) == 1 else "pending" - - buf = io.StringIO() - writer = csv.writer(buf) - writer.writerow( - [ - str(comment.id), - comment.author_channel_id, - comment.author_display_name, - comment.text_original, - final_label, - ] - ) - yield buf.getvalue() + if conflict and conflict.status == "resolved": + final_label = conflict.resolved_label + else: + labels = {a.label for a in annotations} + final_label = labels.pop() if len(labels) == 1 else "pending" + + buf = io.StringIO() + writer = csv.writer(buf) + writer.writerow( + [ + str(entry.id), + entry.author_channel_id, + entry.author_display_name, + final_label, + ] + ) + yield buf.getvalue() # ─── Import (lógica interna) ───────────────────────────────────────────────── -def _resolve_comments( +def _resolve_users( db: Session, admin_id: uuid.UUID, - comments: list, + users: list, ) -> dict: - """Resolve conflitos a partir de uma lista de comentários importados.""" + """Resolve conflitos a partir de uma lista de usuários importados.""" imported = 0 skipped = 0 errors = [] - for item in comments: - comment = db.query(Comment).filter(Comment.id == item.comment_db_id).first() - if not comment: + for item in users: + entry = db.query(DatasetEntry).filter(DatasetEntry.id == item.entry_id).first() + if not entry: skipped += 1 - errors.append(f"Comentário {item.comment_db_id} não encontrado.") + errors.append(f"Entrada {item.entry_id} não encontrada.") continue if not item.resolution: @@ -695,14 +622,12 @@ def _resolve_comments( conflict = ( db.query(AnnotationConflict) - .filter(AnnotationConflict.comment_id == comment.id) + .filter(AnnotationConflict.dataset_entry_id == entry.id) .first() ) if not conflict: skipped += 1 - errors.append( - f"Comentário {item.comment_db_id} não possui conflito registrado." - ) + errors.append(f"Entrada {item.entry_id} não possui conflito registrado.") continue if conflict.status == "resolved": @@ -713,7 +638,7 @@ def _resolve_comments( if resolved_label not in ("bot", "humano"): skipped += 1 errors.append( - f"Comentário {item.comment_db_id}: label '{resolved_label}' inválido." + f"Entrada {item.entry_id}: label '{resolved_label}' inválido." ) continue @@ -754,7 +679,7 @@ def import_review( db: Session, admin_id: uuid.UUID, video_id: str, - comments: list, + users: list, ) -> dict: """Importa dataset revisado (formato simétrico ao export).""" collection = db.query(Collection).filter(Collection.video_id == video_id).first() @@ -764,19 +689,19 @@ def import_review( detail=f"Coleta com video_id '{video_id}' não encontrada.", ) - return _resolve_comments(db, admin_id, comments) + return _resolve_users(db, admin_id, users) def import_review_chunk( db: Session, admin_id: uuid.UUID, - comments: list, + users: list, done: bool, ) -> dict: - """Batch adicional de comentários revisados para import paginado.""" - result = _resolve_comments(db, admin_id, comments) + """Batch adicional de usuários revisados para import paginado.""" + result = _resolve_users(db, admin_id, users) return { "total_imported": result["imported"], - "chunk_received": len(comments), + "chunk_received": len(users), "done": done, } diff --git a/backend/services/seed.py b/backend/services/seed.py index 6794263..34de050 100644 --- a/backend/services/seed.py +++ b/backend/services/seed.py @@ -146,16 +146,18 @@ def delete_seed(db: Session) -> dict: detail="Nenhum dado mockado encontrado.", ) - # Deletar anotações e conflitos dos comentários desta coleta - comment_ids = ( - db.query(Comment.id).filter(Comment.collection_id == collection.id).subquery() + # Deletar anotações e conflitos dos entries desta coleta + datasets = db.query(Dataset).filter(Dataset.collection_id == collection.id).all() + ds_ids = [ds.id for ds in datasets] + entry_ids = ( + db.query(DatasetEntry.id).filter(DatasetEntry.dataset_id.in_(ds_ids)).subquery() ) db.query(AnnotationConflict).filter( - AnnotationConflict.comment_id.in_(comment_ids) + AnnotationConflict.dataset_entry_id.in_(entry_ids) ).delete(synchronize_session=False) - db.query(Annotation).filter(Annotation.comment_id.in_(comment_ids)).delete( + db.query(Annotation).filter(Annotation.dataset_entry_id.in_(entry_ids)).delete( synchronize_session=False ) @@ -360,38 +362,37 @@ def run_seed(db: Session) -> dict: db.flush() # ─── Anotações pré-existentes ──────────────────────────────────────── - # Pegar todos os comentários dos bots no dataset + # Anotar por entry (usuário), não por comentário bot_channel_ids = [cid for cid, _, _ in BOTS] - all_bot_comments = ( - db.query(Comment) + bot_entries = ( + db.query(DatasetEntry) .filter( - Comment.collection_id == col.id, - Comment.author_channel_id.in_(bot_channel_ids), + DatasetEntry.dataset_id == ds.id, + DatasetEntry.author_channel_id.in_(bot_channel_ids), ) - .order_by(Comment.published_at) + .order_by(DatasetEntry.author_display_name) .all() ) annotations_created = 0 conflicts_created = 0 - for idx, comment in enumerate(all_bot_comments): + for idx, entry in enumerate(bot_entries): # Primeiros 40%: ambos concordam → bot (sem conflito) # Próximos 20%: ambos concordam → humano (sem conflito) # Próximos 20%: divergem → conflito (A=bot, B=humano) # Últimos 20%: sem anotação (pendentes) - ratio = idx / len(all_bot_comments) + ratio = idx / len(bot_entries) if ratio < 0.4: - # Concordância: ambos dizem bot ann_a = Annotation( - comment_id=comment.id, + dataset_entry_id=entry.id, annotator_id=user_a.id, label="bot", justificativa="Texto de spam/promoção.", ) ann_b = Annotation( - comment_id=comment.id, + dataset_entry_id=entry.id, annotator_id=user_b.id, label="bot", justificativa="Comentário promocional repetido.", @@ -401,14 +402,13 @@ def run_seed(db: Session) -> dict: annotations_created += 2 elif ratio < 0.6: - # Concordância: ambos dizem humano ann_a = Annotation( - comment_id=comment.id, + dataset_entry_id=entry.id, annotator_id=user_a.id, label="humano", ) ann_b = Annotation( - comment_id=comment.id, + dataset_entry_id=entry.id, annotator_id=user_b.id, label="humano", ) @@ -417,9 +417,8 @@ def run_seed(db: Session) -> dict: annotations_created += 2 elif ratio < 0.8: - # Divergência → conflito ann_a = Annotation( - comment_id=comment.id, + dataset_entry_id=entry.id, annotator_id=user_a.id, label="bot", justificativa="Padrão de engajamento falso.", @@ -428,7 +427,7 @@ def run_seed(db: Session) -> dict: db.flush() ann_b = Annotation( - comment_id=comment.id, + dataset_entry_id=entry.id, annotator_id=user_b.id, label="humano", ) @@ -436,7 +435,7 @@ def run_seed(db: Session) -> dict: db.flush() conflict = AnnotationConflict( - comment_id=comment.id, + dataset_entry_id=entry.id, annotation_a_id=ann_a.id, annotation_b_id=ann_b.id, status="pending", diff --git a/backend/tests/test_annotate.py b/backend/tests/test_annotate.py index 270cd81..dd2f771 100644 --- a/backend/tests/test_annotate.py +++ b/backend/tests/test_annotate.py @@ -60,6 +60,7 @@ def _make_dataset(db, collection_id, user_id, author_channel_ids): db.add(ds) db.flush() + entries = [] for channel_id in author_channel_ids: entry = DatasetEntry( dataset_id=ds.id, @@ -69,8 +70,9 @@ def _make_dataset(db, collection_id, user_id, author_channel_ids): matched_criteria=["percentil"], ) db.add(entry) + entries.append(entry) db.flush() - return ds + return ds, entries @pytest.fixture @@ -89,15 +91,16 @@ def second_user(db): @pytest.fixture def setup_data(db, regular_user): - """Cria coleta, comentários e dataset para testes de anotação.""" + """Cria coleta, comentários, dataset e entries para testes de anotação.""" col = _make_collection(db, regular_user.id) comments = _make_comments(db, col.id, "UC_author1", count=3) - ds = _make_dataset(db, col.id, regular_user.id, ["UC_author1"]) + ds, entries = _make_dataset(db, col.id, regular_user.id, ["UC_author1"]) db.commit() return { "collection": col, "comments": comments, "dataset": ds, + "entry": entries[0], } @@ -108,11 +111,11 @@ def setup_data(db, regular_user): class TestAnnotationValidation: def test_bot_sem_justificativa_retorna_422(self, client, auth_as_user, setup_data): - comment = setup_data["comments"][0] + entry = setup_data["entry"] resp = client.post( "/annotate", json={ - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "label": "bot", "justificativa": "", }, @@ -120,11 +123,11 @@ def test_bot_sem_justificativa_retorna_422(self, client, auth_as_user, setup_dat assert resp.status_code == 422 def test_bot_com_justificativa_aceita(self, client, auth_as_user, setup_data): - comment = setup_data["comments"][0] + entry = setup_data["entry"] resp = client.post( "/annotate", json={ - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "label": "bot", "justificativa": "Texto repetido.", }, @@ -133,11 +136,11 @@ def test_bot_com_justificativa_aceita(self, client, auth_as_user, setup_data): assert resp.json()["label"] == "bot" def test_humano_sem_justificativa_aceita(self, client, auth_as_user, setup_data): - comment = setup_data["comments"][0] + entry = setup_data["entry"] resp = client.post( "/annotate", json={ - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "label": "humano", }, ) @@ -145,11 +148,11 @@ def test_humano_sem_justificativa_aceita(self, client, auth_as_user, setup_data) assert resp.json()["label"] == "humano" def test_label_invalido_retorna_422(self, client, auth_as_user, setup_data): - comment = setup_data["comments"][0] + entry = setup_data["entry"] resp = client.post( "/annotate", json={ - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "label": "incerto", }, ) @@ -163,13 +166,12 @@ def test_label_invalido_retorna_422(self, client, auth_as_user, setup_data): class TestAnnotationAuth: def test_sem_token_retorna_401(self, client, setup_data): - comment = setup_data["comments"][0] - # Limpar override de auth + entry = setup_data["entry"] app.dependency_overrides.pop(get_current_user, None) resp = client.post( "/annotate", json={ - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "label": "humano", }, ) @@ -189,12 +191,12 @@ def test_list_users_sem_token_retorna_401(self, client, setup_data): class TestUpsertAnnotation: def test_reannotation_altera_label(self, client, auth_as_user, setup_data): - comment = setup_data["comments"][0] + entry = setup_data["entry"] # Primeira anotação: humano resp1 = client.post( "/annotate", - json={"comment_db_id": str(comment.id), "label": "humano"}, + json={"entry_id": str(entry.id), "label": "humano"}, ) assert resp1.status_code == 200 ann_id = resp1.json()["annotation_id"] @@ -203,21 +205,20 @@ def test_reannotation_altera_label(self, client, auth_as_user, setup_data): resp2 = client.post( "/annotate", json={ - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "label": "bot", "justificativa": "Mudei de ideia.", }, ) assert resp2.status_code == 200 - # Mesmo annotation_id (upsert, não duplicata) assert resp2.json()["annotation_id"] == ann_id assert resp2.json()["label"] == "bot" - def test_comentario_inexistente_retorna_404(self, client, auth_as_user): + def test_entry_inexistente_retorna_404(self, client, auth_as_user): resp = client.post( "/annotate", json={ - "comment_db_id": str(uuid.uuid4()), + "entry_id": str(uuid.uuid4()), "label": "humano", }, ) @@ -233,44 +234,40 @@ class TestConflictDetection: def test_labels_iguais_sem_conflito( self, client, db, auth_as_user, setup_data, second_user ): - comment = setup_data["comments"][0] + entry = setup_data["entry"] - # User A anota humano client.post( "/annotate", - json={"comment_db_id": str(comment.id), "label": "humano"}, + json={"entry_id": str(entry.id), "label": "humano"}, ) - # User B anota humano app.dependency_overrides[get_current_user] = lambda: second_user client.post( "/annotate", - json={"comment_db_id": str(comment.id), "label": "humano"}, + json={"entry_id": str(entry.id), "label": "humano"}, ) conflicts = ( - db.query(AnnotationConflict).filter_by(comment_id=comment.id).count() + db.query(AnnotationConflict).filter_by(dataset_entry_id=entry.id).count() ) assert conflicts == 0 def test_labels_diferentes_cria_conflito( self, client, db, auth_as_user, setup_data, second_user ): - comment = setup_data["comments"][0] + entry = setup_data["entry"] - # User A anota humano resp1 = client.post( "/annotate", - json={"comment_db_id": str(comment.id), "label": "humano"}, + json={"entry_id": str(entry.id), "label": "humano"}, ) assert resp1.json()["conflict_created"] is False - # User B anota bot app.dependency_overrides[get_current_user] = lambda: second_user resp2 = client.post( "/annotate", json={ - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "label": "bot", "justificativa": "Spam.", }, @@ -278,77 +275,71 @@ def test_labels_diferentes_cria_conflito( assert resp2.json()["conflict_created"] is True conflicts = ( - db.query(AnnotationConflict).filter_by(comment_id=comment.id).count() + db.query(AnnotationConflict).filter_by(dataset_entry_id=entry.id).count() ) assert conflicts == 1 - def test_segundo_conflito_mesmo_comentario_nao_duplica( + def test_segundo_conflito_mesmo_entry_nao_duplica( self, client, db, auth_as_user, setup_data, second_user ): - comment = setup_data["comments"][0] + entry = setup_data["entry"] - # User A anota humano client.post( "/annotate", - json={"comment_db_id": str(comment.id), "label": "humano"}, + json={"entry_id": str(entry.id), "label": "humano"}, ) - # User B anota bot → conflito app.dependency_overrides[get_current_user] = lambda: second_user client.post( "/annotate", json={ - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "label": "bot", "justificativa": "Spam.", }, ) - # User B reanota bot (mantém divergência) → NÃO duplica conflito client.post( "/annotate", json={ - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "label": "bot", "justificativa": "Spam confirmado.", }, ) conflicts = ( - db.query(AnnotationConflict).filter_by(comment_id=comment.id).count() + db.query(AnnotationConflict).filter_by(dataset_entry_id=entry.id).count() ) assert conflicts == 1 def test_concordancia_apos_conflito_remove_conflito( self, client, db, auth_as_user, setup_data, second_user ): - comment = setup_data["comments"][0] + entry = setup_data["entry"] - # User A anota humano client.post( "/annotate", - json={"comment_db_id": str(comment.id), "label": "humano"}, + json={"entry_id": str(entry.id), "label": "humano"}, ) - # User B anota bot → conflito app.dependency_overrides[get_current_user] = lambda: second_user client.post( "/annotate", json={ - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "label": "bot", "justificativa": "Spam.", }, ) - # User B muda para humano → concordam → conflito removido client.post( "/annotate", - json={"comment_db_id": str(comment.id), "label": "humano"}, + json={"entry_id": str(entry.id), "label": "humano"}, ) conflicts = ( - db.query(AnnotationConflict).filter_by(comment_id=comment.id).count() + db.query(AnnotationConflict).filter_by(dataset_entry_id=entry.id).count() ) assert conflicts == 0 @@ -361,12 +352,12 @@ def test_concordancia_apos_conflito_remove_conflito( class TestListDatasetUsers: def test_lista_usuarios_com_progresso(self, client, auth_as_user, setup_data): ds = setup_data["dataset"] - comment = setup_data["comments"][0] + entry = setup_data["entry"] - # Anotar um comentário + # Anotar o entry client.post( "/annotate", - json={"comment_db_id": str(comment.id), "label": "humano"}, + json={"entry_id": str(entry.id), "label": "humano"}, ) resp = client.get(f"/annotate/users?dataset_id={ds.id}") @@ -375,11 +366,11 @@ def test_lista_usuarios_com_progresso(self, client, auth_as_user, setup_data): assert data["dataset_id"] == str(ds.id) assert data["total_users"] == 1 - assert data["annotated_comments_by_me"] == 1 + assert data["annotated_users_by_me"] == 1 item = data["items"][0] - assert item["my_annotated_count"] == 1 - assert item["my_pending_count"] == 2 # 3 comments - 1 annotated + assert item["is_annotated_by_me"] is True + assert item["my_label"] == "humano" def test_dataset_inexistente_retorna_404(self, client, auth_as_user): resp = client.get(f"/annotate/users?dataset_id={uuid.uuid4()}") @@ -395,27 +386,23 @@ class TestGetEntryComments: def test_retorna_comentarios_com_anotacao( self, client, db, auth_as_user, setup_data ): - ds = setup_data["dataset"] - comment = setup_data["comments"][0] + entry = setup_data["entry"] - # Anotar + # Anotar o entry client.post( "/annotate", - json={"comment_db_id": str(comment.id), "label": "humano"}, + json={"entry_id": str(entry.id), "label": "humano"}, ) - entry = db.query(DatasetEntry).filter_by(dataset_id=ds.id).first() resp = client.get(f"/annotate/comments/{entry.id}") assert resp.status_code == 200 data = resp.json() assert data["author_display_name"] == "User UC_author1" assert len(data["comments"]) == 3 - - # Primeiro comentário deve ter anotação - annotated = [c for c in data["comments"] if c["my_annotation"] is not None] - assert len(annotated) == 1 - assert annotated[0]["my_annotation"]["label"] == "humano" + # Anotação está no nível do entry, não do comment + assert data["my_annotation"] is not None + assert data["my_annotation"]["label"] == "humano" def test_entry_inexistente_retorna_404(self, client, auth_as_user): resp = client.get(f"/annotate/comments/{uuid.uuid4()}") @@ -432,17 +419,15 @@ def test_progresso_vazio_sem_anotacoes(self, client, auth_as_user, setup_data): resp = client.get("/annotate/my-progress") assert resp.status_code == 200 data = resp.json() - # Dataset existe mas pode ter 0 anotações if len(data) > 0: assert data[0]["annotated"] == 0 def test_progresso_atualiza_apos_anotacao(self, client, auth_as_user, setup_data): - comment = setup_data["comments"][0] + entry = setup_data["entry"] - # Anotar client.post( "/annotate", - json={"comment_db_id": str(comment.id), "label": "humano"}, + json={"entry_id": str(entry.id), "label": "humano"}, ) resp = client.get("/annotate/my-progress") @@ -463,18 +448,14 @@ def test_progresso_atualiza_apos_anotacao(self, client, auth_as_user, setup_data class TestImport: def test_import_cria_anotacoes(self, client, auth_as_user, setup_data): - comments = setup_data["comments"] + entry = setup_data["entry"] resp = client.post( "/annotate/import", json={ "annotations": [ { - "comment_db_id": str(comments[0].id), - "label": "humano", - }, - { - "comment_db_id": str(comments[1].id), + "entry_id": str(entry.id), "label": "bot", "justificativa": "Spam.", }, @@ -483,30 +464,28 @@ def test_import_cria_anotacoes(self, client, auth_as_user, setup_data): ) assert resp.status_code == 200 data = resp.json() - assert data["imported"] == 2 + assert data["imported"] == 1 assert data["updated"] == 0 assert data["skipped"] == 0 def test_import_upsert_nao_duplica(self, client, auth_as_user, setup_data): - comment = setup_data["comments"][0] + entry = setup_data["entry"] - # Primeira vez client.post( "/annotate/import", json={ "annotations": [ - {"comment_db_id": str(comment.id), "label": "humano"}, + {"entry_id": str(entry.id), "label": "humano"}, ], }, ) - # Segunda vez — upsert resp = client.post( "/annotate/import", json={ "annotations": [ { - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "label": "bot", "justificativa": "Mudei de ideia.", }, @@ -517,13 +496,13 @@ def test_import_upsert_nao_duplica(self, client, auth_as_user, setup_data): assert data["imported"] == 0 assert data["updated"] == 1 - def test_import_comentario_inexistente_skip(self, client, auth_as_user): + def test_import_entry_inexistente_skip(self, client, auth_as_user): resp = client.post( "/annotate/import", json={ "annotations": [ { - "comment_db_id": str(uuid.uuid4()), + "entry_id": str(uuid.uuid4()), "label": "humano", }, ], @@ -534,13 +513,13 @@ def test_import_comentario_inexistente_skip(self, client, auth_as_user): assert len(data["errors"]) == 1 def test_import_bot_sem_justificativa_skip(self, client, auth_as_user, setup_data): - comment = setup_data["comments"][0] + entry = setup_data["entry"] resp = client.post( "/annotate/import", json={ "annotations": [ { - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "label": "bot", "justificativa": "", }, @@ -560,59 +539,47 @@ class TestExport: def test_export_json_retorna_apenas_minhas_anotacoes( self, client, db, auth_as_user, setup_data, second_user ): - comments = setup_data["comments"] - - # User A anota comment 0 - client.post( - "/annotate", - json={"comment_db_id": str(comments[0].id), "label": "humano"}, - ) + entry = setup_data["entry"] - # User B anota comment 1 - app.dependency_overrides[get_current_user] = lambda: second_user + # User A anota client.post( "/annotate", - json={"comment_db_id": str(comments[1].id), "label": "humano"}, + json={"entry_id": str(entry.id), "label": "humano"}, ) - # User A exporta - app.dependency_overrides[get_current_user] = lambda: auth_as_user resp = client.get("/annotate/export?format=json") assert resp.status_code == 200 - - # Deve vir ao menos o comentário de User A data = resp.json() assert "annotations" in data def test_export_csv(self, client, auth_as_user, setup_data): - comment = setup_data["comments"][0] + entry = setup_data["entry"] client.post( "/annotate", - json={"comment_db_id": str(comment.id), "label": "humano"}, + json={"entry_id": str(entry.id), "label": "humano"}, ) resp = client.get("/annotate/export?format=csv") assert resp.status_code == 200 assert "text/csv" in resp.headers["content-type"] lines = resp.text.strip().split("\n") - assert lines[0] == "comment_db_id,label,justificativa" + assert lines[0] == "entry_id,author_channel_id,label,justificativa" assert len(lines) >= 2 # --------------------------------------------------------------------------- -# Testes adicionais — cobertura 100% +# Testes adicionais — cobertura # --------------------------------------------------------------------------- class TestAdminCannotAnnotate: def test_admin_post_annotate_retorna_403(self, client, auth_as_admin, setup_data): - """Admin nao pode anotar — apenas revisar conflitos.""" - comment = setup_data["comments"][0] + entry = setup_data["entry"] resp = client.post( "/annotate", json={ - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "label": "humano", }, ) @@ -630,13 +597,11 @@ def test_admin_ve_todas_anotacoes( regular_user, setup_data, ): - """Admin ve contagem de anotacoes de todos.""" from models.annotation import Annotation - comment = setup_data["comments"][0] - # Criar anotacao como regular_user diretamente no DB + entry = setup_data["entry"] ann = Annotation( - comment_id=comment.id, + dataset_entry_id=entry.id, annotator_id=regular_user.id, label="humano", ) @@ -647,36 +612,29 @@ def test_admin_ve_todas_anotacoes( resp = client.get(f"/annotate/users?dataset_id={ds.id}") assert resp.status_code == 200 data = resp.json() - # Admin ve total_annotated global (1 anotacao) - assert data["annotated_comments_by_me"] == 1 + assert data["annotated_users_by_me"] == 1 class TestListDatasetUsersFilters: def test_only_pending_filter(self, client, auth_as_user, setup_data): - """Filtro only_pending retorna apenas usuarios pendentes.""" ds = setup_data["dataset"] - comment = setup_data["comments"][0] + entry = setup_data["entry"] - # Anotar 1 de 3 comentarios + # Anotar o entry client.post( "/annotate", - json={ - "comment_db_id": str(comment.id), - "label": "humano", - }, + json={"entry_id": str(entry.id), "label": "humano"}, ) - resp = client.get(f"/annotate/users?dataset_id={ds.id}" "&only_pending=true") + resp = client.get(f"/annotate/users?dataset_id={ds.id}&only_pending=true") assert resp.status_code == 200 data = resp.json() - # Ainda ha pendencias - for item in data["items"]: - assert item["my_pending_count"] > 0 + # O único entry foi anotado, então sem pendentes + assert data["total_users"] == 0 def test_pending_first_ordering(self, client, auth_as_user, setup_data): - """Filtro pending_first ordena por pendencias desc.""" ds = setup_data["dataset"] - resp = client.get(f"/annotate/users?dataset_id={ds.id}" "&pending_first=true") + resp = client.get(f"/annotate/users?dataset_id={ds.id}&pending_first=true") assert resp.status_code == 200 assert len(resp.json()["items"]) >= 1 @@ -691,69 +649,58 @@ def test_admin_ve_all_annotations( regular_user, setup_data, ): - """Admin ve all_annotations com nomes dos anotadores.""" from models.annotation import Annotation - comment = setup_data["comments"][0] + entry = setup_data["entry"] ann = Annotation( - comment_id=comment.id, + dataset_entry_id=entry.id, annotator_id=regular_user.id, label="humano", ) db.add(ann) db.commit() - ds = setup_data["dataset"] - entry = db.query(DatasetEntry).filter_by(dataset_id=ds.id).first() resp = client.get(f"/annotate/comments/{entry.id}") assert resp.status_code == 200 data = resp.json() - annotated = [c for c in data["comments"] if c["all_annotations"] is not None] - assert len(annotated) >= 1 - first_ann = annotated[0]["all_annotations"][0] - assert "annotator_name" in first_ann + assert data["all_annotations"] is not None + assert len(data["all_annotations"]) >= 1 + assert "annotator_name" in data["all_annotations"][0] class TestConflictReopening: def test_reannotation_reabre_conflito_resolvido( self, client, db, auth_as_user, setup_data, second_user ): - """Re-anotacao apos resolucao reabre o conflito.""" - comment = setup_data["comments"][0] + entry = setup_data["entry"] - # User A anota humano client.post( "/annotate", - json={ - "comment_db_id": str(comment.id), - "label": "humano", - }, + json={"entry_id": str(entry.id), "label": "humano"}, ) - # User B anota bot -> conflito criado app.dependency_overrides[get_current_user] = lambda: second_user client.post( "/annotate", json={ - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "label": "bot", "justificativa": "Spam.", }, ) - # Resolver conflito manualmente no banco - conflict = db.query(AnnotationConflict).filter_by(comment_id=comment.id).first() + conflict = ( + db.query(AnnotationConflict).filter_by(dataset_entry_id=entry.id).first() + ) conflict.status = "resolved" conflict.resolved_label = "bot" db.commit() - # User B re-anota bot (mesma label, mas A=humano) - # Labels divergem: A=humano vs B=bot -> reabre resp = client.post( "/annotate", json={ - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "label": "bot", "justificativa": "Confirmo spam.", }, @@ -776,12 +723,11 @@ def test_all_progress_retorna_dados_por_anotador( regular_user, setup_data, ): - """Admin ve progresso de todos os anotadores.""" from models.annotation import Annotation - comment = setup_data["comments"][0] + entry = setup_data["entry"] ann = Annotation( - comment_id=comment.id, + dataset_entry_id=entry.id, annotator_id=regular_user.id, label="humano", ) @@ -791,35 +737,31 @@ def test_all_progress_retorna_dados_por_anotador( resp = client.get("/annotate/all-progress") assert resp.status_code == 200 data = resp.json() - # Deve conter pelo menos o regular_user assert any(p["annotator_name"] == "Usuário Teste" for p in data) def test_all_progress_exclui_admin( self, client, db, auth_as_admin, admin_user, setup_data ): - """Admin nao aparece como anotador no all-progress.""" resp = client.get("/annotate/all-progress") assert resp.status_code == 200 data = resp.json() - for entry in data: - assert entry["annotator_name"] != admin_user.name + for p in data: + assert p["annotator_name"] != admin_user.name def test_all_progress_requer_admin(self, client, auth_as_user): - """Endpoint all-progress requer role admin.""" resp = client.get("/annotate/all-progress") assert resp.status_code == 403 class TestImportAnnotationsChunk: def test_import_chunk_retorna_totais(self, client, auth_as_user, setup_data): - """Import-chunk retorna contadores do batch.""" - comments = setup_data["comments"] + entry = setup_data["entry"] resp = client.post( "/annotate/import-chunk", json={ "annotations": [ { - "comment_db_id": str(comments[0].id), + "entry_id": str(entry.id), "label": "humano", }, ], @@ -835,19 +777,15 @@ def test_import_chunk_retorna_totais(self, client, auth_as_user, setup_data): class TestExportWithDatasetFilter: def test_export_json_com_dataset_id(self, client, auth_as_user, setup_data): - """Export JSON filtrado por dataset_id inclui metadados.""" - comment = setup_data["comments"][0] + entry = setup_data["entry"] ds = setup_data["dataset"] client.post( "/annotate", - json={ - "comment_db_id": str(comment.id), - "label": "humano", - }, + json={"entry_id": str(entry.id), "label": "humano"}, ) - resp = client.get(f"/annotate/export?format=json" f"&dataset_id={ds.id}") + resp = client.get(f"/annotate/export?format=json&dataset_id={ds.id}") assert resp.status_code == 200 data = resp.json() assert "dataset_id" in data @@ -856,32 +794,23 @@ def test_export_json_com_dataset_id(self, client, auth_as_user, setup_data): assert "annotations" in data def test_export_csv_com_dataset_id(self, client, auth_as_user, setup_data): - """Export CSV filtrado por dataset_id funciona.""" - comment = setup_data["comments"][0] + entry = setup_data["entry"] ds = setup_data["dataset"] client.post( "/annotate", - json={ - "comment_db_id": str(comment.id), - "label": "humano", - }, + json={"entry_id": str(entry.id), "label": "humano"}, ) - resp = client.get(f"/annotate/export?format=csv" f"&dataset_id={ds.id}") + resp = client.get(f"/annotate/export?format=csv&dataset_id={ds.id}") assert resp.status_code == 200 assert "text/csv" in resp.headers["content-type"] lines = resp.text.strip().split("\n") - assert lines[0] == "comment_db_id,label,justificativa" - - -# ------------------------------------------------------------------- -# Cobertura adicional — get_all_progress total_comments == 0 -# ------------------------------------------------------------------- + assert lines[0] == "entry_id,author_channel_id,label,justificativa" class TestGetAllProgressZeroComments: - def test_dataset_sem_comments_pulado_no_all_progress( + def test_dataset_sem_entries_pulado_no_all_progress( self, client, db, @@ -889,9 +818,7 @@ def test_dataset_sem_comments_pulado_no_all_progress( admin_user, regular_user, ): - """Dataset com 0 comentarios e ignorado no all-progress.""" col = _make_collection(db, admin_user.id, video_id="vid_empty_prog") - # Dataset com entry mas sem comentarios reais ds = Dataset( name=f"empty_ap_{uuid.uuid4().hex[:6]}", collection_id=col.id, @@ -907,6 +834,5 @@ def test_dataset_sem_comments_pulado_no_all_progress( resp = client.get("/annotate/all-progress") assert resp.status_code == 200 data = resp.json() - # Dataset sem comments nao aparece ds_ids = [p["dataset_id"] for p in data] assert str(ds.id) not in ds_ids diff --git a/backend/tests/test_dashboard.py b/backend/tests/test_dashboard.py index ad6dacd..8e3786b 100644 --- a/backend/tests/test_dashboard.py +++ b/backend/tests/test_dashboard.py @@ -63,6 +63,7 @@ def _make_dataset( ) db.add(ds) db.flush() + entries = [] for channel_id in author_channel_ids: entry = DatasetEntry( dataset_id=ds.id, @@ -72,13 +73,14 @@ def _make_dataset( matched_criteria=criteria, ) db.add(entry) + entries.append(entry) db.flush() - return ds + return ds, entries -def _annotate(db, comment, user, label, justificativa=None): +def _annotate(db, entry, user, label, justificativa=None): ann = Annotation( - comment_id=comment.id, + dataset_entry_id=entry.id, annotator_id=user.id, label=label, justificativa=justificativa, @@ -91,7 +93,7 @@ def _annotate(db, comment, user, label, justificativa=None): def _make_conflict( db, - comment, + entry, ann_a, ann_b, *, @@ -100,7 +102,7 @@ def _make_conflict( status="pending", ): conflict = AnnotationConflict( - comment_id=comment.id, + dataset_entry_id=entry.id, annotation_a_id=ann_a.id, annotation_b_id=ann_b.id, status=status, @@ -121,16 +123,17 @@ def _assert_valid_plotly_json(chart_json: str): def _populate_full_scenario(db, user_a, user_b): - """Cria cenário completo: 2 vídeos, 3 datasets, anotações, conflitos. + """Cria cenário completo: 2 vídeos, 3 datasets, anotações por entry. + Unidade de anotação: entry (autor/canal do YouTube). Retorna dict com referências para assertions nos testes. """ # Vídeo 1 — 2 datasets com critérios diferentes col1 = _make_collection(db, user_a.id, video_id="vid_alpha") - comments_a1 = _make_comments(db, col1.id, "author_a", count=3) - comments_b1 = _make_comments(db, col1.id, "author_b", count=2) + _make_comments(db, col1.id, "author_a", count=3) + _make_comments(db, col1.id, "author_b", count=2) - ds1 = _make_dataset( + ds1, entries1 = _make_dataset( db, col1.id, user_a.id, @@ -138,7 +141,7 @@ def _populate_full_scenario(db, user_a, user_b): criteria=["percentil"], name="alpha_percentil", ) - ds2 = _make_dataset( + ds2, entries2 = _make_dataset( db, col1.id, user_a.id, @@ -147,63 +150,52 @@ def _populate_full_scenario(db, user_a, user_b): name="alpha_media_curtos", ) - # Vídeo 2 — 1 dataset + # Vídeo 2 — 1 dataset com 2 autores col2 = _make_collection(db, user_a.id, video_id="vid_beta") - comments_c2 = _make_comments(db, col2.id, "author_c", count=4) - ds3 = _make_dataset( + _make_comments(db, col2.id, "author_c", count=4) + _make_comments(db, col2.id, "author_d", count=2) + ds3, entries3 = _make_dataset( db, col2.id, user_a.id, - ["author_c"], + ["author_c", "author_d"], criteria=["percentil", "identicos"], name="beta_percentil_identicos", ) - # ── Anotações do vídeo 1 ── + entry_a = entries1[0] # author_a + entry_b = entries2[0] # author_b + entry_c = entries3[0] # author_c + entry_d = entries3[1] # author_d - # ds1: author_a → 3 comentários - # comment_a1[0]: consenso bot (ambos dizem bot) - _annotate(db, comments_a1[0], user_a, "bot", "spam") - _annotate(db, comments_a1[0], user_b, "bot", "concordo") + # ── Anotações por entry (usuario) ── - # comment_a1[1]: consenso humano - _annotate(db, comments_a1[1], user_a, "humano") - _annotate(db, comments_a1[1], user_b, "humano") + # entry_a (author_a): conflito pendente (A=bot, B=humano) + ann_ea_a = _annotate(db, entry_a, user_a, "bot", "suspeito") + ann_ea_b = _annotate(db, entry_a, user_b, "humano") + _make_conflict(db, entry_a, ann_ea_a, ann_ea_b, status="pending") - # comment_a1[2]: conflito pendente - ann_a2_a = _annotate(db, comments_a1[2], user_a, "bot", "suspeito") - ann_a2_b = _annotate(db, comments_a1[2], user_b, "humano") - _make_conflict(db, comments_a1[2], ann_a2_a, ann_a2_b, status="pending") - - # ds2: author_b → 2 comentários - # comment_b1[0]: conflito resolvido como bot - ann_b0_a = _annotate(db, comments_b1[0], user_a, "bot", "repetitivo") - ann_b0_b = _annotate(db, comments_b1[0], user_b, "humano") + # entry_b (author_b): conflito resolvido como bot + ann_eb_a = _annotate(db, entry_b, user_a, "bot", "repetitivo") + ann_eb_b = _annotate(db, entry_b, user_b, "humano") _make_conflict( db, - comments_b1[0], - ann_b0_a, - ann_b0_b, + entry_b, + ann_eb_a, + ann_eb_b, status="resolved", resolved_by=user_a.id, resolved_label="bot", ) - # comment_b1[1]: apenas user_a anotou (1 anotação) - _annotate(db, comments_b1[1], user_a, "humano") - - # ── Anotações do vídeo 2 ── + # entry_c (author_c): consenso humano + _annotate(db, entry_c, user_a, "humano") + _annotate(db, entry_c, user_b, "humano") - # ds3: author_c → 4 comentários - # comment_c2[0]: consenso humano - _annotate(db, comments_c2[0], user_a, "humano") - _annotate(db, comments_c2[0], user_b, "humano") + # entry_d (author_d): consenso bot + _annotate(db, entry_d, user_a, "bot", "bot óbvio") + _annotate(db, entry_d, user_b, "bot", "concordo") - # comment_c2[1]: consenso bot - _annotate(db, comments_c2[1], user_a, "bot", "bot óbvio") - _annotate(db, comments_c2[1], user_b, "bot", "concordo") - - # comment_c2[2] e [3]: sem anotação db.commit() return { @@ -212,9 +204,10 @@ def _populate_full_scenario(db, user_a, user_b): "ds1": ds1, "ds2": ds2, "ds3": ds3, - "comments_a1": comments_a1, - "comments_b1": comments_b1, - "comments_c2": comments_c2, + "entry_a": entry_a, + "entry_b": entry_b, + "entry_c": entry_c, + "entry_d": entry_d, } @@ -234,17 +227,16 @@ def test_retorna_kpis_e_charts_validos(self, client, db, auth_as_user, admin_use s = data["summary"] assert s["total_datasets"] == 3 - # bots: consenso bot em vid_alpha (author_a[0]) + resolvido bot (author_b[0]) - # + consenso bot em vid_beta (author_c[1]) = 3 - assert s["total_bots"] == 3 - # humanos: consenso humano (a[1]) + 1 anotação humano (b[1]) + consenso (c[0]) = 3 - assert s["total_humans"] == 3 - # conflitos totais: 2 (pendente + resolvido) + # bots: resolvido bot (entry_b) + consenso bot (entry_d) = 2 + assert s["total_bots"] == 2 + # humanos: consenso humano (entry_c) = 1 + assert s["total_humans"] == 1 + # conflitos totais: 2 (pendente entry_a + resolvido entry_b) assert s["total_conflicts"] == 2 assert s["pending_conflicts"] == 1 # Progresso geral - assert s["total_comments_in_datasets"] > 0 + assert s["total_users_in_datasets"] == 4 assert 0 <= s["annotation_progress"] <= 100 # Charts válidos @@ -307,11 +299,13 @@ def test_agreement_rate_correto(self, client, db, auth_as_user, admin_user): data = resp.json() rate = data["summary"]["agreement_rate"] - # Com 2 anotações: - # vid_alpha: a[0]=consenso, a[1]=consenso, a[2]=conflito, b[0]=conflito → 2 consenso / 4 - # vid_beta: c[0]=consenso, c[1]=consenso → 2 consenso / 2 - # Total: 4 consenso / 6 = 0.6667 - assert rate == round(4 / 6, 4) + # Com 2 anotações (por entry): + # entry_a: conflito (diverge) → 0 + # entry_b: conflito (diverge) → 0 + # entry_c: consenso humano → 1 + # entry_d: consenso bot → 1 + # Total: 2 consenso / 4 = 0.5 + assert rate == round(2 / 4, 4) # --------------------------------------------------------------------------- @@ -334,7 +328,8 @@ def test_retorna_dados_do_video_filtrado( s = data["summary"] # vid_alpha tem 5 comentários coletados (3 de author_a + 2 de author_b) assert s["total_comments_collected"] == 5 - assert s["total_comments_in_datasets"] == 5 + # 2 entries (author_a + author_b) + assert s["total_users_in_datasets"] == 2 # Charts válidos for key in [ @@ -367,8 +362,8 @@ def test_video_com_filtro_criteria(self, client, db, auth_as_user, admin_user): assert resp.status_code == 200 data = resp.json() - # Apenas alpha_media_curtos tem ambos media e curtos - assert data["summary"]["total_comments_in_datasets"] == 2 + # Apenas alpha_media_curtos tem ambos media e curtos (1 entry: author_b) + assert data["summary"]["total_users_in_datasets"] == 1 # --------------------------------------------------------------------------- @@ -388,13 +383,12 @@ def test_retorna_dados_do_pesquisador_autenticado( data = resp.json() s = data["summary"] - # auth_as_user anotou: - # vid_alpha: a[0]=bot, a[1]=humano, a[2]=bot, b[0]=bot, b[1]=humano - # vid_beta: c[0]=humano, c[1]=bot - # Total: 7 anotados, bots=4, humans=3 - assert s["total_annotated"] == 7 - assert s["bots"] == 4 - assert s["humans"] == 3 + # auth_as_user (user_a) anotou por entry: + # entry_a=bot, entry_b=bot, entry_c=humano, entry_d=bot + # Total: 4 anotados, bots=3, humans=1 + assert s["total_annotated"] == 4 + assert s["bots"] == 3 + assert s["humans"] == 1 assert s["total_datasets_assigned"] == 3 assert len(data["datasets"]) == 3 @@ -438,20 +432,24 @@ def test_sem_dados_retorna_zeros(self, client, auth_as_user): def test_dataset_status_correto(self, client, db, auth_as_user, admin_user): """Verifica status completed, in_progress e not_started.""" col = _make_collection(db, auth_as_user.id, video_id="vid_status") - comments_a = _make_comments(db, col.id, "ch_a", count=2) - comments_b = _make_comments(db, col.id, "ch_b", count=2) + _make_comments(db, col.id, "ch_a", count=2) + _make_comments(db, col.id, "ch_b1", count=2) + _make_comments(db, col.id, "ch_b2", count=2) _make_comments(db, col.id, "ch_c", count=2) - _make_dataset(db, col.id, auth_as_user.id, ["ch_a"], name="ds_done") - _make_dataset(db, col.id, auth_as_user.id, ["ch_b"], name="ds_partial") + ds_done, entries_done = _make_dataset( + db, col.id, auth_as_user.id, ["ch_a"], name="ds_done" + ) + ds_partial, entries_partial = _make_dataset( + db, col.id, auth_as_user.id, ["ch_b1", "ch_b2"], name="ds_partial" + ) _make_dataset(db, col.id, auth_as_user.id, ["ch_c"], name="ds_empty") - # ds_done: anotar todos - for c in comments_a: - _annotate(db, c, auth_as_user, "humano") + # ds_done: anotar todos os entries (1 entry) + _annotate(db, entries_done[0], auth_as_user, "humano") - # ds_partial: anotar 1 de 2 - _annotate(db, comments_b[0], auth_as_user, "bot", "teste") + # ds_partial: anotar 1 de 2 entries + _annotate(db, entries_partial[0], auth_as_user, "bot", "teste") # ds_empty: nenhuma anotação db.commit() @@ -477,17 +475,13 @@ class TestBotComments: def test_retorna_bots_com_concordancia(self, client, db, auth_as_user, admin_user): """Tabela de bots retorna concordance_pct correto.""" col = _make_collection(db, auth_as_user.id, video_id="vid_bots") - comments = _make_comments(db, col.id, "ch_bot", count=2) - _make_dataset(db, col.id, auth_as_user.id, ["ch_bot"]) - - # Consenso bot no comment[0] - _annotate(db, comments[0], auth_as_user, "bot", "spam") - _annotate(db, comments[0], admin_user, "bot", "concordo") + _make_comments(db, col.id, "ch_bot", count=2) + ds, entries = _make_dataset(db, col.id, auth_as_user.id, ["ch_bot"]) + entry = entries[0] - # Conflito no comment[1] - ann_a = _annotate(db, comments[1], auth_as_user, "bot", "suspeito") - ann_b = _annotate(db, comments[1], admin_user, "humano") - _make_conflict(db, comments[1], ann_a, ann_b, status="pending") + # Consenso bot no entry + _annotate(db, entry, auth_as_user, "bot", "spam") + _annotate(db, entry, admin_user, "bot", "concordo") db.commit() resp = client.get("/dashboard/bots") @@ -503,31 +497,18 @@ def test_sem_token_retorna_401(self, client): resp = client.get("/dashboard/bots") assert resp.status_code == 401 - def test_filtro_por_search(self, client, db, auth_as_user, admin_user): - """Filtro de busca por texto do comentário.""" + def test_filtro_por_author(self, client, db, auth_as_user, admin_user): + """Filtro de busca por author_display_name.""" col = _make_collection(db, auth_as_user.id, video_id="vid_search") - c = Comment( - collection_id=col.id, - comment_id="unique_search_c1", - author_channel_id="ch_search", - author_display_name="Busca User", - text_original="TEXTO ÚNICO PARA BUSCA", - like_count=0, - reply_count=0, - published_at=datetime(2024, 1, 1), - updated_at=datetime(2024, 1, 1), - ) - db.add(c) - db.flush() - _make_dataset(db, col.id, auth_as_user.id, ["ch_search"]) - _annotate(db, c, auth_as_user, "bot", "teste busca") + _make_comments(db, col.id, "ch_search", count=1) + ds, entries = _make_dataset(db, col.id, auth_as_user.id, ["ch_search"]) + _annotate(db, entries[0], auth_as_user, "bot", "teste busca") db.commit() - resp = client.get("/dashboard/bots?search=ÚNICO PARA") + resp = client.get("/dashboard/bots?author=ch_search") assert resp.status_code == 200 data = resp.json() assert data["total"] >= 1 - assert any("ÚNICO" in i["text_original"] for i in data["items"]) # --------------------------------------------------------------------------- @@ -611,23 +592,22 @@ def test_video_nao_expoe_username(self, client, db, auth_as_user, admin_user): # --------------------------------------------------------------------------- -class TestClassifyCommentSingleLabel: +class TestClassifyEntrySingleLabel: def test_single_annotation_consensus(self, client, db, auth_as_user, admin_user): """1 anotacao apenas classifica pelo label unico.""" col = _make_collection(db, auth_as_user.id, video_id="vid_single") - comments = _make_comments(db, col.id, "ch_single", count=2) - _make_dataset( + _make_comments(db, col.id, "ch_single_a", count=1) + _make_comments(db, col.id, "ch_single_b", count=1) + ds, entries = _make_dataset( db, col.id, auth_as_user.id, - ["ch_single"], + ["ch_single_a", "ch_single_b"], name="ds_single", ) - # Apenas 1 anotador classifica como bot - _annotate(db, comments[0], auth_as_user, "bot", "unico") - # Apenas 1 anotador classifica como humano - _annotate(db, comments[1], auth_as_user, "humano") + _annotate(db, entries[0], auth_as_user, "bot", "unico") + _annotate(db, entries[1], auth_as_user, "humano") db.commit() resp = client.get("/dashboard/global") @@ -677,20 +657,20 @@ def test_dataset_with_no_comments_skipped(self, client, db, auth_as_user): assert data["summary"]["total_datasets_assigned"] == 0 -class TestBotCommentsFilters: +class TestBotUsersFilters: def test_filter_by_dataset_id(self, client, db, auth_as_user, admin_user): """Filtro por dataset_id retorna apenas bots daquele ds.""" col = _make_collection(db, auth_as_user.id, video_id="vid_filt_ds") - comments_a = _make_comments(db, col.id, "ch_fa", count=2) - comments_b = _make_comments(db, col.id, "ch_fb", count=2) - ds_a = _make_dataset( + _make_comments(db, col.id, "ch_fa", count=2) + _make_comments(db, col.id, "ch_fb", count=2) + ds_a, entries_a = _make_dataset( db, col.id, auth_as_user.id, ["ch_fa"], name="ds_filt_a", ) - _make_dataset( + ds_b, entries_b = _make_dataset( db, col.id, auth_as_user.id, @@ -698,8 +678,8 @@ def test_filter_by_dataset_id(self, client, db, auth_as_user, admin_user): name="ds_filt_b", ) - _annotate(db, comments_a[0], auth_as_user, "bot", "spam") - _annotate(db, comments_b[0], auth_as_user, "bot", "spam") + _annotate(db, entries_a[0], auth_as_user, "bot", "spam") + _annotate(db, entries_b[0], auth_as_user, "bot", "spam") db.commit() resp = client.get(f"/dashboard/bots?dataset_id={ds_a.id}") @@ -711,15 +691,15 @@ def test_filter_by_dataset_id(self, client, db, auth_as_user, admin_user): def test_filter_by_video_id(self, client, db, auth_as_user): """Filtro por video_id retorna bots daquele video.""" col = _make_collection(db, auth_as_user.id, video_id="vid_filt_vid") - comments = _make_comments(db, col.id, "ch_fv", count=2) - _make_dataset( + _make_comments(db, col.id, "ch_fv", count=2) + ds, entries = _make_dataset( db, col.id, auth_as_user.id, ["ch_fv"], name="ds_fv", ) - _annotate(db, comments[0], auth_as_user, "bot", "teste") + _annotate(db, entries[0], auth_as_user, "bot", "teste") db.commit() resp = client.get("/dashboard/bots?video_id=vid_filt_vid") @@ -729,15 +709,15 @@ def test_filter_by_video_id(self, client, db, auth_as_user): def test_filter_by_author(self, client, db, auth_as_user): """Filtro por author busca por display name.""" col = _make_collection(db, auth_as_user.id, video_id="vid_filt_auth") - comments = _make_comments(db, col.id, "ch_auth_filt", count=2) - _make_dataset( + _make_comments(db, col.id, "ch_auth_filt", count=2) + ds, entries = _make_dataset( db, col.id, auth_as_user.id, ["ch_auth_filt"], name="ds_auth_filt", ) - _annotate(db, comments[0], auth_as_user, "bot", "teste") + _annotate(db, entries[0], auth_as_user, "bot", "teste") db.commit() resp = client.get("/dashboard/bots?author=ch_auth_filt") @@ -747,8 +727,8 @@ def test_filter_by_author(self, client, db, auth_as_user): def test_filter_by_criteria(self, client, db, auth_as_user): """Filtro por criteria retorna bots de datasets com aquele criterio.""" col = _make_collection(db, auth_as_user.id, video_id="vid_filt_crit") - comments = _make_comments(db, col.id, "ch_crit", count=2) - _make_dataset( + _make_comments(db, col.id, "ch_crit", count=2) + ds, entries = _make_dataset( db, col.id, auth_as_user.id, @@ -756,7 +736,7 @@ def test_filter_by_criteria(self, client, db, auth_as_user): criteria=["intervalo"], name="ds_crit", ) - _annotate(db, comments[0], auth_as_user, "bot", "teste") + _annotate(db, entries[0], auth_as_user, "bot", "teste") db.commit() resp = client.get("/dashboard/bots?criteria=intervalo") @@ -871,7 +851,8 @@ def test_dataset_with_entries_but_no_matching_comments( resp = client.get("/dashboard/global") assert resp.status_code == 200 s = resp.json()["summary"] - assert s["total_comments_in_datasets"] == 0 + # Entry existe mas sem comentários — ainda conta como usuário no dataset + assert s["total_users_in_datasets"] == 1 class TestBotRateByCriteriaEmptyCids: @@ -906,14 +887,14 @@ def test_dataset_with_ghost_entries_skipped_in_bot_rate( _assert_valid_plotly_json(resp.json()["bot_rate_by_criteria_chart"]) -class TestClassifyCommentMultiLabelNoConflict: +class TestClassifyEntryMultiLabelNoConflict: def test_divergent_labels_without_conflict_returns_none( self, client, db, auth_as_user, admin_user ): """Anotacoes divergentes sem AnnotationConflict: classificacao None.""" col = _make_collection(db, auth_as_user.id, video_id="vid_noconf") - comments = _make_comments(db, col.id, "ch_noconf", count=1) - _make_dataset( + _make_comments(db, col.id, "ch_noconf", count=1) + ds, entries = _make_dataset( db, col.id, auth_as_user.id, @@ -923,13 +904,13 @@ def test_divergent_labels_without_conflict_returns_none( # Inserir anotacoes divergentes SEM conflito ann_a = Annotation( - comment_id=comments[0].id, + dataset_entry_id=entries[0].id, annotator_id=auth_as_user.id, label="bot", justificativa="suspeito", ) ann_b = Annotation( - comment_id=comments[0].id, + dataset_entry_id=entries[0].id, annotator_id=admin_user.id, label="humano", ) @@ -938,8 +919,7 @@ def test_divergent_labels_without_conflict_returns_none( resp = client.get("/dashboard/global") assert resp.status_code == 200 - # O comentario nao eh classificado (None) - # Total de bots e humanos nao inclui este caso + # O entry nao eh classificado (None) s = resp.json()["summary"] assert s["total_bots"] == 0 assert s["total_humans"] == 0 diff --git a/backend/tests/test_data.py b/backend/tests/test_data.py index ac73520..fe2af28 100644 --- a/backend/tests/test_data.py +++ b/backend/tests/test_data.py @@ -56,6 +56,7 @@ def _make_dataset(db, collection_id, user_id, author_channel_ids): ) db.add(ds) db.flush() + entries = [] for channel_id in author_channel_ids: entry = DatasetEntry( dataset_id=ds.id, @@ -65,8 +66,9 @@ def _make_dataset(db, collection_id, user_id, author_channel_ids): matched_criteria=["percentil"], ) db.add(entry) + entries.append(entry) db.flush() - return ds + return ds, entries # --------------------------------------------------------------------------- @@ -183,7 +185,7 @@ def test_datasets_retorna_campos_completos(self, client, db, auth_as_user): col = _make_collection(db, auth_as_user.id, video_id="vid_ds") _make_comments(db, col.id, "ch1", count=4) _make_comments(db, col.id, "ch2", count=3) - ds = _make_dataset(db, col.id, auth_as_user.id, ["ch1", "ch2"]) + ds, _ = _make_dataset(db, col.id, auth_as_user.id, ["ch1", "ch2"]) db.commit() resp = client.get("/data/datasets") @@ -221,37 +223,29 @@ def test_annotations_sem_token_retorna_401(self, client): def test_annotations_progresso_com_conflito_pendente( self, client, db, auth_as_user, admin_user ): - """Dataset com conflito pendente: bots_comments=0 (sem rótulo final).""" + """Dataset com conflito pendente: bots=0 (sem rótulo final).""" col = _make_collection(db, auth_as_user.id) - comments = _make_comments(db, col.id, "ch_ann", count=3) - ds = _make_dataset(db, col.id, auth_as_user.id, ["ch_ann"]) + _make_comments(db, col.id, "ch_ann", count=3) + ds, entries = _make_dataset(db, col.id, auth_as_user.id, ["ch_ann"]) + entry = entries[0] - # Anotar 2 dos 3 comentários + # Anotações divergentes → conflito pendente ann1 = Annotation( - comment_id=comments[0].id, + dataset_entry_id=entry.id, annotator_id=auth_as_user.id, label="bot", justificativa="teste", ) - ann2 = Annotation( - comment_id=comments[1].id, - annotator_id=auth_as_user.id, - label="humano", - ) - db.add_all([ann1, ann2]) - db.flush() - - # Conflito pendente no primeiro comentário ann1_admin = Annotation( - comment_id=comments[0].id, + dataset_entry_id=entry.id, annotator_id=admin_user.id, label="humano", ) - db.add(ann1_admin) + db.add_all([ann1, ann1_admin]) db.flush() conflict = AnnotationConflict( - comment_id=comments[0].id, + dataset_entry_id=entry.id, annotation_a_id=ann1.id, annotation_b_id=ann1_admin.id, status="pending", @@ -265,26 +259,24 @@ def test_annotations_progresso_com_conflito_pendente( assert len(data) == 1 progress = data[0] assert progress["dataset_id"] == str(ds.id) - assert progress["total"] == 3 - assert progress["annotated"] == 2 - assert progress["pending"] == 1 + assert progress["total_users"] == 1 + assert progress["annotated_users"] == 1 + assert progress["pending_users"] == 0 assert progress["conflicts"] == 1 assert progress["conflicts_resolved"] == 0 assert progress["annotators_count"] == 2 - # Conflito pendente → sem rótulo final de bot - assert progress["bots_comments"] == 0 - assert progress["bots_users"] == 0 + assert progress["bots"] == 0 def test_bots_por_consenso(self, client, db, auth_as_user, admin_user): - """Dois anotadores concordam como bot → bots_comments=1, bots_users=1.""" + """Dois anotadores concordam como bot → bots=1.""" col = _make_collection(db, auth_as_user.id, video_id="vid_bot_cons") - comments = _make_comments(db, col.id, "ch_bot", count=2) - _make_dataset(db, col.id, auth_as_user.id, ["ch_bot"]) + _make_comments(db, col.id, "ch_bot", count=2) + ds, entries = _make_dataset(db, col.id, auth_as_user.id, ["ch_bot"]) + entry = entries[0] - # Ambos anotam comment[0] como bot (consenso) db.add( Annotation( - comment_id=comments[0].id, + dataset_entry_id=entry.id, annotator_id=auth_as_user.id, label="bot", justificativa="padrão", @@ -292,55 +284,45 @@ def test_bots_por_consenso(self, client, db, auth_as_user, admin_user): ) db.add( Annotation( - comment_id=comments[0].id, + dataset_entry_id=entry.id, annotator_id=admin_user.id, label="bot", justificativa="concordo", ) ) - # Comment[1] anotado como humano - db.add( - Annotation( - comment_id=comments[1].id, - annotator_id=auth_as_user.id, - label="humano", - ) - ) db.commit() resp = client.get("/data/annotations") data = resp.json() assert len(data) == 1 progress = data[0] - assert progress["bots_comments"] == 1 - assert progress["bots_users"] == 1 - assert progress["annotated"] == 2 + assert progress["bots"] == 1 + assert progress["annotated_users"] == 1 assert progress["conflicts"] == 0 def test_bots_por_conflito_resolvido(self, client, db, auth_as_user, admin_user): - """Conflito resolvido como bot → conta em bots_comments e bots_users.""" + """Conflito resolvido como bot → conta em bots.""" col = _make_collection(db, auth_as_user.id, video_id="vid_bot_res") - comments = _make_comments(db, col.id, "ch_resolved", count=1) - _make_dataset(db, col.id, auth_as_user.id, ["ch_resolved"]) + _make_comments(db, col.id, "ch_resolved", count=1) + ds, entries = _make_dataset(db, col.id, auth_as_user.id, ["ch_resolved"]) + entry = entries[0] - # Anotações divergentes ann_a = Annotation( - comment_id=comments[0].id, + dataset_entry_id=entry.id, annotator_id=auth_as_user.id, label="bot", justificativa="spam", ) ann_b = Annotation( - comment_id=comments[0].id, + dataset_entry_id=entry.id, annotator_id=admin_user.id, label="humano", ) db.add_all([ann_a, ann_b]) db.flush() - # Conflito resolvido como bot conflict = AnnotationConflict( - comment_id=comments[0].id, + dataset_entry_id=entry.id, annotation_a_id=ann_a.id, annotation_b_id=ann_b.id, status="resolved", @@ -355,27 +337,27 @@ def test_bots_por_conflito_resolvido(self, client, db, auth_as_user, admin_user) data = resp.json() assert len(data) == 1 progress = data[0] - assert progress["bots_comments"] == 1 - assert progress["bots_users"] == 1 + assert progress["bots"] == 1 assert progress["conflicts"] == 1 assert progress["conflicts_resolved"] == 1 def test_conflito_resolvido_como_humano_nao_conta_bot( self, client, db, auth_as_user, admin_user ): - """Conflito resolvido como humano → bots_comments=0.""" + """Conflito resolvido como humano → bots=0.""" col = _make_collection(db, auth_as_user.id, video_id="vid_hum_res") - comments = _make_comments(db, col.id, "ch_hum", count=1) - _make_dataset(db, col.id, auth_as_user.id, ["ch_hum"]) + _make_comments(db, col.id, "ch_hum", count=1) + ds, entries = _make_dataset(db, col.id, auth_as_user.id, ["ch_hum"]) + entry = entries[0] ann_a = Annotation( - comment_id=comments[0].id, + dataset_entry_id=entry.id, annotator_id=auth_as_user.id, label="bot", justificativa="parece bot", ) ann_b = Annotation( - comment_id=comments[0].id, + dataset_entry_id=entry.id, annotator_id=admin_user.id, label="humano", ) @@ -383,7 +365,7 @@ def test_conflito_resolvido_como_humano_nao_conta_bot( db.flush() conflict = AnnotationConflict( - comment_id=comments[0].id, + dataset_entry_id=entry.id, annotation_a_id=ann_a.id, annotation_b_id=ann_b.id, status="resolved", @@ -397,13 +379,11 @@ def test_conflito_resolvido_como_humano_nao_conta_bot( resp = client.get("/data/annotations") data = resp.json() progress = data[0] - assert progress["bots_comments"] == 0 - assert progress["bots_users"] == 0 + assert progress["bots"] == 0 def test_dataset_sem_entries_retorna_zeros(self, client, db, auth_as_user): """Dataset vazio (sem entries) retorna todos os contadores zerados.""" col = _make_collection(db, auth_as_user.id, video_id="vid_empty") - # Dataset sem entries ds = Dataset( name=f"empty_{uuid.uuid4().hex[:6]}", collection_id=col.id, @@ -420,9 +400,9 @@ def test_dataset_sem_entries_retorna_zeros(self, client, db, auth_as_user): data = resp.json() assert len(data) == 1 progress = data[0] - assert progress["total"] == 0 - assert progress["annotated"] == 0 - assert progress["bots_comments"] == 0 + assert progress["total_users"] == 0 + assert progress["annotated_users"] == 0 + assert progress["bots"] == 0 assert progress["annotators_count"] == 0 @@ -479,4 +459,5 @@ def test_dataset_entry_with_no_comments(self, client, db, auth_as_user): assert resp.status_code == 200 data = resp.json() assert len(data) == 1 - assert data[0]["total"] == 0 + assert data[0]["total_users"] == 1 + assert data[0]["annotated_users"] == 0 diff --git a/backend/tests/test_review.py b/backend/tests/test_review.py index 58c3263..d085e6d 100644 --- a/backend/tests/test_review.py +++ b/backend/tests/test_review.py @@ -47,7 +47,7 @@ def _make_comments(db, collection_id, author_channel_id, count=3): return comments -def _make_dataset(db, collection_id, user_id, author_channel_ids): +def _make_dataset_with_entries(db, collection_id, user_id, author_channel_ids): ds = Dataset( name=f"test_dataset_{uuid.uuid4().hex[:6]}", collection_id=collection_id, @@ -59,6 +59,7 @@ def _make_dataset(db, collection_id, user_id, author_channel_ids): ) db.add(ds) db.flush() + entries = [] for channel_id in author_channel_ids: entry = DatasetEntry( dataset_id=ds.id, @@ -68,20 +69,21 @@ def _make_dataset(db, collection_id, user_id, author_channel_ids): matched_criteria=["percentil"], ) db.add(entry) + entries.append(entry) db.flush() - return ds + return ds, entries -def _make_conflict(db, comment, user_a, user_b, *, label_a="bot", label_b="humano"): - """Cria duas anotacoes divergentes e o AnnotationConflict correspondente.""" +def _make_conflict(db, entry, user_a, user_b, *, label_a="bot", label_b="humano"): + """Cria duas anotacoes divergentes por entry e o AnnotationConflict.""" ann_a = Annotation( - comment_id=comment.id, + dataset_entry_id=entry.id, annotator_id=user_a.id, label=label_a, justificativa="Repetitivo" if label_a == "bot" else None, ) ann_b = Annotation( - comment_id=comment.id, + dataset_entry_id=entry.id, annotator_id=user_b.id, label=label_b, justificativa="Repetitivo" if label_b == "bot" else None, @@ -90,7 +92,7 @@ def _make_conflict(db, comment, user_a, user_b, *, label_a="bot", label_b="human db.flush() conflict = AnnotationConflict( - comment_id=comment.id, + dataset_entry_id=entry.id, annotation_a_id=ann_a.id, annotation_b_id=ann_b.id, ) @@ -134,16 +136,18 @@ def annotator_b(db): @pytest.fixture def setup_conflict(db, admin_user, annotator_a, annotator_b): - """Cria cenario completo: coleta + comentarios + dataset + conflito.""" + """Cria cenario completo: coleta + comentarios + dataset + entry + conflito.""" col = _make_collection(db, admin_user.id) comments = _make_comments(db, col.id, "UC_suspect1") - ds = _make_dataset(db, col.id, admin_user.id, ["UC_suspect1"]) - conflict, ann_a, ann_b = _make_conflict(db, comments[0], annotator_a, annotator_b) + ds, entries = _make_dataset_with_entries(db, col.id, admin_user.id, ["UC_suspect1"]) + entry = entries[0] + conflict, ann_a, ann_b = _make_conflict(db, entry, annotator_a, annotator_b) db.commit() return { "collection": col, "comments": comments, "dataset": ds, + "entry": entry, "conflict": conflict, "ann_a": ann_a, "ann_b": ann_b, @@ -193,13 +197,12 @@ def test_lista_conflitos_pendentes(self, client, auth_as_admin, setup_conflict): resp = client.get("/review/conflicts?status=pending") assert resp.status_code == 200 body = resp.json() - assert "items" in body data = body["items"] assert len(data) == 1 assert data[0]["status"] == "pending" assert data[0]["label_a"] == "bot" assert data[0]["label_b"] == "humano" - assert "text_original" in data[0] + assert "entry_id" in data[0] assert "dataset_id" in data[0] assert body["total"] == 1 @@ -263,7 +266,6 @@ def test_resolve_como_bot(self, client, db, auth_as_admin, setup_conflict): assert data["resolved_label"] == "bot" assert data["resolved_by"] == "Admin Teste" - # Verificar que Resolution foi criada no banco resolution = ( db.query(Resolution) .filter(Resolution.conflict_id == setup_conflict["conflict"].id) @@ -285,12 +287,10 @@ def test_conflito_ja_resolvido_retorna_409( self, client, auth_as_admin, setup_conflict ): cid = str(setup_conflict["conflict"].id) - # Primeira resolucao client.post( "/review/resolve", json={"conflict_id": cid, "resolved_label": "bot"}, ) - # Segunda tentativa resp = client.post( "/review/resolve", json={"conflict_id": cid, "resolved_label": "humano"}, @@ -326,72 +326,74 @@ def test_lista_usuarios_com_anotacao_bot( self, client, db, auth_as_admin, admin_user, annotator_a, annotator_b ): col = _make_collection(db, admin_user.id) - comments_a = _make_comments(db, col.id, "UC_botA", count=2) - comments_b = _make_comments(db, col.id, "UC_humanB", count=2) - _make_dataset(db, col.id, admin_user.id, ["UC_botA", "UC_humanB"]) - - # UC_botA: ambos anotam como bot (consenso) - for c in comments_a: - db.add( - Annotation( - comment_id=c.id, - annotator_id=annotator_a.id, - label="bot", - justificativa="Spam", - ) + _make_comments(db, col.id, "UC_botA", count=2) + _make_comments(db, col.id, "UC_humanB", count=2) + ds, entries = _make_dataset_with_entries( + db, col.id, admin_user.id, ["UC_botA", "UC_humanB"] + ) + + entry_bot = next(e for e in entries if e.author_channel_id == "UC_botA") + entry_human = next(e for e in entries if e.author_channel_id == "UC_humanB") + + # UC_botA: ambos anotam como bot + db.add( + Annotation( + dataset_entry_id=entry_bot.id, + annotator_id=annotator_a.id, + label="bot", + justificativa="Spam", ) - db.add( - Annotation( - comment_id=c.id, - annotator_id=annotator_b.id, - label="bot", - justificativa="Spam", - ) + ) + db.add( + Annotation( + dataset_entry_id=entry_bot.id, + annotator_id=annotator_b.id, + label="bot", + justificativa="Spam", ) + ) # UC_humanB: ambos anotam como humano - for c in comments_b: - db.add( - Annotation( - comment_id=c.id, - annotator_id=annotator_a.id, - label="humano", - ) + db.add( + Annotation( + dataset_entry_id=entry_human.id, + annotator_id=annotator_a.id, + label="humano", ) - db.add( - Annotation( - comment_id=c.id, - annotator_id=annotator_b.id, - label="humano", - ) + ) + db.add( + Annotation( + dataset_entry_id=entry_human.id, + annotator_id=annotator_b.id, + label="humano", ) + ) db.commit() resp = client.get("/review/bots") assert resp.status_code == 200 body = resp.json() data = body["items"] - # Comentarios de UC_botA aparecem (tem anotacao bot), UC_humanB nao channel_ids = [item["author_channel_id"] for item in data] assert "UC_botA" in channel_ids assert "UC_humanB" not in channel_ids - # Cada item e um comentario com anotacoes detalhadas for item in data: - assert "text_original" in item assert "annotations" in item assert len(item["annotations"]) > 0 def test_consenso_bot_nao_aparece_em_conflicts( self, client, db, auth_as_admin, admin_user, annotator_a, annotator_b ): - """Consenso bot+bot aparece em /bots mas nao em /conflicts.""" col = _make_collection(db, admin_user.id) - comments = _make_comments(db, col.id, "UC_consenso_bot", count=1) - _make_dataset(db, col.id, admin_user.id, ["UC_consenso_bot"]) + _make_comments(db, col.id, "UC_consenso_bot", count=1) + ds, entries = _make_dataset_with_entries( + db, col.id, admin_user.id, ["UC_consenso_bot"] + ) + entry = entries[0] db.add( Annotation( - comment_id=comments[0].id, + dataset_entry_id=entry.id, annotator_id=annotator_a.id, label="bot", justificativa="Spam", @@ -399,7 +401,7 @@ def test_consenso_bot_nao_aparece_em_conflicts( ) db.add( Annotation( - comment_id=comments[0].id, + dataset_entry_id=entry.id, annotator_id=annotator_b.id, label="bot", justificativa="Repetitivo", @@ -407,7 +409,6 @@ def test_consenso_bot_nao_aparece_em_conflicts( ) db.commit() - # Em /bots (por comentario) resp_bots = client.get("/review/bots") bot_items = [ b @@ -417,12 +418,9 @@ def test_consenso_bot_nao_aparece_em_conflicts( assert len(bot_items) == 1 assert bot_items[0]["has_conflict"] is False - # Nao em /conflicts resp_conflicts = client.get("/review/conflicts") - comment_ids_in_conflicts = [ - c["comment_id"] for c in resp_conflicts.json()["items"] - ] - assert str(comments[0].id) not in comment_ids_in_conflicts + entry_ids_in_conflicts = [c["entry_id"] for c in resp_conflicts.json()["items"]] + assert str(entry.id) not in entry_ids_in_conflicts # --------------------------------------------------------------------------- @@ -460,7 +458,6 @@ class TestReviewExport: def test_export_json(self, client, db, auth_as_admin, setup_conflict, admin_user): ds_id = str(setup_conflict["dataset"].id) - # Resolver o conflito primeiro cid = str(setup_conflict["conflict"].id) client.post( "/review/resolve", @@ -470,7 +467,7 @@ def test_export_json(self, client, db, auth_as_admin, setup_conflict, admin_user resp = client.get(f"/review/export?dataset_id={ds_id}&format=json") assert resp.status_code == 200 data = resp.json() - assert "comments" in data + assert "users" in data assert data["dataset_name"].startswith("test_dataset_") def test_export_csv(self, client, auth_as_admin, setup_conflict): @@ -483,7 +480,7 @@ def test_export_csv(self, client, auth_as_admin, setup_conflict): resp = client.get(f"/review/export?dataset_id={ds_id}&format=csv") assert resp.status_code == 200 - assert "comment_db_id" in resp.text + assert "entry_id" in resp.text # --------------------------------------------------------------------------- @@ -493,18 +490,17 @@ def test_export_csv(self, client, auth_as_admin, setup_conflict): class TestReviewImport: def test_import_resolve_conflito(self, client, db, auth_as_admin, setup_conflict): - comment = setup_conflict["comments"][0] + entry = setup_conflict["entry"] resp = client.post( "/review/import", json={ "dataset_name": "test", "video_id": "vid123", - "comments": [ + "users": [ { - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "author_channel_id": "UC_suspect1", "author_display_name": "User UC_suspect1", - "text_original": "texto", "final_label": "bot", "resolution": { "resolved_by": "Admin", @@ -524,12 +520,11 @@ def test_import_video_id_inexistente_retorna_404(self, client, auth_as_admin): json={ "dataset_name": "test", "video_id": "inexistente", - "comments": [ + "users": [ { - "comment_db_id": str(uuid.uuid4()), + "entry_id": str(uuid.uuid4()), "author_channel_id": "UC_x", "author_display_name": "X", - "text_original": "t", "final_label": "bot", "resolution": {"resolved_label": "bot"}, } @@ -540,13 +535,12 @@ def test_import_video_id_inexistente_retorna_404(self, client, auth_as_admin): # --------------------------------------------------------------------------- -# Testes adicionais — cobertura 100% +# Testes adicionais — cobertura # --------------------------------------------------------------------------- class TestListConflictsVideoFilter: def test_filtro_por_video_id(self, client, auth_as_admin, setup_conflict): - """Filtrar conflitos por video_id retorna resultados.""" resp = client.get("/review/conflicts?video_id=vid123") assert resp.status_code == 200 assert len(resp.json()["items"]) == 1 @@ -554,7 +548,6 @@ def test_filtro_por_video_id(self, client, auth_as_admin, setup_conflict): def test_filtro_por_video_id_inexistente( self, client, auth_as_admin, setup_conflict ): - """Filtrar conflitos por video_id inexistente retorna vazio.""" resp = client.get("/review/conflicts?video_id=nao_existe") assert resp.status_code == 200 assert resp.json()["items"] == [] @@ -564,15 +557,10 @@ class TestListConflictsStatusFilter: def test_filtro_por_status_resolved( self, client, db, auth_as_admin, setup_conflict ): - """Filtrar conflitos por status resolved funciona.""" - # Resolver o conflito cid = str(setup_conflict["conflict"].id) client.post( "/review/resolve", - json={ - "conflict_id": cid, - "resolved_label": "bot", - }, + json={"conflict_id": cid, "resolved_label": "bot"}, ) resp = client.get("/review/conflicts?status=resolved") assert resp.status_code == 200 @@ -580,7 +568,6 @@ def test_filtro_por_status_resolved( assert len(items) == 1 assert items[0]["status"] == "resolved" - # Pendentes zerados resp2 = client.get("/review/conflicts?status=pending") assert resp2.json()["items"] == [] @@ -589,15 +576,10 @@ class TestConflictDetailResolved: def test_resolved_conflict_mostra_resolver( self, client, db, auth_as_admin, admin_user, setup_conflict ): - """Conflito resolvido mostra nome do admin.""" cid = str(setup_conflict["conflict"].id) - # Resolver client.post( "/review/resolve", - json={ - "conflict_id": cid, - "resolved_label": "bot", - }, + json={"conflict_id": cid, "resolved_label": "bot"}, ) resp = client.get(f"/review/conflicts/{cid}") assert resp.status_code == 200 @@ -615,13 +597,14 @@ def test_bots_com_filtro_video_id( admin_user, annotator_a, ): - """Filtrar bots por video_id funciona.""" col = _make_collection(db, admin_user.id) - comments = _make_comments(db, col.id, "UC_bot_v", count=1) - _make_dataset(db, col.id, admin_user.id, ["UC_bot_v"]) + _make_comments(db, col.id, "UC_bot_v", count=1) + ds, entries = _make_dataset_with_entries( + db, col.id, admin_user.id, ["UC_bot_v"] + ) db.add( Annotation( - comment_id=comments[0].id, + dataset_entry_id=entries[0].id, annotator_id=annotator_a.id, label="bot", justificativa="Spam", @@ -635,7 +618,6 @@ def test_bots_com_filtro_video_id( assert data["total"] >= 1 def test_bots_sem_resultados_empty_page(self, client, auth_as_admin): - """Sem bots, retorna pagina vazia.""" resp = client.get("/review/bots") assert resp.status_code == 200 body = resp.json() @@ -643,7 +625,6 @@ def test_bots_sem_resultados_empty_page(self, client, auth_as_admin): assert body["total"] == 0 def test_bots_com_dataset_id_inexistente(self, client, auth_as_admin): - """Filtrar bots por dataset_id inexistente retorna vazio.""" resp = client.get(f"/review/bots?dataset_id={uuid.uuid4()}") assert resp.status_code == 200 assert resp.json()["items"] == [] @@ -651,13 +632,11 @@ def test_bots_com_dataset_id_inexistente(self, client, auth_as_admin): class TestExportReviewEdgeCases: def test_export_json_dataset_inexistente(self, client, auth_as_admin): - """Export com dataset_id inexistente retorna 404.""" - resp = client.get(f"/review/export?dataset_id={uuid.uuid4()}" "&format=json") + resp = client.get(f"/review/export?dataset_id={uuid.uuid4()}&format=json") assert resp.status_code == 404 def test_export_csv_dataset_inexistente(self, client, auth_as_admin): - """Export CSV com dataset_id inexistente retorna 404.""" - resp = client.get(f"/review/export?dataset_id={uuid.uuid4()}" "&format=csv") + resp = client.get(f"/review/export?dataset_id={uuid.uuid4()}&format=csv") assert resp.status_code == 404 def test_export_json_consenso_label( @@ -669,14 +648,12 @@ def test_export_json_consenso_label( annotator_a, annotator_b, ): - """Export com anotacoes sem conflito usa label consenso.""" col = _make_collection(db, admin_user.id) - comments = _make_comments(db, col.id, "UC_cons", count=1) - ds = _make_dataset(db, col.id, admin_user.id, ["UC_cons"]) - # Ambos anotam como bot (consenso) + _make_comments(db, col.id, "UC_cons", count=1) + ds, entries = _make_dataset_with_entries(db, col.id, admin_user.id, ["UC_cons"]) db.add( Annotation( - comment_id=comments[0].id, + dataset_entry_id=entries[0].id, annotator_id=annotator_a.id, label="bot", justificativa="Spam", @@ -684,7 +661,7 @@ def test_export_json_consenso_label( ) db.add( Annotation( - comment_id=comments[0].id, + dataset_entry_id=entries[0].id, annotator_id=annotator_b.id, label="bot", justificativa="Repetitivo", @@ -692,34 +669,30 @@ def test_export_json_consenso_label( ) db.commit() - resp = client.get(f"/review/export?dataset_id={ds.id}" "&format=json") + resp = client.get(f"/review/export?dataset_id={ds.id}&format=json") assert resp.status_code == 200 data = resp.json() - assert len(data["comments"]) == 1 - assert data["comments"][0]["final_label"] == "bot" - assert data["comments"][0]["resolution"] is None + assert len(data["users"]) == 1 + assert data["users"][0]["final_label"] == "bot" + assert data["users"][0]["resolution"] is None -class TestResolveCommentsEdgeCases: - def test_missing_comment_skip( +class TestResolveUsersEdgeCases: + def test_missing_entry_skip( self, client, db, auth_as_admin, admin_user, setup_conflict ): - """Import com comment_db_id inexistente pula e reporta.""" resp = client.post( "/review/import", json={ "dataset_name": "test", "video_id": "vid123", - "comments": [ + "users": [ { - "comment_db_id": str(uuid.uuid4()), + "entry_id": str(uuid.uuid4()), "author_channel_id": "UC_x", "author_display_name": "X", - "text_original": "t", "final_label": "bot", - "resolution": { - "resolved_label": "bot", - }, + "resolution": {"resolved_label": "bot"}, }, ], }, @@ -730,19 +703,17 @@ def test_missing_comment_skip( assert len(data["errors"]) == 1 def test_no_resolution_field_skip(self, client, db, auth_as_admin, setup_conflict): - """Import sem campo resolution pula o comentario.""" - comment = setup_conflict["comments"][0] + entry = setup_conflict["entry"] resp = client.post( "/review/import", json={ "dataset_name": "test", "video_id": "vid123", - "comments": [ + "users": [ { - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "author_channel_id": "UC_suspect1", "author_display_name": "X", - "text_original": "t", "final_label": "bot", }, ], @@ -751,10 +722,12 @@ def test_no_resolution_field_skip(self, client, db, auth_as_admin, setup_conflic assert resp.status_code == 200 assert resp.json()["skipped"] == 1 - def test_no_conflict_for_comment_skip(self, client, db, auth_as_admin, admin_user): - """Import com comentario sem conflito registrado pula.""" + def test_no_conflict_for_entry_skip(self, client, db, auth_as_admin, admin_user): col = _make_collection(db, admin_user.id) - comments = _make_comments(db, col.id, "UC_noconflict", count=1) + _make_comments(db, col.id, "UC_noconflict", count=1) + ds, entries = _make_dataset_with_entries( + db, col.id, admin_user.id, ["UC_noconflict"] + ) db.commit() resp = client.post( @@ -762,16 +735,13 @@ def test_no_conflict_for_comment_skip(self, client, db, auth_as_admin, admin_use json={ "dataset_name": "test", "video_id": "vid123", - "comments": [ + "users": [ { - "comment_db_id": str(comments[0].id), + "entry_id": str(entries[0].id), "author_channel_id": "UC_noconflict", "author_display_name": "X", - "text_original": "t", "final_label": "bot", - "resolution": { - "resolved_label": "bot", - }, + "resolution": {"resolved_label": "bot"}, }, ], }, @@ -784,35 +754,26 @@ def test_no_conflict_for_comment_skip(self, client, db, auth_as_admin, admin_use def test_already_resolved_conflict_skip( self, client, db, auth_as_admin, setup_conflict ): - """Import com conflito ja resolvido pula.""" - comment = setup_conflict["comments"][0] + entry = setup_conflict["entry"] cid = str(setup_conflict["conflict"].id) - # Resolver primeiro client.post( "/review/resolve", - json={ - "conflict_id": cid, - "resolved_label": "bot", - }, + json={"conflict_id": cid, "resolved_label": "bot"}, ) - # Tentar importar novamente resp = client.post( "/review/import", json={ "dataset_name": "test", "video_id": "vid123", - "comments": [ + "users": [ { - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "author_channel_id": "UC_suspect1", "author_display_name": "X", - "text_original": "t", "final_label": "humano", - "resolution": { - "resolved_label": "humano", - }, + "resolution": {"resolved_label": "humano"}, }, ], }, @@ -821,23 +782,19 @@ def test_already_resolved_conflict_skip( assert resp.json()["skipped"] == 1 def test_invalid_label_skip(self, client, db, auth_as_admin, setup_conflict): - """Import com label invalido pula e reporta.""" - comment = setup_conflict["comments"][0] + entry = setup_conflict["entry"] resp = client.post( "/review/import", json={ "dataset_name": "test", "video_id": "vid123", - "comments": [ + "users": [ { - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "author_channel_id": "UC_suspect1", "author_display_name": "X", - "text_original": "t", "final_label": "bot", - "resolution": { - "resolved_label": "incerto", - }, + "resolution": {"resolved_label": "incerto"}, }, ], }, @@ -852,21 +809,17 @@ class TestImportReviewChunk: def test_import_chunk_resolve_conflitos( self, client, db, auth_as_admin, setup_conflict ): - """Import-chunk resolve conflitos em batch.""" - comment = setup_conflict["comments"][0] + entry = setup_conflict["entry"] resp = client.post( "/review/import-chunk", json={ - "comments": [ + "users": [ { - "comment_db_id": str(comment.id), + "entry_id": str(entry.id), "author_channel_id": "UC_suspect1", "author_display_name": "X", - "text_original": "t", "final_label": "bot", - "resolution": { - "resolved_label": "bot", - }, + "resolution": {"resolved_label": "bot"}, }, ], "done": True, @@ -886,7 +839,6 @@ def test_import_chunk_resolve_conflitos( class TestExportReviewJsonInvalidDataset: def test_export_json_missing_dataset_yields_error(self, db): - """export_review_json com dataset_id inexistente gera JSON de erro.""" from services.review import export_review_json gen = export_review_json(db, uuid.uuid4()) @@ -897,7 +849,6 @@ def test_export_json_missing_dataset_yields_error(self, db): class TestExportReviewCsvInvalidDataset: def test_export_csv_missing_dataset_yields_error(self, db): - """export_review_csv com dataset_id inexistente gera CSV de erro.""" from services.review import export_review_csv gen = export_review_csv(db, uuid.uuid4()) @@ -907,15 +858,15 @@ def test_export_csv_missing_dataset_yields_error(self, db): class TestExportReviewJsonConsensusLabel: def test_single_label_consensus_in_json_export(self, db, admin_user, annotator_a): - """Export JSON com 1 anotacao usa label como consenso.""" from services.review import export_review_json col = _make_collection(db, admin_user.id) - comments = _make_comments(db, col.id, "UC_cons_j", 1) - ds = _make_dataset(db, col.id, admin_user.id, ["UC_cons_j"]) - # Apenas 1 anotacao — sem conflito + _make_comments(db, col.id, "UC_cons_j", 1) + ds, entries = _make_dataset_with_entries( + db, col.id, admin_user.id, ["UC_cons_j"] + ) ann = Annotation( - comment_id=comments[0].id, + dataset_entry_id=entries[0].id, annotator_id=annotator_a.id, label="humano", ) @@ -927,28 +878,26 @@ def test_single_label_consensus_in_json_export(self, db, admin_user, annotator_a gen = export_review_json(db, ds.id) output = "".join(gen) data = json.loads(output) - assert len(data["comments"]) == 1 - assert data["comments"][0]["final_label"] == "humano" + assert len(data["users"]) == 1 + assert data["users"][0]["final_label"] == "humano" def test_pending_label_when_divergent_no_resolution( self, db, admin_user, annotator_a, annotator_b ): - """Export JSON com anotacoes divergentes sem resolucao - resulta em final_label=pending.""" from services.review import export_review_json col = _make_collection(db, admin_user.id, video_id="vid_pend") - comments = _make_comments(db, col.id, "UC_pend", 1) - ds = _make_dataset(db, col.id, admin_user.id, ["UC_pend"]) + _make_comments(db, col.id, "UC_pend", 1) + ds, entries = _make_dataset_with_entries(db, col.id, admin_user.id, ["UC_pend"]) ann_a = Annotation( - comment_id=comments[0].id, + dataset_entry_id=entries[0].id, annotator_id=annotator_a.id, label="bot", justificativa="spam", ) ann_b = Annotation( - comment_id=comments[0].id, + dataset_entry_id=entries[0].id, annotator_id=annotator_b.id, label="humano", ) @@ -960,30 +909,30 @@ def test_pending_label_when_divergent_no_resolution( gen = export_review_json(db, ds.id) output = "".join(gen) data = json.loads(output) - assert len(data["comments"]) == 1 - assert data["comments"][0]["final_label"] == "pending" + assert len(data["users"]) == 1 + assert data["users"][0]["final_label"] == "pending" class TestExportReviewCsvConsensusLabel: def test_pending_label_in_csv_export( self, db, admin_user, annotator_a, annotator_b ): - """Export CSV com labels divergentes sem resolucao - resulta em final_label=pending.""" from services.review import export_review_csv col = _make_collection(db, admin_user.id, video_id="vid_csv_p") - comments = _make_comments(db, col.id, "UC_csv_p", 1) - ds = _make_dataset(db, col.id, admin_user.id, ["UC_csv_p"]) + _make_comments(db, col.id, "UC_csv_p", 1) + ds, entries = _make_dataset_with_entries( + db, col.id, admin_user.id, ["UC_csv_p"] + ) ann_a = Annotation( - comment_id=comments[0].id, + dataset_entry_id=entries[0].id, annotator_id=annotator_a.id, label="bot", justificativa="spam", ) ann_b = Annotation( - comment_id=comments[0].id, + dataset_entry_id=entries[0].id, annotator_id=annotator_b.id, label="humano", ) @@ -995,15 +944,16 @@ def test_pending_label_in_csv_export( assert "pending" in output def test_single_label_consensus_in_csv_export(self, db, admin_user, annotator_a): - """Export CSV com 1 anotacao usa label como consenso.""" from services.review import export_review_csv col = _make_collection(db, admin_user.id, video_id="vid_csv_c") - comments = _make_comments(db, col.id, "UC_csv_c", 1) - ds = _make_dataset(db, col.id, admin_user.id, ["UC_csv_c"]) + _make_comments(db, col.id, "UC_csv_c", 1) + ds, entries = _make_dataset_with_entries( + db, col.id, admin_user.id, ["UC_csv_c"] + ) ann = Annotation( - comment_id=comments[0].id, + dataset_entry_id=entries[0].id, annotator_id=annotator_a.id, label="bot", justificativa="spam", @@ -1014,29 +964,3 @@ def test_single_label_consensus_in_csv_export(self, db, admin_user, annotator_a) gen = export_review_csv(db, ds.id) output = "".join(gen) assert "bot" in output - - -class TestFindDatasetForComment: - def test_no_entry_returns_none(self, db, admin_user): - """Comentario sem entry de dataset retorna None.""" - from services.review import _find_dataset_for_comment - - col = _make_collection(db, admin_user.id, video_id="vid_find") - comments = _make_comments(db, col.id, "UC_find", 1) - db.commit() - - result = _find_dataset_for_comment(db, comments[0]) - assert result is None - - def test_with_entry_returns_dataset(self, db, admin_user): - """Comentario com entry retorna dataset correto.""" - from services.review import _find_dataset_for_comment - - col = _make_collection(db, admin_user.id, video_id="vid_find2") - comments = _make_comments(db, col.id, "UC_find2", 1) - ds = _make_dataset(db, col.id, admin_user.id, ["UC_find2"]) - db.commit() - - result = _find_dataset_for_comment(db, comments[0]) - assert result is not None - assert result.id == ds.id diff --git a/backend/tests/test_seed.py b/backend/tests/test_seed.py index 5e29384..fa2d926 100644 --- a/backend/tests/test_seed.py +++ b/backend/tests/test_seed.py @@ -61,19 +61,22 @@ def test_seed_creates_correct_annotations(client, db, auth_as_admin): col = db.query(Collection).filter(Collection.video_id == SEED_VIDEO_ID).first() - # Contar anotações - comment_ids = ( - db.query(Comment.id).filter(Comment.collection_id == col.id).subquery() + # Contar anotações (agora por entry, não por comment) + from models.dataset import Dataset, DatasetEntry + + ds = db.query(Dataset).filter(Dataset.collection_id == col.id).first() + entry_ids = ( + db.query(DatasetEntry.id).filter(DatasetEntry.dataset_id == ds.id).subquery() ) ann_count = ( - db.query(Annotation).filter(Annotation.comment_id.in_(comment_ids)).count() + db.query(Annotation).filter(Annotation.dataset_entry_id.in_(entry_ids)).count() ) assert ann_count == resp.json()["annotations_created"] # Contar conflitos conflict_count = ( db.query(AnnotationConflict) - .filter(AnnotationConflict.comment_id.in_(comment_ids)) + .filter(AnnotationConflict.dataset_entry_id.in_(entry_ids)) .count() ) assert conflict_count == resp.json()["conflicts_created"] From 5efd7bb3b60fb79739ea05cbc1f8edb564c2bd00 Mon Sep 17 00:00:00 2001 From: lucasbrentano Date: Wed, 8 Apr 2026 19:29:43 -0300 Subject: [PATCH 2/2] =?UTF-8?q?refactor(frontend):=20adaptar=20UI=20para?= =?UTF-8?q?=20anota=C3=A7=C3=A3o=20por=20usu=C3=A1rio=20(#87)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - API types: entry_id, BotUserItem, CommentItem (sem anotação individual) - Hooks: submitAnnotation recebe entryId, state atualiza por entry - AnnotatePage: progresso por usuários, tabela com classificação/status - UserCommentsList: botões Humano/Bot no nível do autor com ícones - ReviewPage: conflitos e bots agrupados por usuário, botões padronizados - Removido CommentAnnotationRow (anotação por comentário não existe mais) Co-Authored-By: Claude --- frontend/src/api/annotate.ts | 25 +- frontend/src/api/review.ts | 25 +- frontend/src/hooks/useAnnotate.ts | 32 +- frontend/src/hooks/useReview.ts | 5 +- frontend/src/pages/Annotate/AnnotatePage.tsx | 106 +++---- .../pages/Annotate/CommentAnnotationRow.tsx | 281 ------------------ .../src/pages/Annotate/UserCommentsList.tsx | 212 +++++++++---- frontend/src/pages/Review/ReviewPage.tsx | 132 ++++---- 8 files changed, 318 insertions(+), 500 deletions(-) delete mode 100644 frontend/src/pages/Annotate/CommentAnnotationRow.tsx diff --git a/frontend/src/api/annotate.ts b/frontend/src/api/annotate.ts index c753a88..63da4a0 100644 --- a/frontend/src/api/annotate.ts +++ b/frontend/src/api/annotate.ts @@ -7,16 +7,15 @@ export interface UserItem { author_channel_id: string; author_display_name: string; comment_count: number; - my_annotated_count: number; - my_pending_count: number; + is_annotated_by_me: boolean; + my_label: string | null; } export interface DatasetUsersResponse { dataset_id: string; dataset_name: string; total_users: number; - total_comments: number; - annotated_comments_by_me: number; + annotated_users_by_me: number; page: number; page_size: number; total_pages: number; @@ -36,26 +35,26 @@ export interface AnnotatorAnnotation { annotated_at: string; } -export interface CommentWithAnnotation { +export interface CommentItem { comment_db_id: string; text_original: string; like_count: number; reply_count: number; published_at: string; - my_annotation: MyAnnotation | null; - all_annotations?: AnnotatorAnnotation[] | null; } export interface UserCommentsResponse { entry_id: string; author_display_name: string; author_channel_id: string; - comments: CommentWithAnnotation[]; + comments: CommentItem[]; + my_annotation: MyAnnotation | null; + all_annotations?: AnnotatorAnnotation[] | null; } export interface AnnotationResult { annotation_id: string; - comment_db_id: string; + entry_id: string; label: string; conflict_created: boolean; } @@ -63,7 +62,7 @@ export interface AnnotationResult { export interface DatasetProgress { dataset_id: string; dataset_name: string; - total_comments: number; + total_users: number; annotated: number; bots: number; humans: number; @@ -82,7 +81,7 @@ export interface AnnotatorProgress { annotator_name: string; dataset_id: string; dataset_name: string; - total_comments: number; + total_users: number; annotated: number; bots: number; humans: number; @@ -111,7 +110,7 @@ export const annotateApi = { submit: ( data: { - comment_db_id: string; + entry_id: string; label: "bot" | "humano"; justificativa?: string | null; }, @@ -129,7 +128,7 @@ export const annotateApi = { dataset_name?: string; video_id?: string; annotations: Array<{ - comment_db_id: string; + entry_id: string; label: "bot" | "humano"; justificativa?: string | null; }>; diff --git a/frontend/src/api/review.ts b/frontend/src/api/review.ts index 20767e5..8733319 100644 --- a/frontend/src/api/review.ts +++ b/frontend/src/api/review.ts @@ -4,11 +4,12 @@ import { API_URL, ApiError, request } from "./http"; export interface ConflictListItem { conflict_id: string; - comment_id: string; + entry_id: string; dataset_id: string; dataset_name: string; author_display_name: string; - text_original: string; + author_channel_id: string; + comment_count: number; label_a: string; annotator_a: string; justificativa_a: string | null; @@ -62,11 +63,11 @@ export interface BotAnnotationDetail { justificativa: string | null; } -export interface BotCommentItem { - comment_db_id: string; - text_original: string; +export interface BotUserItem { + entry_id: string; author_display_name: string; author_channel_id: string; + comment_count: number; dataset_id: string; dataset_name: string; annotations: BotAnnotationDetail[]; @@ -139,7 +140,7 @@ export const reviewApi = { if (params?.dataset_id) qs.set("dataset_id", params.dataset_id); qs.set("page", String(params?.page ?? 1)); qs.set("page_size", String(params?.page_size ?? 20)); - return request>(`/review/bots?${qs}`, {}, token); + return request>(`/review/bots?${qs}`, {}, token); }, stats: (token: string) => request("/review/stats", {}, token), @@ -148,11 +149,10 @@ export const reviewApi = { data: { dataset_name: string; video_id: string; - comments: Array<{ - comment_db_id: string; + users: Array<{ + entry_id: string; author_channel_id: string; author_display_name: string; - text_original: string; final_label: "bot" | "humano"; annotations?: Array<{ annotator: string; @@ -170,7 +170,7 @@ export const reviewApi = { onProgress?: (sent: number, total: number) => void ): Promise => { const CHUNK_SIZE = 2000; - const all = data.comments; + const all = data.users; const total = all.length; const firstBatch = all.slice(0, CHUNK_SIZE); const hasMore = total > CHUNK_SIZE; @@ -181,7 +181,7 @@ export const reviewApi = { method: "POST", body: JSON.stringify({ ...data, - comments: firstBatch, + users: firstBatch, done: !hasMore, }), }, @@ -205,7 +205,7 @@ export const reviewApi = { "/review/import-chunk", { method: "POST", - body: JSON.stringify({ comments: chunk, done: isLast }), + body: JSON.stringify({ users: chunk, done: isLast }), }, token ); @@ -238,7 +238,6 @@ export const reviewApi = { throw new ApiError(body.detail ?? "Erro ao exportar revisão.", res.status); } - // Use filename from Content-Disposition if available const cd = res.headers.get("Content-Disposition") ?? ""; const filenameMatch = cd.match(/filename="(.+?)"/); const filename = filenameMatch ? filenameMatch[1] : `review.${format}`; diff --git a/frontend/src/hooks/useAnnotate.ts b/frontend/src/hooks/useAnnotate.ts index e5dd137..8bb72f6 100644 --- a/frontend/src/hooks/useAnnotate.ts +++ b/frontend/src/hooks/useAnnotate.ts @@ -83,35 +83,27 @@ export function useAnnotate() { const submitAnnotation = useCallback( async ( - commentDbId: string, + entryId: string, label: "bot" | "humano", justificativa?: string | null ): Promise => { if (!token) return; try { - const result = await annotateApi.submit( - { comment_db_id: commentDbId, label, justificativa }, - token - ); + const result = await annotateApi.submit({ entry_id: entryId, label, justificativa }, token); - // Atualizar o comentário no state local + // Atualizar a anotação no state local (nível do entry) setState((s) => { if (!s.userComments) return s; - const updated = s.userComments.comments.map((c) => - c.comment_db_id === commentDbId - ? { - ...c, - my_annotation: { - label, - justificativa: justificativa ?? null, - annotated_at: new Date().toISOString(), - }, - } - : c - ); return { ...s, - userComments: { ...s.userComments, comments: updated }, + userComments: { + ...s.userComments, + my_annotation: { + label, + justificativa: justificativa ?? null, + annotated_at: new Date().toISOString(), + }, + }, }; }); @@ -158,7 +150,7 @@ export function useAnnotate() { dataset_name?: string; video_id?: string; annotations: Array<{ - comment_db_id: string; + entry_id: string; label: "bot" | "humano"; justificativa?: string | null; }>; diff --git a/frontend/src/hooks/useReview.ts b/frontend/src/hooks/useReview.ts index 585db51..b351f9d 100644 --- a/frontend/src/hooks/useReview.ts +++ b/frontend/src/hooks/useReview.ts @@ -1,7 +1,7 @@ import { useCallback, useState } from "react"; import { reviewApi, - BotCommentItem, + BotUserItem, ConflictDetail, ConflictListItem, ImportResult, @@ -15,7 +15,7 @@ interface ReviewState { error: string | null; conflictsData: PaginatedResponse | null; conflictDetail: ConflictDetail | null; - botsData: PaginatedResponse | null; + botsData: PaginatedResponse | null; stats: ReviewStats | null; importResult: ImportResult | null; } @@ -97,7 +97,6 @@ export function useReview() { token ); - // Update conflict in local state setState((s) => ({ ...s, conflictsData: s.conflictsData diff --git a/frontend/src/pages/Annotate/AnnotatePage.tsx b/frontend/src/pages/Annotate/AnnotatePage.tsx index 167ad46..0d79b4d 100644 --- a/frontend/src/pages/Annotate/AnnotatePage.tsx +++ b/frontend/src/pages/Annotate/AnnotatePage.tsx @@ -144,11 +144,11 @@ export function AnnotatePage() { ); const handleAnnotate = useCallback( - async (commentDbId: string, label: "bot" | "humano", justificativa?: string | null) => { - const result = await submitAnnotation(commentDbId, label, justificativa); + async (entryId: string, label: "bot" | "humano", justificativa?: string | null) => { + const result = await submitAnnotation(entryId, label, justificativa); if (result?.conflict_created) { setToast( - "Conflito detectado: outro pesquisador classificou este comentário de forma diferente." + "Conflito detectado: outro pesquisador classificou este usuário de forma diferente." ); setTimeout(() => setToast(null), 5000); } @@ -176,7 +176,7 @@ export function AnnotatePage() { dataset_name?: string; video_id?: string; annotations?: Array<{ - comment_db_id: string; + entry_id: string; label: "bot" | "humano"; justificativa?: string | null; }>; @@ -210,10 +210,10 @@ export function AnnotatePage() {

- Anotação de Comentários + Anotação de Usuários

- Classifique cada comentário como bot ou humano. Veja todos os comentários do usuário para + Classifique cada usuário como bot ou humano. Analise todos os comentários do autor para tomar uma decisão informada.

@@ -359,11 +359,12 @@ export function AnnotatePage() { }, { label: "Escolha um usuário", - description: "Veja todos os comentários do usuário agrupados para contexto.", + description: "Veja todos os comentários do autor como evidência.", }, { - label: "Classifique cada comentário", - description: 'Marque como "Bot" ou "Humano". Bot exige justificativa.', + label: "Classifique o usuário", + description: + 'Marque o autor como "Bot" ou "Humano". Bot exige justificativa.', }, ]} /> @@ -394,7 +395,7 @@ export function AnnotatePage() {

Meu progresso

- {datasetProgress.annotated}/{datasetProgress.total_comments} comentários + {datasetProgress.annotated}/{datasetProgress.total_users} usuários
- Progresso + Classificação Status @@ -488,50 +489,51 @@ export function AnnotatePage() { - {datasetUsers.items.map((item) => { - const pct = - item.comment_count > 0 - ? Math.round((item.my_annotated_count / item.comment_count) * 100) - : 0; - const done = item.my_pending_count === 0; - return ( - handleSelectUser(item.entry_id)} - > - -

- {item.author_display_name} -

-

- {item.author_channel_id.slice(0, 20)}... -

- - - {item.comment_count} - - - - - {item.my_annotated_count}/{item.comment_count} - - - + {datasetUsers.items.map((item) => ( + handleSelectUser(item.entry_id)} + > + +

+ {item.author_display_name} +

+

+ {item.author_channel_id.slice(0, 20)}... +

+ + + {item.comment_count} + + + {item.my_label ? ( - {done ? "Concluído" : `${item.my_pending_count} pendentes`} + {item.my_label === "bot" ? "Bot" : "Humano"} - - - ); - })} + ) : null} + + + + {item.is_annotated_by_me ? "Concluído" : "Pendente"} + + + + ))} @@ -588,7 +590,7 @@ export function AnnotatePage() { {" "} contendo um array de objetos com{" "} - comment_db_id + entry_id ,{" "} @@ -685,7 +687,7 @@ export function AnnotatePage() { {p.dataset_name} - {p.annotated}/{p.total_comments} + {p.annotated}/{p.total_users} @@ -709,7 +711,7 @@ export function AnnotatePage() { {p.dataset_name} - {p.annotated}/{p.total_comments} + {p.annotated}/{p.total_users} diff --git a/frontend/src/pages/Annotate/CommentAnnotationRow.tsx b/frontend/src/pages/Annotate/CommentAnnotationRow.tsx deleted file mode 100644 index 7247f79..0000000 --- a/frontend/src/pages/Annotate/CommentAnnotationRow.tsx +++ /dev/null @@ -1,281 +0,0 @@ -import { useCallback, useEffect, useRef, useState } from "react"; -import type { CommentWithAnnotation } from "../../api/annotate"; - -interface Props { - comment: CommentWithAnnotation; - focused: boolean; - onAnnotate: ( - commentDbId: string, - label: "bot" | "humano", - justificativa?: string | null - ) => Promise; - onFocus: () => void; - readOnly?: boolean; -} - -export function CommentAnnotationRow({ - comment, - focused, - onAnnotate, - onFocus, - readOnly = false, -}: Props) { - const [showJustificativa, setShowJustificativa] = useState(false); - const [justificativa, setJustificativa] = useState(comment.my_annotation?.justificativa ?? ""); - const [saving, setSaving] = useState(false); - const rowRef = useRef(null); - - const currentLabel = comment.my_annotation?.label ?? null; - - useEffect(() => { - if (focused && rowRef.current) { - rowRef.current.scrollIntoView({ behavior: "smooth", block: "nearest" }); - } - }, [focused]); - - const handleHumano = useCallback(async () => { - setSaving(true); - setShowJustificativa(false); - await onAnnotate(comment.comment_db_id, "humano", null); - setSaving(false); - }, [comment.comment_db_id, onAnnotate]); - - const handleBotClick = useCallback(() => { - setShowJustificativa(true); - onFocus(); - }, [onFocus]); - - const handleBotConfirm = useCallback(async () => { - if (!justificativa.trim()) return; - setSaving(true); - await onAnnotate(comment.comment_db_id, "bot", justificativa.trim()); - setShowJustificativa(false); - setSaving(false); - }, [comment.comment_db_id, justificativa, onAnnotate]); - - // Atalhos de teclado - useEffect(() => { - if (!focused) return; - const handler = (e: KeyboardEvent) => { - if (e.target instanceof HTMLTextAreaElement || e.target instanceof HTMLInputElement) return; - if (e.key === "h" || e.key === "H") { - e.preventDefault(); - void handleHumano(); - } else if (e.key === "b" || e.key === "B") { - e.preventDefault(); - handleBotClick(); - } - }; - window.addEventListener("keydown", handler); - return () => window.removeEventListener("keydown", handler); - }, [focused, handleHumano, handleBotClick]); - - const date = new Date(comment.published_at).toLocaleDateString("pt-BR", { - day: "2-digit", - month: "2-digit", - year: "numeric", - hour: "2-digit", - minute: "2-digit", - }); - - return ( -
- {/* Texto do comentário */} -

- {comment.text_original} -

- - {/* Metadados */} -
- {date} - {comment.like_count} curtidas - {comment.reply_count} respostas -
- - {/* Badge atual + botões */} -
- {currentLabel && ( - - {currentLabel === "bot" ? "Bot" : "Humano"} - - )} - - {!readOnly && ( -
- - -
- )} -
- - {/* Campo justificativa (inline) */} - {showJustificativa && ( -
-