diff --git a/.gitignore b/.gitignore index c70392b..9c31d93 100644 --- a/.gitignore +++ b/.gitignore @@ -202,4 +202,6 @@ test/ /Scripts/rag.sh /Scripts/cfg/m3docVQA.yaml /Scripts/cfg/MMLongBench.yaml -/Scripts/cfg/Qasper.yaml \ No newline at end of file +/Scripts/cfg/Qasper.yaml + +.history/ \ No newline at end of file diff --git a/Core/Index/GBCIndex.py b/Core/Index/GBCIndex.py index f40c4c4..2247f76 100644 --- a/Core/Index/GBCIndex.py +++ b/Core/Index/GBCIndex.py @@ -1,5 +1,8 @@ -from sympy import N -from Core.Index.Tree import * +import logging +import os +from typing import Optional + +from Core.Index.Tree import DocumentTree from Core.configs.system_config import SystemConfig from Core.provider.llm import LLM from Core.Index.Graph import Graph @@ -7,6 +10,9 @@ from Core.provider.vdb import VectorStore +log = logging.getLogger(__name__) + + class GBC: """ A class representing the index combining graph and tree structures. @@ -18,7 +24,7 @@ def __init__( self, config: SystemConfig, graph_index: Optional[Graph] = None, - TreeIndex: Optional[DocumentTree] = None, + tree_index: Optional[DocumentTree] = None, ): """ Initializes the TreeIndex with an optional index. 
@@ -28,21 +34,29 @@ def __init__( self.save_dir = config.save_path self.config = config self.llm = LLM(config.llm) - self.TreeIndex: DocumentTree = TreeIndex + self.TreeIndex: DocumentTree = tree_index self.GraphIndex: Graph = graph_index - # load the vdb of entities + # load the vdb of entities — namespaced by tenant/doc if available if config.graph.refine_type == "basic": - self.entity_vdb_path = os.path.join(self.save_dir, "kg_vdb_basic") + vdb_name = "kg_vdb_basic" else: - self.entity_vdb_path = os.path.join(self.save_dir, "kg_vdb") - + vdb_name = "kg_vdb" + + if config.tenant_id and config.doc_id: + self.entity_vdb_path = os.path.join( + self.save_dir, config.tenant_id, config.doc_id, vdb_name + ) + else: + self.entity_vdb_path = os.path.join(self.save_dir, vdb_name) + self.embedder = TextEmbeddingProvider( model_name=config.graph.embedding_config.model_name, backend=config.graph.embedding_config.backend, max_length=config.graph.embedding_config.max_length, device=config.graph.embedding_config.device, api_base=config.graph.embedding_config.api_base, + api_key=config.graph.embedding_config.api_key, ) self.entity_vdb: VectorStore = VectorStore( db_path=self.entity_vdb_path, @@ -64,7 +78,7 @@ def save_gbc_index(self): # vdb is saved automatically when the entity_vdb is created - log.info(f"GBC index saved") + log.info("GBC index saved") def rebuild_vdb(self): """ @@ -83,12 +97,7 @@ def rebuild_vdb(self): texts.append(node) entity = self.GraphIndex.get_entity_by_node_name(node) - tmp_dict = { - "entity_name": entity.entity_name, - "entity_type": entity.entity_type, - "description": entity.description, - } - meta_datas.append(tmp_dict) + meta_datas.append(entity.to_vdb_metadata()) self.entity_vdb.add_texts(texts=texts, metadatas=meta_datas) log.info(f"Rebuilt entity VDB with {len(texts)} entries.") @@ -104,13 +113,47 @@ def load_gbc_index(cls, config: SystemConfig): tree_index = DocumentTree.load_from_file( DocumentTree.get_save_path(config.save_path) ) - + if 
config.graph.refine_type == "basic": variant = "basic" else: variant = None - - graph_index = Graph.load_from_dir(config.save_path, variant=variant) - GBC = cls(config=config, graph_index=graph_index, TreeIndex=tree_index) + + # Pass FalkorDB config if tenant/doc IDs are set + falkordb_cfg = config.falkordb if (config.tenant_id and config.doc_id) else None + graph_index = Graph.load_from_dir( + config.save_path, + variant=variant, + tenant_id=config.tenant_id, + doc_id=config.doc_id, + falkordb_cfg=falkordb_cfg, + ) + GBC = cls(config=config, graph_index=graph_index, tree_index=tree_index) log.info(f"GBC index loaded from {config.save_path}") return GBC + + def rebuild_global_vdb(self, global_vdb_path: str) -> None: + """ + Build a global vector database of canonical entities (Phase 3). + Pulls all nodes from the current GraphIndex and adds them to a shared VDB. + """ + from Core.provider.vdb import VectorStore + global_vdb = VectorStore( + db_path=global_vdb_path, + embedding_model=self.embedder, + collection_name="global_kg_collection", + ) + nodes = self.GraphIndex.get_all_nodes() + texts = [] + meta_datas = [] + for node in nodes: + texts.append(node) + entity = self.GraphIndex.get_entity_by_node_name(node) + metadata = entity.to_vdb_metadata() + metadata["doc_id"] = self.config.doc_id or "" + metadata["tenant_id"] = self.config.tenant_id or "" + meta_datas.append(metadata) + global_vdb.add_texts(texts=texts, metadatas=meta_datas) + log.info( + f"Rebuilt global VDB with {len(texts)} entries from doc '{self.config.doc_id}'." 
+ ) diff --git a/Core/Index/Graph.py b/Core/Index/Graph.py index ccdb7ce..364b277 100644 --- a/Core/Index/Graph.py +++ b/Core/Index/Graph.py @@ -2,8 +2,7 @@ from networkx.readwrite import json_graph import os from collections import defaultdict -from typing import Iterable, Union, Set, List -from numpy import source +from typing import Iterable, Union, Set, List, Optional, TYPE_CHECKING # noqa: F401 from pydantic import BaseModel, Field import json @@ -11,11 +10,20 @@ log = logging.getLogger(__name__) +if TYPE_CHECKING: + from Core.configs.falkordb_config import FalkorDBConfig + class Entity(BaseModel): entity_name: str # Primary key for entity entity_type: str = Field(default="") # Entity type description: str = Field(default="") # The description of this entity + entity_id: str = Field(default="") # Stable ontology/canonical identifier + canonical_id: str = Field(default="") # Points to canonical ontology identifier + entity_role: str = Field(default="provisional") # canonical / provisional + aliases: List[str] = Field(default_factory=list) # Known aliases for resolution + mapping_confidence: float = Field(default=0.0) # Ontology mapping confidence + ontology_source: str = Field(default="") # Source of ontology metadata source_ids: Set[int] = Field( default_factory=set ) # Set of source IDs from which this entity is derived @@ -37,6 +45,20 @@ def __eq__(self, other): ) return False + def to_vdb_metadata(self) -> dict: + aliases = list(dict.fromkeys(self.aliases)) + return { + "entity_name": self.entity_name, + "entity_type": self.entity_type, + "description": self.description, + "entity_id": self.entity_id, + "canonical_id": self.canonical_id, + "entity_role": self.entity_role, + "mapping_confidence": float(self.mapping_confidence or 0.0), + "ontology_source": self.ontology_source, + "aliases_json": json.dumps(aliases, ensure_ascii=False), + } + class Relationship(BaseModel): src_entity_name: str # Name of the entity on the left side of the edge @@ -64,14 +86,30 
@@ class Graph: _DATA_FILE = "graph_data.json" # index data file _BASE_FILENAME = "graph_data" - def __init__(self, save_path: str = None, variant: str = None): + def __init__( + self, + save_path: str = None, + variant: str = None, + tenant_id: str = None, + doc_id: str = None, + falkordb_cfg=None, # Optional[FalkorDBConfig] + ): self.kg = nx.Graph() # 节点名采用 "entity_name (entity_type)",确保唯一性 self.tree2kg = defaultdict(set) # Maps tree nodes id (int) to graph entities - # self.name_to_nodes = defaultdict(set) # entity_name -> set of node names self.save_dir = save_path self.variant = variant + # Multi-tenant FalkorDB support + self.tenant_id = tenant_id + self.doc_id = doc_id + self.falkordb_cfg = falkordb_cfg + self.use_falkordb = falkordb_cfg is not None and tenant_id is not None and doc_id is not None + self._fdb_graph = None # lazy FalkorDB graph handle + self._fdb_graph_name: Optional[str] = None + if self.use_falkordb: + self._fdb_graph_name = falkordb_cfg.graph_name_for_doc(tenant_id, doc_id) + # dynamic filename based on variant self.data_filename = self._get_filename(variant) @@ -107,7 +145,6 @@ def add_kg_node(self, entity: Entity) -> None: node_name = self.get_node_name_from_entity(entity) self.kg.add_node(node_name, **entity.model_dump()) - # self.name_to_nodes[entity.entity_name].add(node_name) def add_kg_edge(self, rel: Relationship, src_type: str, tgt_type: str) -> None: """Add a relation/edge between two KG entities with all its attributes.""" @@ -141,11 +178,20 @@ def add_and_link( entities = [entities] for entity in entities: node_name = self.get_node_name_from_entity(entity) - # node_name = f"{entity.entity_name} ({entity.entity_type})" if node_name not in self.kg: self.add_kg_node(entity) self.link(tree_node_id, entity.entity_name, entity.entity_type) + def _rewrite_edge_entity_names( + self, edge_data: Optional[dict], old_entity_name: str, new_entity_name: str + ) -> dict: + updated_edge_data = dict(edge_data or {}) + if 
updated_edge_data.get("src_entity_name") == old_entity_name: + updated_edge_data["src_entity_name"] = new_entity_name + if updated_edge_data.get("tgt_entity_name") == old_entity_name: + updated_edge_data["tgt_entity_name"] = new_entity_name + return updated_edge_data + def update_entity( self, old_entity_name: str, old_entity_type: str, new_entity: Entity ) -> None: @@ -166,9 +212,17 @@ def update_entity( if new_node_name != old_node_name: # 1. Add new node and copy all edges self.kg.add_node(new_node_name, **new_entity.model_dump()) - for neighbor in list(self.kg.neighbors(old_node_name)): + for neighbor in self.kg.neighbors(old_node_name): edge_data = self.kg.get_edge_data(old_node_name, neighbor) - self.kg.add_edge(new_node_name, neighbor, **edge_data) + self.kg.add_edge( + new_node_name, + neighbor, + **self._rewrite_edge_entity_names( + edge_data=edge_data, + old_entity_name=old_entity_name, + new_entity_name=new_entity.entity_name, + ), + ) # 2.1 update tree2kg for tree_id in new_source_ids: # If the old node is in the tree2kg, remove the old name @@ -198,7 +252,6 @@ def get_entity(self, entity_name: str, entity_type: str = "") -> Entity: node_name = self.get_node_name_from_str( entity_name=entity_name, entity_type=entity_type ) - # node_name = f"{entity_name} ({entity_type})" if node_name not in self.kg.nodes: raise KeyError(f"Entity '{node_name}' not found in knowledge graph.") return Entity(**self.kg.nodes[node_name]) @@ -217,27 +270,9 @@ def get_entity_by_node_name(self, node_name: str) -> Entity: raise KeyError(f"Node '{node_name}' not found in knowledge graph.") return Entity(**self.kg.nodes[node_name]) - def get_kg_subgraph( - self, tree_node_ids: Iterable[int], copy: bool = True - ) -> nx.Graph: - """ - Given one or more tree node IDs, return the induced subgraph of the KG - containing all linked entities. By default returns a deep copy; if copy=False, - returns a lightweight view (faster slicing). 
- - Complexity: O(sum(degree(n)) + |nodes| + |edges|). - For a few hundred nodes, this remains efficient even if KG has millions of edges. - """ - # Collect all KG node names for the provided tree nodes - kg_nodes = set().union(*(self.tree2kg.get(tid, set()) for tid in tree_node_ids)) - sub = self.kg.subgraph(kg_nodes) - return sub.copy() if copy else sub - def get_subgraph_data(self, entities: List[str]) -> dict: - # Return the subgraph entities data, excluding description and source_ids in entities - # If the relation connects two entities in the subgraph, it will be included + """Return lightweight node data for the subgraph induced by `entities`.""" subgraph = self.kg.subgraph(entities) - # data = {"nodes": [], "edges": []} data = {"nodes": []} for node in subgraph.nodes(data=True): node_data = { @@ -245,44 +280,29 @@ def get_subgraph_data(self, entities: List[str]) -> dict: "entity_type": node[1]["entity_type"], } data["nodes"].append(node_data) - # for edge in subgraph.edges(data=True): - # edge_data = { - # "src_entity_name": edge[2]["src_entity_name"], - # "tgt_entity_name": edge[2]["tgt_entity_name"], - # "relation_name": edge[2]["relation_name"], - # "weight": edge[2]["weight"], - # } - # data["edges"].append(edge_data) return data - def Entities2TreeNodes(self, entities: List[Entity]) -> List[int]: + def entities_to_tree_nodes(self, entities: List[Entity]) -> List[int]: """ Given KG node names, return all tree node IDs that link to them. """ result = set() for ent in entities: - source_ids = ent.source_ids - result.union(source_ids) - result = list(result) - return result + result.update(ent.source_ids) + return sorted(result) - def Entity2TreeNodes(self, ent: Entity) -> List[int]: + def entity_to_tree_nodes(self, ent: Entity) -> List[int]: """ Given an Entity object, return all tree node IDs that link to it. 
""" - res = ent.source_ids - res = list(res) - return res + return sorted(ent.source_ids) - def NodeName2TreeNodes(self, node_name: str) -> Set[int]: + def node_name_to_tree_nodes(self, node_name: str) -> List[int]: """ Given a node name (entity_name (entity_type)), return all tree node IDs that link to it. """ ent = self.get_entity_by_node_name(node_name) - res = ent.source_ids - res = list(res) - - return res + return sorted(ent.source_ids) def remove_self_loops(self) -> int: """ @@ -301,51 +321,279 @@ def remove_self_loops(self) -> int: self.kg.remove_edges_from(self_loop_edges) log.info("All self-loops have been removed.") + # ------------------------------------------------------------------ # + # FalkorDB helpers # + # ------------------------------------------------------------------ # + + def _get_fdb_graph(self): + """Lazy-initialise and return the FalkorDB graph handle.""" + if self._fdb_graph is not None: + return self._fdb_graph + try: + from falkordb import FalkorDB + cfg = self.falkordb_cfg + conn_kwargs = {"host": cfg.host, "port": cfg.port} + if cfg.username: + conn_kwargs["username"] = cfg.username + if cfg.password: + conn_kwargs["password"] = cfg.password + client = FalkorDB(**conn_kwargs) + self._fdb_graph = client.select_graph(self._fdb_graph_name) + log.info(f"Connected to FalkorDB graph '{self._fdb_graph_name}'") + except Exception as e: + log.error(f"Failed to connect to FalkorDB: {e}") + raise + return self._fdb_graph + + def _save_to_falkordb(self) -> None: + """Persist the in-memory NetworkX graph to FalkorDB.""" + g = self._get_fdb_graph() + + def _esc(value) -> str: + return str(value or "").replace("\\", "\\\\").replace("'", "\\'") + + # Clear existing data for idempotent saves + try: + g.query("MATCH (n) DETACH DELETE n") + except Exception: + pass + + # Write nodes + for node_name, data in self.kg.nodes(data=True): + source_ids_list = list(data.get("source_ids", set())) + desc = _esc(data.get("description", "")) + ename = 
_esc(data.get("entity_name", "")) + etype = _esc(data.get("entity_type", "")) + entity_id = _esc(data.get("entity_id", "")) + canonical_id = _esc(data.get("canonical_id", "")) + entity_role = _esc(data.get("entity_role", "provisional")) + ontology_source = _esc(data.get("ontology_source", "")) + aliases_json = _esc(json.dumps(data.get("aliases", []), ensure_ascii=False)) + mapping_confidence = float(data.get("mapping_confidence", 0.0) or 0.0) + nname = _esc(node_name) + cypher = ( + f"CREATE (n:Entity {{" + f"node_name: '{nname}', " + f"entity_name: '{ename}', " + f"entity_type: '{etype}', " + f"entity_id: '{entity_id}', " + f"canonical_id: '{canonical_id}', " + f"entity_role: '{entity_role}', " + f"aliases_json: '{aliases_json}', " + f"mapping_confidence: {mapping_confidence}, " + f"ontology_source: '{ontology_source}', " + f"description: '{desc}', " + f"source_ids: {source_ids_list}" + f"}})" + ) + g.query(cypher) + + # Write edges + for src, tgt, data in self.kg.edges(data=True): + rel_name = data.get("relation_name", "").replace("'", "\\'") + weight = float(data.get("weight", 0.0)) + desc = data.get("description", "").replace("'", "\\'") + src_ids = list(data.get("source_ids", set())) + src_q = src.replace("\\", "\\\\").replace("'", "\\'") + tgt_q = tgt.replace("\\", "\\\\").replace("'", "\\'") + cypher = ( + f"MATCH (a:Entity {{node_name: '{src_q}'}}), " + f"(b:Entity {{node_name: '{tgt_q}'}}) " + f"CREATE (a)-[:RELATION {{" + f"relation_name: '{rel_name}', " + f"weight: {weight}, " + f"description: '{desc}', " + f"source_ids: {src_ids}" + f"}}]->(b)" + ) + g.query(cypher) + + # Write tree2kg as node property (source_ids already on nodes) + log.info(f"Saved graph to FalkorDB '{self._fdb_graph_name}': " + f"{self.kg.number_of_nodes()} nodes, {self.kg.number_of_edges()} edges.") + + def _load_from_falkordb(self) -> None: + """Load graph data from FalkorDB into in-memory NetworkX graph.""" + g = self._get_fdb_graph() + result = g.query("MATCH (n:Entity) RETURN 
n") + for record in result.result_set: + node = record[0] + props = node.properties + source_ids = set(props.get("source_ids", [])) + node_name = props["node_name"] + aliases_json = props.get("aliases_json", "[]") + try: + aliases = json.loads(aliases_json) if isinstance(aliases_json, str) else list(aliases_json or []) + except json.JSONDecodeError: + aliases = [] + self.kg.add_node(node_name, + entity_name=props.get("entity_name", ""), + entity_type=props.get("entity_type", ""), + entity_id=props.get("entity_id", ""), + canonical_id=props.get("canonical_id", ""), + entity_role=props.get("entity_role", "provisional"), + aliases=aliases, + mapping_confidence=float(props.get("mapping_confidence", 0.0) or 0.0), + ontology_source=props.get("ontology_source", ""), + description=props.get("description", ""), + source_ids=source_ids) + for tid in source_ids: + self.tree2kg[int(tid)].add(node_name) + + edge_result = g.query( + "MATCH (a:Entity)-[r:RELATION]->(b:Entity) " + "RETURN a.node_name, b.node_name, r.relation_name, r.weight, r.description, r.source_ids" + ) + for rec in edge_result.result_set: + src_name, tgt_name, rel_name, weight, desc, src_ids = rec + self.kg.add_edge( + src_name, tgt_name, + src_entity_name=self.kg.nodes[src_name].get("entity_name", ""), + tgt_entity_name=self.kg.nodes[tgt_name].get("entity_name", ""), + relation_name=rel_name or "", + weight=float(weight or 0.0), + description=desc or "", + source_ids=set(src_ids or []), + ) + log.info(f"Loaded graph from FalkorDB '{self._fdb_graph_name}': " + f"{self.kg.number_of_nodes()} nodes, {self.kg.number_of_edges()} edges.") + + def _get_fdb_subgraph(self, tree_node_ids: Iterable[int]) -> nx.Graph: + """Query FalkorDB for the subgraph linked to given tree node IDs.""" + # Collect node names from tree2kg (loaded at init) + kg_nodes = set().union(*(self.tree2kg.get(tid, set()) for tid in tree_node_ids)) + if not kg_nodes: + return nx.Graph() + + g = self._get_fdb_graph() + node_list = [n.replace("'", 
"\\'") for n in kg_nodes] + node_filter = "['" + "', '".join(node_list) + "']" + result = g.query( + f"MATCH (n:Entity) WHERE n.node_name IN {node_filter} RETURN n" + ) + subgraph = nx.Graph() + for rec in result.result_set: + node = rec[0] + props = node.properties + node_name = props["node_name"] + aliases_json = props.get("aliases_json", "[]") + try: + aliases = json.loads(aliases_json) if isinstance(aliases_json, str) else list(aliases_json or []) + except json.JSONDecodeError: + aliases = [] + subgraph.add_node(node_name, + entity_name=props.get("entity_name", ""), + entity_type=props.get("entity_type", ""), + entity_id=props.get("entity_id", ""), + canonical_id=props.get("canonical_id", ""), + entity_role=props.get("entity_role", "provisional"), + aliases=aliases, + mapping_confidence=float(props.get("mapping_confidence", 0.0) or 0.0), + ontology_source=props.get("ontology_source", ""), + description=props.get("description", ""), + source_ids=set(props.get("source_ids", []))) + + edge_result = g.query( + f"MATCH (a:Entity)-[r:RELATION]->(b:Entity) " + f"WHERE a.node_name IN {node_filter} AND b.node_name IN {node_filter} " + f"RETURN a.node_name, b.node_name, r.relation_name, r.weight, r.description, r.source_ids" + ) + for rec in edge_result.result_set: + src_name, tgt_name, rel_name, weight, desc, src_ids = rec + if src_name in subgraph and tgt_name in subgraph: + subgraph.add_edge(src_name, tgt_name, + relation_name=rel_name or "", + weight=float(weight or 0.0), + description=desc or "", + source_ids=set(src_ids or [])) + return subgraph + def save_graph(self) -> None: - if not self.save_dir: + if not self.save_dir and not self.use_falkordb: log.warning("Warning: save_dir is not set. 
Nothing will be saved.") return - os.makedirs(self.save_dir, exist_ok=True) - # save_path = os.path.join(self.save_dir, self._DATA_FILE) - - # use dynamic filename based on variant - save_path = os.path.join(self.save_dir, self.data_filename) - - graph_json_data = json_graph.node_link_data(self.kg, edges="links") - - data_to_save = { - "graph": graph_json_data, - "tree2kg": {k: list(v) for k, v in self.tree2kg.items()}, - "variant": self.variant, - } - - # 3. 保存为格式化的JSON文件 - with open(save_path, "w", encoding="utf-8") as f: - json.dump(data_to_save, f, cls=SetEncoder, indent=4, ensure_ascii=False) + # If FalkorDB is configured, persist there + if self.use_falkordb: + self._save_to_falkordb() + # Also save JSON as backup if save_dir is set + if self.save_dir: + os.makedirs(self.save_dir, exist_ok=True) + + if self.save_dir: + os.makedirs(self.save_dir, exist_ok=True) + save_path = os.path.join(self.save_dir, self.data_filename) + graph_json_data = json_graph.node_link_data(self.kg, edges="links") + data_to_save = { + "graph": graph_json_data, + "tree2kg": {k: list(v) for k, v in self.tree2kg.items()}, + "variant": self.variant, + } + with open(save_path, "w", encoding="utf-8") as f: + json.dump(data_to_save, f, cls=SetEncoder, indent=4, ensure_ascii=False) + log.info(f"Graph data successfully saved to: {save_path}") - log.info(f"Graph data successfully saved to: {save_path}") + def get_kg_subgraph( + self, tree_node_ids: Iterable[int], copy: bool = True + ) -> nx.Graph: + """ + Given one or more tree node IDs, return the induced subgraph of the KG. + In FalkorDB mode, queries the database directly for only the needed nodes. 
+ """ + if self.use_falkordb and not self.kg.nodes: + # FalkorDB-only mode: fetch subgraph from DB + return self._get_fdb_subgraph(tree_node_ids) + # Default: in-memory NetworkX subgraph + kg_nodes = set().union(*(self.tree2kg.get(tid, set()) for tid in tree_node_ids)) + sub = self.kg.subgraph(kg_nodes) + return sub.copy() if copy else sub @classmethod - def load_from_dir(cls, load_dir: str, variant: str = None) -> "Graph": + def load_from_dir( + cls, + load_dir: str, + variant: str = None, + tenant_id: str = None, + doc_id: str = None, + falkordb_cfg=None, + ) -> "Graph": + """Load a Graph from JSON file, or from FalkorDB if configured.""" + # FalkorDB mode: load tree2kg from DB, keep kg empty for lazy subgraph queries + if falkordb_cfg is not None and tenant_id is not None and doc_id is not None: + graph_instance = cls( + save_path=load_dir, + variant=variant, + tenant_id=tenant_id, + doc_id=doc_id, + falkordb_cfg=falkordb_cfg, + ) + graph_instance._load_from_falkordb() + log.info(f"Graph loaded from FalkorDB graph '{graph_instance._fdb_graph_name}'") + return graph_instance + + # Default: JSON file load target_filename = cls._get_filename(variant) load_path = os.path.join(load_dir, target_filename) - - # load_path = os.path.join(load_dir, cls._DATA_FILE) if not os.path.exists(load_path): raise FileNotFoundError(f"Error: Missing graph file: {load_path}") with open(load_path, "r", encoding="utf-8") as f: loaded_data = json.load(f) - graph_instance = cls(save_path=load_dir) - - graph_instance.kg = json_graph.node_link_graph(loaded_data["graph"]) + graph_instance = cls(save_path=load_dir, variant=variant) + # Pass edges="links" to match the key used when saving with node_link_data(edges="links") + graph_instance.kg = json_graph.node_link_graph(loaded_data["graph"], edges="links") for _, node_data in graph_instance.kg.nodes(data=True): if "source_ids" in node_data and isinstance(node_data["source_ids"], list): node_data["source_ids"] = set(node_data["source_ids"]) + 
node_data.setdefault("entity_id", "") + node_data.setdefault("canonical_id", "") + node_data.setdefault("entity_role", "provisional") + node_data.setdefault("aliases", []) + node_data.setdefault("mapping_confidence", 0.0) + node_data.setdefault("ontology_source", "") for _, _, edge_data in graph_instance.kg.edges(data=True): if "source_ids" in edge_data and isinstance(edge_data["source_ids"], list): @@ -361,6 +609,108 @@ def load_from_dir(cls, load_dir: str, variant: str = None) -> "Graph": ) return graph_instance + # ------------------------------------------------------------------ # + # Phase 3: Global graph methods # + # ------------------------------------------------------------------ # + + def save_to_global_graph(self, falkordb_cfg, tenant_id: str) -> None: + """ + Merge this document's KG nodes into the tenant-level global FalkorDB graph. + Each entity is stored as a canonical node; a HAS_MENTION edge links the + canonical node back to its document source. + """ + try: + from falkordb import FalkorDB + cfg = falkordb_cfg + conn_kwargs = {"host": cfg.host, "port": cfg.port} + if cfg.username: + conn_kwargs["username"] = cfg.username + if cfg.password: + conn_kwargs["password"] = cfg.password + client = FalkorDB(**conn_kwargs) + global_graph = client.select_graph(cfg.graph_name_for_global(tenant_id)) + except Exception as e: + log.error(f"Failed to connect to FalkorDB global graph: {e}") + raise + + def _esc(value) -> str: + return str(value or "").replace("\\", "\\\\").replace("'", "\\'") + + for node_name, data in self.kg.nodes(data=True): + ename = _esc(data.get("entity_name", "")) + etype = _esc(data.get("entity_type", "")) + desc = _esc(data.get("description", "")) + entity_id = _esc(data.get("entity_id", "")) + canonical_id = _esc(data.get("canonical_id", "")) + entity_role = _esc(data.get("entity_role", "provisional")) + ontology_source = _esc(data.get("ontology_source", "")) + aliases_json = _esc(json.dumps(data.get("aliases", []), 
ensure_ascii=False)) + mapping_confidence = float(data.get("mapping_confidence", 0.0) or 0.0) + nname = _esc(node_name) + doc_id_esc = _esc(self.doc_id) + # MERGE canonical entity node + global_graph.query( + f"MERGE (n:Entity {{node_name: '{nname}'}}) " + f"ON CREATE SET n.entity_name='{ename}', n.entity_type='{etype}', n.description='{desc}', " + f"n.entity_id='{entity_id}', n.canonical_id='{canonical_id}', n.entity_role='{entity_role}', " + f"n.aliases_json='{aliases_json}', n.mapping_confidence={mapping_confidence}, " + f"n.ontology_source='{ontology_source}' " + f"CREATE (n)-[:HAS_MENTION {{doc_id: '{doc_id_esc}'}}]->(n)" + ) + log.info(f"Merged {self.kg.number_of_nodes()} nodes into global graph for tenant '{tenant_id}'.") + + def get_global_subgraph( + self, + falkordb_cfg, + tenant_id: str, + accessible_doc_ids: List[str], + ) -> nx.Graph: + """ + Fetch a cross-document subgraph from the global FalkorDB graph, + restricted to documents in accessible_doc_ids. + """ + try: + from falkordb import FalkorDB + cfg = falkordb_cfg + conn_kwargs = {"host": cfg.host, "port": cfg.port} + if cfg.username: + conn_kwargs["username"] = cfg.username + if cfg.password: + conn_kwargs["password"] = cfg.password + client = FalkorDB(**conn_kwargs) + global_graph = client.select_graph(cfg.graph_name_for_global(tenant_id)) + except Exception as e: + log.error(f"Failed to connect to FalkorDB global graph: {e}") + raise + + # Use a relationship variable r to filter by doc_id property on HAS_MENTION edges + doc_filter = "['" + "', '".join(d.replace("'", "\\'") for d in accessible_doc_ids) + "']" + result = global_graph.query( + f"MATCH (n:Entity)-[r:HAS_MENTION]->(n) " + f"WHERE r.doc_id IN {doc_filter} RETURN DISTINCT n" + ) + subgraph = nx.Graph() + for rec in result.result_set: + node = rec[0] + props = node.properties + node_name = props["node_name"] + aliases_json = props.get("aliases_json", "[]") + try: + aliases = json.loads(aliases_json) if isinstance(aliases_json, str) else 
list(aliases_json or []) + except json.JSONDecodeError: + aliases = [] + subgraph.add_node(node_name, + entity_name=props.get("entity_name", ""), + entity_type=props.get("entity_type", ""), + entity_id=props.get("entity_id", ""), + canonical_id=props.get("canonical_id", ""), + entity_role=props.get("entity_role", "provisional"), + aliases=aliases, + mapping_confidence=float(props.get("mapping_confidence", 0.0) or 0.0), + ontology_source=props.get("ontology_source", ""), + description=props.get("description", "")) + return subgraph + if __name__ == "__main__": # Example usage diff --git a/Core/configs/docling_config.py b/Core/configs/docling_config.py new file mode 100644 index 0000000..b024059 --- /dev/null +++ b/Core/configs/docling_config.py @@ -0,0 +1,37 @@ +from dataclasses import dataclass, field +from typing import List + + +@dataclass +class DoclingConfig: + """Configuration for the Docling document parser. + + This config is an alternative to MinerU. Select it in the top-level YAML + by setting ``parser: docling``. + + Attributes: + ocr_engine: OCR back-end to use. Accepted values: + ``"easyocr"`` (default), ``"tesseract"``, ``"rapidocr"``. + force_full_page_ocr: When *True* every page is passed through OCR + even if selectable text is present. Strongly recommended for + scanned documents. + images_scale: Render scale factor for page images (1.0 ≈ 72 DPI). + Increase to 2.0 for higher-resolution figure/table crops. + lang: ISO 639-1 language hint forwarded to the OCR engine. + """ + + ocr_engine: str = "easyocr" + force_full_page_ocr: bool = False + images_scale: float = 2.0 + lang: str = "en" + + def __post_init__(self): + valid_engines = ("easyocr", "tesseract", "rapidocr") + if self.ocr_engine not in valid_engines: + raise ValueError( + f"Unsupported ocr_engine: '{self.ocr_engine}'. " + f"Choose one of {valid_engines}." 
+ ) + if self.images_scale <= 0: + raise ValueError(f"images_scale must be positive, got {self.images_scale}.") + diff --git a/Core/configs/embedding_config.py b/Core/configs/embedding_config.py index b7ec689..16b4dc7 100644 --- a/Core/configs/embedding_config.py +++ b/Core/configs/embedding_config.py @@ -1,5 +1,10 @@ +import os from dataclasses import dataclass +from dotenv import load_dotenv + +load_dotenv() + @dataclass class EmbeddingConfig: @@ -10,8 +15,16 @@ class EmbeddingConfig: model_name: str = "Qwen/Qwen3-Embedding-0.6B" max_length: int = 8192 device: str = "cuda:2" - - + + def __post_init__(self): if self.backend not in ["local", "ollama", "openai"]: raise ValueError(f"Unsupported backend: {self.backend}") + # Resolve 'env' placeholder → read from DASHSCOPE_API_KEY environment variable + if self.api_key == "env": + self.api_key = os.environ.get("DASHSCOPE_API_KEY", "") + if not self.api_key: + raise ValueError( + "EmbeddingConfig.api_key is 'env' but DASHSCOPE_API_KEY " + "environment variable is not set." 
+ ) diff --git a/Core/configs/entity_resolution_config.py b/Core/configs/entity_resolution_config.py new file mode 100644 index 0000000..14deab1 --- /dev/null +++ b/Core/configs/entity_resolution_config.py @@ -0,0 +1,13 @@ +from pydantic import BaseModel, Field + + +class EntityResolutionConfig(BaseModel): + """Tenant/global canonical entity resolution settings.""" + + enabled: bool = False + similarity_threshold: float = Field(default=0.85, ge=0.0, le=1.0) + top_k: int = Field(default=1, ge=1, le=20) + global_vdb_dir: str = "./indices" + collection_name: str = "global_kg_collection" + canonical_only: bool = False + sync_to_global_graph: bool = False \ No newline at end of file diff --git a/Core/configs/falkordb_config.py b/Core/configs/falkordb_config.py new file mode 100644 index 0000000..306fff9 --- /dev/null +++ b/Core/configs/falkordb_config.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass, field + + +@dataclass +class FalkorDBConfig: + """Configuration for FalkorDB graph database connection.""" + + host: str = "localhost" + port: int = 6379 + username: str = "" + password: str = "" + graph_prefix: str = "bookrag" + + def graph_name_for_doc(self, tenant_id: str, doc_id: str) -> str: + """Return the FalkorDB graph name for a per-document KG.""" + return f"{self.graph_prefix}:{tenant_id}:doc:{doc_id}" + + def graph_name_for_global(self, tenant_id: str) -> str: + """Return the FalkorDB graph name for a tenant-level global KG.""" + return f"{self.graph_prefix}:{tenant_id}:global" + diff --git a/Core/configs/llm_config.py b/Core/configs/llm_config.py index 79a2c8b..7aed0cb 100644 --- a/Core/configs/llm_config.py +++ b/Core/configs/llm_config.py @@ -1,4 +1,8 @@ +import os from dataclasses import dataclass +from dotenv import load_dotenv + +load_dotenv() # ensure .env is loaded when used outside the API server @dataclass class LLMConfig: @@ -15,3 +19,13 @@ class LLMConfig: def __post_init__(self): if self.backend not in ["openai", "ollama"]: raise 
ValueError(f"Unsupported backend: {self.backend}") + # Allow api_key to be resolved from environment variable + if not self.api_key or self.api_key in ("env", "ENV"): + env_key = os.environ.get("CHAT_API_KEY", "") or os.environ.get("DASHSCOPE_API_KEY", "") + if env_key: + self.api_key = env_key + else: + raise ValueError( + "LLM api_key is empty/env but neither CHAT_API_KEY nor " + "DASHSCOPE_API_KEY environment variable is set." + ) diff --git a/Core/configs/mongodb_config.py b/Core/configs/mongodb_config.py new file mode 100644 index 0000000..8c50a9b --- /dev/null +++ b/Core/configs/mongodb_config.py @@ -0,0 +1,15 @@ +from dataclasses import dataclass + + +@dataclass +class MongoDBConfig: + """Configuration for MongoDB connection.""" + + uri: str = "mongodb://localhost:27017" + db_prefix: str = "bookrag" + system_db: str = "bookrag_system" + + def tenant_db_name(self, tenant_id: str) -> str: + """Return the MongoDB database name for a given tenant.""" + return f"{self.db_prefix}_{tenant_id}" + diff --git a/Core/configs/ontology_config.py b/Core/configs/ontology_config.py new file mode 100644 index 0000000..4294a2c --- /dev/null +++ b/Core/configs/ontology_config.py @@ -0,0 +1,104 @@ +import json +import os +from typing import List, Optional + +import yaml +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator + +from Core.utils.ontology_utils import normalize_entity_name, normalize_entity_type + + +class OntologyEntityConfig(BaseModel): + model_config = ConfigDict(populate_by_name=True) + + entity_id: str = Field(alias="ontology_id") + entity_name: str = Field(alias="canonical_name") + entity_type: str + description: str = "" + aliases: List[str] = Field(default_factory=list) + keywords: List[str] = Field(default_factory=list) + status: str = "active" + ontology_source: str = "config" + + @field_validator("entity_id") + @classmethod + def _validate_entity_id(cls, value: str) -> str: + value = str(value or "").strip() + if not value: 
+ raise ValueError("ontology_id cannot be empty") + return value + + @field_validator("entity_name") + @classmethod + def _normalize_entity_name(cls, value: str) -> str: + normalized = normalize_entity_name(value) + if not normalized: + raise ValueError("canonical_name cannot be empty") + return normalized + + @field_validator("entity_type") + @classmethod + def _normalize_entity_type(cls, value: str) -> str: + normalized = normalize_entity_type(value) + if not normalized: + raise ValueError("entity_type cannot be empty") + return normalized + + @field_validator("aliases", "keywords") + @classmethod + def _normalize_terms(cls, values: List[str]) -> List[str]: + normalized: List[str] = [] + for value in values or []: + item = normalize_entity_name(value) + if item and item not in normalized: + normalized.append(item) + return normalized + + @model_validator(mode="after") + def _ensure_canonical_alias(self) -> "OntologyEntityConfig": + if self.entity_name not in self.aliases: + self.aliases.insert(0, self.entity_name) + return self + + +class OntologyConfig(BaseModel): + model_config = ConfigDict(populate_by_name=True) + + enabled: bool = False + path: Optional[str] = None + entities: List[OntologyEntityConfig] = Field(default_factory=list) + mapping_threshold: float = Field(default=1.0, ge=0.0, le=1.0) + allow_provisional_entities: bool = True + use_query_resolution: bool = True + + @model_validator(mode="after") + def _load_entities_from_path(self) -> "OntologyConfig": + if not self.path: + return self + + loaded_entities = self._read_entities_file(self.path) + merged = list(self.entities) + seen_ids = {entity.entity_id for entity in merged} + for entity in loaded_entities: + if entity.entity_id not in seen_ids: + merged.append(entity) + seen_ids.add(entity.entity_id) + self.entities = merged + return self + + @staticmethod + def _read_entities_file(path: str) -> List[OntologyEntityConfig]: + if not os.path.exists(path): + raise FileNotFoundError(f"Ontology file 
not found: {path}") + + with open(path, "r", encoding="utf-8") as handle: + if path.endswith(".json"): + payload = json.load(handle) + else: + payload = yaml.safe_load(handle) + + raw_entities = payload.get("entities", payload) if isinstance(payload, dict) else payload + if not isinstance(raw_entities, list): + raise ValueError("Ontology file must contain a list of entities or an 'entities' list") + + return [OntologyEntityConfig(**entity) for entity in raw_entities] \ No newline at end of file diff --git a/Core/configs/rerank_config.py b/Core/configs/rerank_config.py index d1b5eea..5b2a700 100644 --- a/Core/configs/rerank_config.py +++ b/Core/configs/rerank_config.py @@ -1,10 +1,28 @@ +import os from dataclasses import dataclass +from dotenv import load_dotenv + +load_dotenv() + @dataclass class RerankerConfig: model_name: str = "Qwen/Qwen3-Reranker-0.6B" max_length: int = 8192 device: str = "cuda:2" - backend: str = "local" # Options: 'local', 'vllm' + backend: str = "local" # Options: 'local', 'vllm', 'jina', 'openai' api_base: str = "http://localhost:8011/v1" + api_key: str = "" + + def __post_init__(self): + if self.backend not in ["local", "vllm", "jina", "openai"]: + raise ValueError(f"Unsupported reranker backend: {self.backend}") + # Resolve 'env' placeholder → read from RERANKER_API_KEY or JINA_API_KEY environment variable + if self.api_key == "env": + self.api_key = os.environ.get("RERANKER_API_KEY", "") or os.environ.get("JINA_API_KEY", "") + if not self.api_key: + raise ValueError( + "RerankerConfig.api_key is 'env' but neither RERANKER_API_KEY " + "nor JINA_API_KEY environment variable is set." 
+ ) diff --git a/Core/configs/system_config.py b/Core/configs/system_config.py index fdc45d5..89fd561 100644 --- a/Core/configs/system_config.py +++ b/Core/configs/system_config.py @@ -1,13 +1,21 @@ +import os + import yaml from Core.configs.mineru_config import MinerU +from Core.configs.docling_config import DoclingConfig +from Core.configs.entity_resolution_config import EntityResolutionConfig from Core.configs.llm_config import LLMConfig from Core.configs.tree_config import TreeConfig from Core.configs.graph_config import GraphConfig +from Core.configs.ontology_config import OntologyConfig from Core.configs.vlm_config import VLMConfig from Core.configs.rag_config import RAGConfig from Core.configs.vdb_config import VDBConfig +from Core.configs.falkordb_config import FalkorDBConfig +from Core.configs.mongodb_config import MongoDBConfig from pydantic import BaseModel, Field -from typing import Optional +from typing import Optional, Any +from datetime import datetime class SystemConfig(BaseModel): @@ -21,10 +29,16 @@ class SystemConfig(BaseModel): vlm: VLMConfig = Field(default_factory=VLMConfig) mineru: MinerU = Field(default_factory=MinerU) + # Parser selection: "mineru" (default) or "docling" + parser: Optional[str] = "mineru" + docling: Optional[DoclingConfig] = Field(default_factory=DoclingConfig) + # Index Configurations tree: TreeConfig = Field(default_factory=TreeConfig) graph: GraphConfig = Field(default_factory=GraphConfig) vdb: VDBConfig = Field(default_factory=VDBConfig) + ontology: OntologyConfig = Field(default_factory=OntologyConfig) + entity_resolution: EntityResolutionConfig = Field(default_factory=EntityResolutionConfig) # Other Index selection index_type: Optional[str] = "gbc" # Options: "gbc", "tree", "vanilla", "bm25", "raptor", "pdf_vanilla" @@ -38,11 +52,31 @@ class SystemConfig(BaseModel): pdf_path: Optional[str] = "/home/wangshu/multimodal/GBC-RAG/test/double_paper.pdf" save_path: Optional[str] = 
"/home/wangshu/multimodal/GBC-RAG/test/tree_index" - # # 新增: 专门用于存放评估结果的根目录 - # evaluation_output_path: Optional[str] = Field( - # default="/home/wangshu/multimodal/GBC-RAG/test/tree_index/evaluation_results", - # description="Root directory to save evaluation results." - # ) + # Multi-tenant identifiers (optional for backward compatibility) + tenant_id: Optional[str] = None + doc_id: Optional[str] = None + + # Document language hint (ISO 639-1). Used by the legal-heading + # detector, incomplete-paragraph heuristics, and other language-aware + # pipeline stages. Set to "auto" (default) for automatic detection + # from extracted text, or an explicit code like "en" or "id". + document_lang: Optional[str] = Field( + default="auto", + description="ISO 639-1 language code of the document content, or " + "'auto' for automatic detection. " + "Supported: auto, en, id, de, fr, es, pt, it, nl, th, zh, ja, ko, ar.", + ) + + # Document temporal metadata (optional, for recency-aware RAG) + document_date: Optional[datetime] = Field( + default=None, + description="Original authoring/publishing date of the document. 
" + "Used for temporal awareness in cross-document RAG queries.", + ) + + # Database configurations + falkordb: Any = Field(default_factory=FalkorDBConfig) + mongodb: Any = Field(default_factory=MongoDBConfig) def load_system_config(path: str = "../configs/default.yaml") -> SystemConfig: @@ -53,5 +87,21 @@ def load_system_config(path: str = "../configs/default.yaml") -> SystemConfig: rag_data = raw_config["rag"] raw_config["rag"] = {"strategy_config": rag_data} + ontology_data = raw_config.get("ontology") + if isinstance(ontology_data, dict) and ontology_data.get("path"): + ontology_path = ontology_data["path"] + if not os.path.isabs(ontology_path): + ontology_data["path"] = os.path.abspath( + os.path.join(os.path.dirname(path), ontology_path) + ) + + entity_resolution_data = raw_config.get("entity_resolution") + if isinstance(entity_resolution_data, dict) and entity_resolution_data.get("global_vdb_dir"): + global_vdb_dir = entity_resolution_data["global_vdb_dir"] + if not os.path.isabs(global_vdb_dir): + entity_resolution_data["global_vdb_dir"] = os.path.abspath( + os.path.join(os.path.dirname(path), global_vdb_dir) + ) + cfg = SystemConfig(**raw_config) return cfg diff --git a/Core/configs/vlm_config.py b/Core/configs/vlm_config.py index c1bf0a7..e76802c 100644 --- a/Core/configs/vlm_config.py +++ b/Core/configs/vlm_config.py @@ -1,11 +1,27 @@ +import os from dataclasses import dataclass +from dotenv import load_dotenv + +load_dotenv() # ensure .env is loaded when used outside the API server @dataclass class VLMConfig: - backend: str = "ollama" # "qwen", "gpt", "ollama" - model_name: str = "qwen2.5vl:6k" + backend: str = "gpt" # "qwen", "gpt", "ollama" + model_name: str = "Qwen/Qwen3.5-35B-A3B-AWQ" max_tokens: int = 6000 - temperature: float = 0.7 - api_key: str = "None" - api_base: str = "http://localhost:11434" + temperature: float = 0.1 + api_key: str = "openai" + api_base: str = "http://localhost:8003/v1" + + def __post_init__(self): + # Allow api_key to be 
resolved from environment variable + if not self.api_key or self.api_key in ("env", "ENV"): + env_key = os.environ.get("VL_API_KEY", "") or os.environ.get("DASHSCOPE_API_KEY", "") + if env_key: + self.api_key = env_key + else: + raise ValueError( + "VLM api_key is empty/env but neither VL_API_KEY nor " + "DASHSCOPE_API_KEY environment variable is set." + ) diff --git a/Core/construct_index.py b/Core/construct_index.py index 684c62b..22812c8 100644 --- a/Core/construct_index.py +++ b/Core/construct_index.py @@ -21,7 +21,7 @@ from Core.utils.file_utils import save_indexing_stats -def construct_GBC_index(cfg: SystemConfig, tree_only: bool = False): +def construct_gbc_index(cfg: SystemConfig, tree_only: bool = False): """ Construct the GBC index from the document tree and knowledge graph. @@ -57,7 +57,7 @@ def construct_GBC_index(cfg: SystemConfig, tree_only: bool = False): graph_index = build_knowledge_graph(tree_index, cfg) # The 'kg_extraction' stage is recorded inside build_knowledge_graph - gbc_index = GBC(config=cfg, graph_index=graph_index, TreeIndex=tree_index) + gbc_index = GBC(config=cfg, graph_index=graph_index, tree_index=tree_index) gbc_index.save_gbc_index() # rebuild vdb @@ -143,32 +143,9 @@ def compute_mm_reranker(cfg: SystemConfig, group: pd.DataFrame): tree_index = build_tree_from_pdf(cfg) compute_mm_embedding(cfg, tree_index) - + compute_mm_embedding_question(cfg, group) if __name__ == "__main__": - print("test") - - # parser = argparse.ArgumentParser(description="Extract text content from PDF files.") - # parser.add_argument( - # "--config_path", - # type=str, - # default="/home/wangshu/multimodal/GBC-RAG/config/gbc.yaml", - # help="Path to the configuration file.", - # ) - - # args = parser.parse_args() - - # cfg = load_system_config(args.config_path) - - # if not os.path.exists(cfg.save_path): - # os.makedirs(cfg.save_path) - # log.info(f"Created directory: {cfg.save_path}") - # else: - # log.info(f"Directory already exists: {cfg.save_path}") 
- - # construct_vdb(cfg) - - # gbc_index = construct_GBC_index(cfg) - # log.info("GBC index construction completed successfully.") + print("Use main.py to run indexing workflows.") diff --git a/Core/pipelines/doc_tree_builder.py b/Core/pipelines/doc_tree_builder.py index 3eee2d1..48d0ce6 100644 --- a/Core/pipelines/doc_tree_builder.py +++ b/Core/pipelines/doc_tree_builder.py @@ -2,7 +2,9 @@ from Core.pipelines.tree_node_builder import create_node_by_type from Core.pipelines.outline_extractor import extract_pdf_outline_in_chunks from Core.pipelines.pdf_refiner import pdf_info_refiner -from Core.provider.extract_pdf_info import parse_doc, merge_middle_content +from Core.pipelines.legal_heading_detector import detect_legal_headings, detect_document_language +# MinerU imports are deferred to avoid top-level dependency on doclayout_yolo +# when using the Docling parser. See the ``else`` branch below. from Core.pipelines.tree_node_summary import generate_tree_node_summary from Core.configs.system_config import SystemConfig from Core.provider.llm import LLM @@ -82,51 +84,83 @@ def build_tree_from_pdf(cfg: SystemConfig, reforce: bool = False) -> DocumentTre tree_index = DocumentTree(meta_dict=meta_dict, cfg=cfg) - backend = cfg.mineru.backend - server_url = cfg.mineru.server_url - method = cfg.mineru.method + import json + + parser = getattr(cfg, "parser", "mineru") or "mineru" base_file_name = Path(cfg.pdf_path).stem - tmp_save_path = os.path.join( - cfg.save_path, method, f"{base_file_name}_merged_content.json" - ) - if os.path.exists(tmp_save_path) and not reforce: - # tmp load pdf_list - import json + # Each parser writes its cached pdf_list to its own sub-directory so the + # two caches never collide even when the same save_path is reused. 
+ if parser == "docling": + tmp_save_path = os.path.join( + cfg.save_path, "docling", f"{base_file_name}_merged_content.json" + ) + else: + method = cfg.mineru.method + tmp_save_path = os.path.join( + cfg.save_path, method, f"{base_file_name}_merged_content.json" + ) + if os.path.exists(tmp_save_path) and not reforce: + # Load cached pdf_list (parser-agnostic from this point on) with open(tmp_save_path, "rb") as f: pdf_list = json.load(f) - print(f"Loaded content from {tmp_save_path}") + log.info(f"Loaded cached content from {tmp_save_path}") else: - # Extract content from the PDF file - log.info(f"Extracting content from {cfg.pdf_path}...") - middle_json, content_list = parse_doc( - cfg.pdf_path, - output_dir=cfg.save_path, - backend=backend, - method=method, - server_url=server_url, - lang=cfg.mineru.lang, - ) - - file_name = str(Path(cfg.pdf_path).stem) - save_dir = os.path.join(cfg.save_path, method) - pdf_list = merge_middle_content( - middle_json, - content_list, - parse_dir=os.path.join(cfg.save_path, method), - save_dir=save_dir, - file_name=file_name, - ) # merge middle json content with content list. - - # tmp pdf_list save for fast test - log.info(f"Content extracted and saved to {tmp_save_path}") + log.info(f"Extracting content from '{cfg.pdf_path}' using parser='{parser}' …") + + if parser == "docling": + from Core.provider.extract_pdf_info_docling import parse_doc_with_docling + + pdf_list = parse_doc_with_docling( + pdf_path=cfg.pdf_path, + output_dir=cfg.save_path, + cfg=cfg.docling, + ) + # Persist for subsequent fast loads (mirrors what merge_middle_content + # does for the MinerU path). 
+ docling_cache_dir = os.path.join(cfg.save_path, "docling") + os.makedirs(docling_cache_dir, exist_ok=True) + with open(tmp_save_path, "w", encoding="utf-8") as f: + json.dump(pdf_list, f, ensure_ascii=False, indent=4) + log.info(f"[Docling] Extracted content cached to '{tmp_save_path}'") + else: + # ── MinerU (default) ────────────────────────────────────────── + from Core.provider.extract_pdf_info import parse_doc, merge_middle_content + + backend = cfg.mineru.backend + server_url = cfg.mineru.server_url + method = cfg.mineru.method + + middle_json, content_list = parse_doc( + cfg.pdf_path, + output_dir=cfg.save_path, + backend=backend, + method=method, + server_url=server_url, + lang=cfg.mineru.lang, + ) + + file_name = str(Path(cfg.pdf_path).stem) + save_dir = os.path.join(cfg.save_path, method) + pdf_list = merge_middle_content( + middle_json, + content_list, + parse_dir=os.path.join(cfg.save_path, method), + save_dir=save_dir, + file_name=file_name, + ) + log.info(f"[MinerU] Extracted content saved to '{tmp_save_path}'") llm = LLM(cfg.llm) vlm = VLM(cfg.vlm) if cfg.tree.use_vlm else None - pdf_list = pdf_info_refiner(pdf_list, llm) - title_outline = extract_pdf_outline_in_chunks(pdf_list, llm) + lang = getattr(cfg, "document_lang", "auto") or "auto" + if lang == "auto": + lang = detect_document_language(pdf_list, fallback="en") + pdf_list = pdf_info_refiner(pdf_list, llm, lang=lang) + pdf_list = detect_legal_headings(pdf_list, lang=lang) + title_outline = extract_pdf_outline_in_chunks(pdf_list, llm, lang=lang) tree_index = construct_tree_index( tree_index=tree_index, pdf_list=pdf_list, title_outline=title_outline ) diff --git a/Core/pipelines/kg_builder.py b/Core/pipelines/kg_builder.py index 887ffd2..a2f615c 100644 --- a/Core/pipelines/kg_builder.py +++ b/Core/pipelines/kg_builder.py @@ -3,6 +3,7 @@ from Core.pipelines.kg_extractor import KGExtractor from Core.pipelines.kg_refiner import KGRefiner from Core.configs.system_config import SystemConfig +from 
Core.utils.ontology_utils import align_entities_to_ontology from Core.provider.llm import LLM from Core.provider.vlm import VLM @@ -12,28 +13,6 @@ log = logging.getLogger(__name__) -# print log for test -from rich.logging import RichHandler - -import os -import time - -# log_dir = "/home/wangshu/multimodal/GBC-RAG/test/index_qwen3/logs" -# if not os.path.exists(log_dir): -# os.makedirs(log_dir) -# log_file = os.path.join(log_dir, f"kg_builder_{time.strftime('%Y%m%d_%H%M%S')}.log") -# logging.basicConfig( -# level="INFO", -# format="%(asctime)s - %(levelname)s - %(message)s", -# datefmt="%H:%M:%S", -# handlers=[ -# RichHandler(rich_tracebacks=True), # RichHandler 会继续使用自己的漂亮格式 -# logging.FileHandler( -# log_file, encoding="utf-8" -# ), # FileHandler 会使用上面定义的 format -# ], -# ) - def build_knowledge_graph(tree: DocumentTree, cfg: SystemConfig): """ @@ -60,7 +39,15 @@ def build_knowledge_graph(tree: DocumentTree, cfg: SystemConfig): else: variant = None - graph_index = Graph(save_path=cfg.save_path, variant=variant) + # Pass FalkorDB config when tenant/doc IDs are set + falkordb_cfg = cfg.falkordb if (cfg.tenant_id and cfg.doc_id) else None + graph_index = Graph( + save_path=cfg.save_path, + variant=variant, + tenant_id=cfg.tenant_id, + doc_id=cfg.doc_id, + falkordb_cfg=falkordb_cfg, + ) kg_extractor = KGExtractor( cfg_graph=cfg.graph, llm=llm, vlm=vlm, save_path=cfg.save_path @@ -74,60 +61,38 @@ def build_knowledge_graph(tree: DocumentTree, cfg: SystemConfig): kg_extract_res = [] - batch_process = True - - if batch_process: - log.info("Batch processing is enabled for knowledge graph extraction.") - batch_nodes = [] - batch_title_nodes = [] - batch_title_paths = [] - batch_sibling_nodes = [] - for node in tree.nodes: - # for node in tree.nodes[:30]: - if node == tree.root_node: - # Skip the root node since it doesn't have any other information - continue - if node.type == NodeType.TITLE: - # For title nodes, we collect the path and sibling nodes for batch processing - 
title_path = tree.get_path_from_root(node.index_id) - sibling_nodes = tree.get_sibling_nodes(node.index_id) - batch_title_nodes.append(node) - batch_title_paths.append(title_path) - batch_sibling_nodes.append(sibling_nodes) - else: - # For other nodes, we collect them for batch processing - batch_nodes.append(node) - - # Process title nodes in batches - if batch_title_nodes: - log.info("Processing title nodes in batches...") - res_dict = kg_extractor.batch_extract_titles( - nodes=batch_title_nodes, - title_paths=batch_title_paths, - sibling_nodes_list=batch_sibling_nodes, - ) - kg_extract_res.extend(res_dict) - - if batch_nodes: - log.info("Processing non-title nodes in batches...------") - res_dict = kg_extractor.batch_extract_kg(nodes=batch_nodes) - kg_extract_res.extend(res_dict) - - # resort the results based on node index - kg_extract_res.sort(key=lambda x: x.get("node_idx", -1)) - else: - for node in tree.nodes[:30]: - # Extract entities and relationships from the node - if node == tree.root_node: - # Skip the root node since it doesn't have any other information - continue - if node.type == NodeType.TITLE: - title_path = tree.get_path_from_root(node.index_id) - sibling_nodes = tree.get_sibling_nodes(node.index_id) - res_dict = kg_extractor.extract_title(node, title_path, sibling_nodes) - else: - res_dict = kg_extractor.extract_kg(node) - kg_extract_res.append(res_dict) + log.info("Batch processing is enabled for knowledge graph extraction.") + batch_nodes = [] + batch_title_nodes = [] + batch_title_paths = [] + batch_sibling_nodes = [] + for node in tree.nodes: + if node == tree.root_node: + continue + if node.type == NodeType.TITLE: + title_path = tree.get_path_from_root(node.index_id) + sibling_nodes = tree.get_sibling_nodes(node.index_id) + batch_title_nodes.append(node) + batch_title_paths.append(title_path) + batch_sibling_nodes.append(sibling_nodes) + else: + batch_nodes.append(node) + + if batch_title_nodes: + log.info("Processing title nodes in 
batches...") + res_dict = kg_extractor.batch_extract_titles( + nodes=batch_title_nodes, + title_paths=batch_title_paths, + sibling_nodes_list=batch_sibling_nodes, + ) + kg_extract_res.extend(res_dict) + + if batch_nodes: + log.info("Processing non-title nodes in batches...------") + res_dict = kg_extractor.batch_extract_kg(nodes=batch_nodes) + kg_extract_res.extend(res_dict) + + kg_extract_res.sort(key=lambda x: x.get("node_idx", -1)) log.info("Knowledge graph extraction completed.") log.info(f"Extracted {len(kg_extract_res)} nodes from the document tree.") @@ -137,17 +102,22 @@ def build_knowledge_graph(tree: DocumentTree, cfg: SystemConfig): log.info(f"Knowledge graph extraction cost: {kg_extraction_cost}") for res in kg_extract_res: + entities, relationships = align_entities_to_ontology( + entities=res.get("entities", []), + relationships=res.get("relations", []), + ontology_cfg=cfg.ontology, + ) if cfg.graph.refine_type == "basic": log.info("Using basic KG refinement.") kg_refiner.basic_kg_refiner( - entities=res.get("entities", []), - relationships=res.get("relations", []), + entities=entities, + relationships=relationships, source_id=res.get("node_idx", -1), ) elif cfg.graph.refine_type == "advanced": kg_refiner.advanced_kg_refiner( - entities=res.get("entities", []), - relationships=res.get("relations", []), + entities=entities, + relationships=relationships, source_id=res.get("node_idx", -1), ) @@ -161,7 +131,6 @@ def build_knowledge_graph(tree: DocumentTree, cfg: SystemConfig): kg_refiner.close() return graph_index - # graph_index.save_graph() if __name__ == "__main__": diff --git a/Core/pipelines/kg_refiner.py b/Core/pipelines/kg_refiner.py index 1e7ad97..e0ee575 100644 --- a/Core/pipelines/kg_refiner.py +++ b/Core/pipelines/kg_refiner.py @@ -55,6 +55,7 @@ def __init__( max_length=graph_config.embedding_config.max_length, device=graph_config.embedding_config.device, api_base=graph_config.embedding_config.api_base, + 
api_key=graph_config.embedding_config.api_key, ) self.reranker = TextRerankerProvider( model_name=graph_config.reranker_config.model_name, @@ -62,6 +63,7 @@ def __init__( max_length=graph_config.reranker_config.max_length, backend=graph_config.reranker_config.backend, api_base=graph_config.reranker_config.api_base, + api_key=graph_config.reranker_config.api_key, ) # delete the old vector database if exists self.vdb_path = os.path.join(save_path, "kg_vdb") @@ -122,6 +124,51 @@ def get_latest_entity_name(self, node_name: str) -> str: # Recursively find the latest entity name return self.get_latest_entity_name(latest_node_name) + def _merge_entity_metadata( + self, + primary_entity: Entity, + secondary_entity: Entity, + description: str, + entity_name: Optional[str] = None, + entity_type: Optional[str] = None, + ) -> Entity: + merged_aliases = [] + for alias in [ + primary_entity.entity_name, + secondary_entity.entity_name, + *primary_entity.aliases, + *secondary_entity.aliases, + ]: + alias = str(alias or "").strip() + if alias and alias not in merged_aliases: + merged_aliases.append(alias) + + entity_role = primary_entity.entity_role or secondary_entity.entity_role or "provisional" + if "canonical" in {primary_entity.entity_role, secondary_entity.entity_role}: + entity_role = "canonical" + + entity_id = primary_entity.entity_id or secondary_entity.entity_id + canonical_id = ( + primary_entity.canonical_id + or secondary_entity.canonical_id + or entity_id + ) + + return Entity( + entity_name=entity_name or primary_entity.entity_name, + entity_type=entity_type or primary_entity.entity_type, + description=description, + entity_id=entity_id, + canonical_id=canonical_id, + entity_role=entity_role, + aliases=merged_aliases, + mapping_confidence=max( + primary_entity.mapping_confidence, secondary_entity.mapping_confidence + ), + ontology_source=primary_entity.ontology_source or secondary_entity.ontology_source, + 
source_ids=set(primary_entity.source_ids).union(secondary_entity.source_ids), + ) + def entity_merge( self, old_entity: Entity, @@ -143,28 +190,65 @@ def entity_merge( # 2. merge the two entities old_node_name = self.graph_index.get_node_name_from_entity(old_entity) new_node_name = self.graph_index.get_node_name_from_entity(new_entity) - if (old_node_name == new_node_name) or merged_to_old_entity: + canonical_entity = None + if old_entity.entity_role == "canonical": + canonical_entity = old_entity + elif new_entity.entity_role == "canonical": + canonical_entity = new_entity + + if canonical_entity is not None: + log.info("merged with canonical entity metadata preserved") + description = ( + old_entity.description + self._DESCRIPTION_SEP_ + new_entity.description + ) + primary_entity = canonical_entity + secondary_entity = new_entity if canonical_entity == old_entity else old_entity + merged_entity = self._merge_entity_metadata( + primary_entity=primary_entity, + secondary_entity=secondary_entity, + description=description, + ) + elif (old_node_name == new_node_name) or merged_to_old_entity: # 2.1 if have the same node name, or merged to old entity, # Directly merged if the entity name and type are the same log.info("merged directly") new_description = ( old_entity.description + self._DESCRIPTION_SEP_ + new_entity.description ) - merged_entity = Entity( - entity_name=old_entity.entity_name, - entity_type=old_entity.entity_type, + merged_entity = self._merge_entity_metadata( + primary_entity=old_entity, + secondary_entity=new_entity, description=new_description, - source_ids=set(old_entity.source_ids).union(new_entity.source_ids), ) else: # 2.2 if have different node name, use LLM to create new entity log.info("merged by LLM summarization") - old_entity_dict = old_entity.model_dump(exclude={"source_ids"}) + old_entity_dict = old_entity.model_dump( + exclude={ + "source_ids", + "entity_id", + "canonical_id", + "entity_role", + "aliases", + "mapping_confidence", + 
"ontology_source", + } + ) old_entity_dict["description"] = truncate_description( old_entity_dict["description"], max_words=200 ) - new_entity_dict = new_entity.model_dump(exclude={"source_ids"}) + new_entity_dict = new_entity.model_dump( + exclude={ + "source_ids", + "entity_id", + "canonical_id", + "entity_role", + "aliases", + "mapping_confidence", + "ontology_source", + } + ) new_entity_dict["description"] = truncate_description( new_entity_dict["description"], max_words=200 ) @@ -191,11 +275,12 @@ def entity_merge( old_entity.description + self._DESCRIPTION_SEP_ + new_entity.description ) - merged_entity = Entity( + merged_entity = self._merge_entity_metadata( + primary_entity=old_entity, + secondary_entity=new_entity, + description=description, entity_name=res_entity.entity_name, entity_type=res_entity.entity_type, - description=description, - source_ids=set(old_entity.source_ids).union(new_entity.source_ids), ) # 2.3 If the llm generated merged entity is another entity (entityC) in the graph, @@ -228,11 +313,12 @@ def entity_merge( old_entity_type=entity_c.entity_type, new_entity=old_entity, ) - merged_entity.description += ( - self._DESCRIPTION_SEP_ + entity_c.description - ) - merged_entity.source_ids = set(merged_entity.source_ids).union( - entity_c.source_ids + merged_entity = self._merge_entity_metadata( + primary_entity=merged_entity, + secondary_entity=entity_c, + description=( + merged_entity.description + self._DESCRIPTION_SEP_ + entity_c.description + ), ) # since entity_c is the same as merged_entity, no need to update alias map @@ -291,6 +377,7 @@ def basic_kg_refiner( entity.entity_name, entity.entity_type ) merged_entity = self.entity_merge(existing_entity, entity) + entity_map[entity.entity_name] = merged_entity entity_map[existing_entity.entity_name] = merged_entity add_entity_list.append(merged_entity) @@ -299,12 +386,23 @@ def basic_kg_refiner( # Update relationships for rel in relationships: - if rel.src_entity_name in entity_map: - 
rel.src_entity_name = entity_map[rel.src_entity_name].entity_name - src_type = entity_map[rel.src_entity_name].entity_type - if rel.tgt_entity_name in entity_map: - rel.tgt_entity_name = entity_map[rel.tgt_entity_name].entity_name - tgt_type = entity_map[rel.tgt_entity_name].entity_type + old_src_name = rel.src_entity_name + old_tgt_name = rel.tgt_entity_name + src_type = None + tgt_type = None + if old_src_name in entity_map: + mapped_src = entity_map[old_src_name] + rel.src_entity_name = mapped_src.entity_name + src_type = mapped_src.entity_type + if old_tgt_name in entity_map: + mapped_tgt = entity_map[old_tgt_name] + rel.tgt_entity_name = mapped_tgt.entity_name + tgt_type = mapped_tgt.entity_type + if src_type is None or tgt_type is None: + log.info( + f"Relationship {rel} has missing entity types. Skipping this relationship." + ) + continue self.graph_index.add_kg_edge(rel=rel, src_type=src_type, tgt_type=tgt_type) def get_vdb_meta_data(self, entity: Entity) -> dict: @@ -316,11 +414,7 @@ def get_vdb_meta_data(self, entity: Entity) -> dict: dict: The metadata dictionary without source_ids. since vdb does not support list type. 
""" - return { - "entity_name": entity.entity_name, - "entity_type": entity.entity_type, - "description": entity.description, - } + return entity.to_vdb_metadata() def add_entities_to_vdb(self, entities: List[Entity]) -> None: """ @@ -475,7 +569,7 @@ def metadata_str(meta_data: dict): else: break - if len(sel_entities) == ranked_results: + if len(sel_entities) == len(ranked_results): # 4.3 If all entities are selected, return empty list return [] @@ -538,23 +632,20 @@ def er_selection_by_llm( if 0 <= select_id < len(similar_entities): # Log the selection and reason - log.info( - f"LLM selected entity ID: {select_id}, " f"Reason: {res.explanation}" - ) + log.info(f"LLM selected entity ID: {select_id}, Reason: {res.explanation}") # Log the new entity and the selected similar entity log.info("New Entity Info:") log.info( f"Entity Name: {new_entity.entity_name}, Entity Type: {new_entity.entity_type}" ) - log.info(f"LLM selected Entity Info:") + log.info("LLM selected Entity Info:") log.info( f"Entity Name: {similar_entities[select_id].entity_name}, Entity Type: {similar_entities[select_id].entity_type}" ) return similar_entities[select_id] - else: - print(f"Warning: LLM returned an out-of-bounds ID: {select_id}") - return None + print(f"Warning: LLM returned an out-of-bounds ID: {select_id}") + return None def entity_resolution(self, new_entity: Entity) -> Entity: """ @@ -682,10 +773,9 @@ def process_relationships( "Skipping this relationship." ) continue - else: - self.graph_index.add_kg_edge( - rel=rel, src_type=src_type, tgt_type=tgt_type - ) + self.graph_index.add_kg_edge( + rel=rel, src_type=src_type, tgt_type=tgt_type + ) def _debug_check_num(self): num_node_graph = len(self.graph_index.kg.nodes()) @@ -870,7 +960,6 @@ def refine_entities(self): f"Refined {len(add_entities)} entities and added them to the vector database." 
) self._debug_check_num() - return def refine_relation(self): # delete self loop in graph index diff --git a/Core/pipelines/legal_heading_detector.py b/Core/pipelines/legal_heading_detector.py new file mode 100644 index 0000000..6917f61 --- /dev/null +++ b/Core/pipelines/legal_heading_detector.py @@ -0,0 +1,230 @@ +""" +Legal heading detector — identifies structured headings in legal documents +using language-aware regex patterns. + +Supported languages: + - ``en`` — English (Article, Section, Chapter, Part, Clause, Schedule, …) + - ``id`` — Bahasa Indonesia (BAB, Bagian, Paragraf, Pasal, Ayat, …) + +The detector is intentionally conservative: it only promotes items whose +``text_level`` is ``-1`` (body text) and whose *entire trimmed text* matches +a known legal heading pattern. This avoids false-positives on sentences +that merely *mention* a legal keyword. +""" + +from __future__ import annotations + +import logging +import re +from typing import Dict, List, Optional, Tuple + +try: + from langdetect import detect as _langdetect_detect + from langdetect import DetectorFactory + + # Make langdetect deterministic + DetectorFactory.seed = 0 + _HAS_LANGDETECT = True +except ImportError: # pragma: no cover + _HAS_LANGDETECT = False + +log = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Pattern registry +# --------------------------------------------------------------------------- +# Each entry is ``(compiled_regex, assigned_text_level)``. +# Lower ``text_level`` values indicate higher hierarchy levels so that the +# outline extractor can nest them correctly. 
+# +# ``text_level`` assignments (per language): +# 0 = top-level division (BAB / Chapter / Title / Part) +# 1 = section / Bagian +# 2 = sub-section / Paragraf +# 3 = article / Pasal / Section / Clause +# 4 = clause / verse / Ayat / sub-clause +# --------------------------------------------------------------------------- + +_NUM = r"(?:[0-9]+(?:\.[0-9]+)*|[IVXLCDM]+|[ivxlcdm]+)" +_ALPHA = r"[A-Za-z]" + +# ── English patterns ────────────────────────────────────────────────────── + +_EN_PATTERNS: List[Tuple[re.Pattern, int]] = [ + # Level 0 – top-level + (re.compile(rf"^(?:TITLE|Title)\s+{_NUM}\.?(?:\s.*)?$"), 0), + (re.compile(rf"^(?:PART|Part)\s+{_NUM}\.?(?:\s.*)?$"), 0), + (re.compile(rf"^(?:CHAPTER|Chapter)\s+{_NUM}\.?(?:\s.*)?$"), 0), + # Level 1 – section + (re.compile(rf"^(?:DIVISION|Division)\s+{_NUM}\.?(?:\s.*)?$"), 1), + (re.compile(rf"^(?:ARTICLE|Article)\s+{_NUM}\.?(?:\s.*)?$"), 1), + (re.compile(rf"^(?:SCHEDULE|Schedule)\s+{_NUM}\.?(?:\s.*)?$"), 1), + # Level 2 – sub-section + (re.compile(rf"^(?:SECTION|Section)\s+{_NUM}\.?(?:\s.*)?$"), 2), + (re.compile(rf"^(?:ANNEX|Annex)\s+{_ALPHA}\.?(?:\s.*)?$"), 2), + # Level 3 – clause + (re.compile(rf"^(?:CLAUSE|Clause)\s+{_NUM}\.?(?:\s.*)?$"), 3), + (re.compile(rf"^§\s*{_NUM}\.?(?:\s.*)?$"), 3), + # Level 4 – sub-clause + (re.compile(rf"^(?:SUB-?CLAUSE|Sub-?clause)\s+{_NUM}\.?(?:\s.*)?$"), 4), +] + +# ── Bahasa Indonesia patterns ──────────────────────────────────────────── + +_ID_PATTERNS: List[Tuple[re.Pattern, int]] = [ + # Level 0 – BAB (Chapter) + (re.compile(rf"^BAB\s+{_NUM}\.?(?:\s.*)?$", re.IGNORECASE), 0), + # Level 1 – Bagian (Part/Section) + (re.compile(rf"^Bagian\s+(?:Kesatu|Kedua|Ketiga|Keempat|Kelima|Keenam|Ketujuh|Kedelapan|Kesembilan|Kesepuluh|{_NUM})\.?(?:\s.*)?$", re.IGNORECASE), 1), + # Level 2 – Paragraf (Paragraph/Sub-section) + (re.compile(rf"^Paragraf\s+{_NUM}\.?(?:\s.*)?$", re.IGNORECASE), 2), + # Level 3 – Pasal (Article) + (re.compile(rf"^Pasal\s+{_NUM}\.?(?:\s.*)?$", re.IGNORECASE), 
3), + # Level 4 – Ayat (Verse/Clause) — usually inline, rarely standalone + (re.compile(rf"^Ayat\s+\({_NUM}\)\.?(?:\s.*)?$", re.IGNORECASE), 4), +] + +# ── Language → patterns map ────────────────────────────────────────────── + +_LANG_PATTERNS: Dict[str, List[Tuple[re.Pattern, int]]] = { + "en": _EN_PATTERNS, + "id": _ID_PATTERNS, +} + + +def _match_heading( + text: str, patterns: List[Tuple[re.Pattern, int]] +) -> Optional[int]: + """Return the ``text_level`` if *text* matches any pattern, else ``None``.""" + stripped = text.strip() + if not stripped: + return None + for pat, level in patterns: + if pat.match(stripped): + return level + return None + + +def detect_legal_headings( + pdf_list: List[Optional[Dict]], + lang: str = "en", +) -> List[Optional[Dict]]: + """Scan *pdf_list* and promote body-text items that match legal heading + patterns to headings by setting their ``text_level``. + + Parameters + ---------- + pdf_list: + The pipeline's intermediate list of content dicts. + lang: + ISO 639-1 language code. Falls back to English patterns if the + requested language is not registered. + + Returns + ------- + The same *pdf_list* (mutated in-place) with matched items promoted. 
+ """ + patterns = _LANG_PATTERNS.get(lang, _LANG_PATTERNS.get("en", [])) + if not patterns: + log.warning("No legal heading patterns for lang='%s'; skipping.", lang) + return pdf_list + + promoted = 0 + for content in pdf_list: + if content is None: + continue + # Only consider body-text items (text_level == -1 or absent) + if content.get("type") != "text": + continue + current_level = content.get("text_level", -1) + if current_level >= 0: + continue # already a heading — don't override parser + + text = content.get("text", "") + level = _match_heading(text, patterns) + if level is not None: + content["text_level"] = level + promoted += 1 + log.debug("Promoted to heading (level %d): %s", level, text[:80]) + + log.info( + "Legal heading detection (lang=%s): promoted %d items to headings.", + lang, + promoted, + ) + return pdf_list + + + +# --------------------------------------------------------------------------- +# Automatic language detection from extracted text +# --------------------------------------------------------------------------- +_SUPPORTED_LANGS = {"en", "id", "de", "fr", "es", "pt", "it", "nl", "th", "zh", "ja", "ko", "ar"} + + +def detect_document_language( + pdf_list: List[Optional[Dict]], + fallback: str = "en", + sample_chars: int = 2000, +) -> str: + """Detect the dominant language of the document from its extracted text. + + Collects the first *sample_chars* characters of body text from *pdf_list* + and runs ``langdetect`` on the sample. + + Parameters + ---------- + pdf_list: + The pipeline's intermediate list of content dicts. + fallback: + Language code to return when detection fails or ``langdetect`` is not + installed. + sample_chars: + Maximum number of characters to sample for detection. + + Returns + ------- + An ISO 639-1 language code (e.g. ``"en"``, ``"id"``). 
+ """ + if not _HAS_LANGDETECT: + log.warning("langdetect is not installed; falling back to '%s'.", fallback) + return fallback + + # Collect body text (text_level == -1 or absent) + sample_parts: list[str] = [] + collected = 0 + for content in pdf_list: + if content is None: + continue + if content.get("type") != "text": + continue + if content.get("text_level", -1) >= 0: + continue # skip headings — they may be too short / formulaic + text = content.get("text", "").strip() + if not text: + continue + sample_parts.append(text) + collected += len(text) + if collected >= sample_chars: + break + + sample = " ".join(sample_parts) + if len(sample) < 20: + log.info("Not enough text for language detection; falling back to '%s'.", fallback) + return fallback + + try: + detected = _langdetect_detect(sample) + # langdetect may return sub-tags like "zh-cn"; normalise + lang = detected.split("-")[0].lower() + if lang not in _SUPPORTED_LANGS: + log.info( + "Detected language '%s' is not supported; falling back to '%s'.", + lang, fallback, + ) + return fallback + log.info("Auto-detected document language: '%s'.", lang) + return lang + except Exception as exc: + log.warning("Language detection failed (%s); falling back to '%s'.", exc, fallback) + return fallback \ No newline at end of file diff --git a/Core/pipelines/outline_extractor.py b/Core/pipelines/outline_extractor.py index cce06b1..d069179 100644 --- a/Core/pipelines/outline_extractor.py +++ b/Core/pipelines/outline_extractor.py @@ -1,7 +1,7 @@ from typing import Optional, List from Core.provider.llm import LLM -from Core.prompts.outline_prompt import OUTLINE_EXTRACTION_PROMPT, OutlineExtraction +from Core.prompts.outline_prompt import OUTLINE_EXTRACTION_PROMPT, OutlineExtraction, get_outline_prompt from Core.utils.utils import get_json_content, num_tokens, enumerate_pdf_list import logging @@ -43,7 +43,7 @@ def outline_refine(outline_list: List[Optional[str]]) -> List[Optional[str]]: return outline_list -def 
extract_pdf_outline(pdf_list: List[Optional[str]], llm: LLM) -> List[Optional[str]]: +def extract_pdf_outline(pdf_list: List[Optional[str]], llm: LLM, lang: str = "en") -> List[Optional[str]]: """Extract the outline from the PDF content.""" pdf_length = len(pdf_list) @@ -77,7 +77,7 @@ def extract_pdf_outline(pdf_list: List[Optional[str]], llm: LLM) -> List[Optiona json_format_title = get_json_content(title_list, selected_columns=SELECT_COLS) - prompt = OUTLINE_EXTRACTION_PROMPT.format(json_title=json_format_title) + prompt = get_outline_prompt(lang).format(json_title=json_format_title) log.info(f"number of token in prompt: {num_tokens(prompt)}") response: OutlineExtraction = llm.get_json_completion(prompt, OutlineExtraction) outline_list = [] @@ -199,14 +199,15 @@ def calculate_effective_height(entry: dict) -> float: def extract_pdf_outline_in_chunks( - pdf_list: List[Optional[str]], llm: LLM + pdf_list: List[Optional[str]], llm: LLM, lang: str = "en" ) -> List[Optional[str]]: """ Extracts the PDF outline by processing titles in chunks with improved, stateful context building to ensure accurate hierarchical structure. """ # 1. More precise token budget calculation (Your Point 1 & 4) - prompt_template_tokens = num_tokens(OUTLINE_EXTRACTION_PROMPT.format(json_title="")) + outline_prompt = get_outline_prompt(lang) + prompt_template_tokens = num_tokens(outline_prompt.format(json_title="")) # Leave a 400-token buffer for the LLM's response generation and other overhead available_tokens_for_titles = llm.config.max_tokens - prompt_template_tokens - 500 available_tokens_for_titles = min(2000, available_tokens_for_titles) @@ -313,7 +314,7 @@ def extract_pdf_outline_in_chunks( # 4. 
Call LLM with the constructed prompt prompt_payload = context_titles + new_titles_for_chunk json_format_title = get_json_content(prompt_payload, SELECT_COLS) - prompt = OUTLINE_EXTRACTION_PROMPT.format(json_title=json_format_title) + prompt = outline_prompt.format(json_title=json_format_title) log.info(f"Number of tokens in prompt: {num_tokens(prompt)}") try: diff --git a/Core/pipelines/pdf_refiner.py b/Core/pipelines/pdf_refiner.py index a8a13af..d6a7065 100644 --- a/Core/pipelines/pdf_refiner.py +++ b/Core/pipelines/pdf_refiner.py @@ -16,73 +16,82 @@ log = logging.getLogger(__name__) -def is_likely_incomplete_paragraph(text: str) -> bool: +# --------------------------------------------------------------------------- +# Language-specific settings for incomplete-paragraph detection +# --------------------------------------------------------------------------- +_LANG_TERMINAL_PUNCTUATION = { + "en": r"[.!?]", + "id": r"[.!?]", # Bahasa Indonesia uses Latin punctuation + "de": r"[.!?]", + "fr": r"[.!?\u00BB]", # » can close a quote-sentence + "es": r"[.!?\u00BF\u00A1]", + "pt": r"[.!?]", + "it": r"[.!?]", + "nl": r"[.!?]", + "th": r"[\u0E2F\u0E46.!?]", # Thai ฯ, ๆ, plus Latin + "zh": r"[\u3002\uFF01\uFF1F.!?]", # 。!? + "ja": r"[\u3002\uFF01\uFF1F.!?]", + "ko": r"[\uFF0E\uFF01\uFF1F.!?]", + "ar": r"[\u061F\u06D4.!?]", # ؟ ۔ +} + +_LANG_INCOMPLETE_ENDINGS: dict[str, set[str]] = { + "en": { + "and", "or", "but", "because", "although", "however", + "if", "while", "when", "to", "for", "in", "of", "with", + "on", "as", "at", "by", "from", "such", "the", "a", "an", + }, + "id": { + "dan", "atau", "tetapi", "namun", "karena", "sebab", + "jika", "apabila", "bahwa", "dengan", "untuk", "pada", + "dari", "oleh", "yang", "di", "ke", "se", "ini", "itu", + }, +} + + +def is_likely_incomplete_paragraph(text: str, lang: str = "en") -> bool: """ - Determine if an English paragraph is likely incomplete (truncated due to page/column breaks). 
+ Determine if a paragraph is likely incomplete (truncated due to + page / column breaks). Language-aware. - :param text: input text to check - :return: bool, True if the paragraph is likely incomplete, False otherwise - e.g. "He said, "This method is the best." -> False (complete) - e.g. "The quick brown fox jumps over the lazy dog and" -> True (incomplete) + :param text: input text to check + :param lang: ISO 639-1 language code (default ``"en"``) + :return: ``True`` if the paragraph is likely incomplete """ if not text: return False # 空文本不是我们关心的“不完整段落” text = text.strip() - # Rule 1: Filter out very short strings. They are likely standalone titles/captions, not paragraphs to be merged. + # Rule 1: Filter out very short strings — likely titles/captions. if len(text.split()) < 5 or len(text) < 25: return False - # --- From here, we look for clear signals of INCOMPLETENESS --- - - # Rule 2: Ending with a hyphen is a very strong signal of incompleteness (a word was split). + # Rule 2: Ending with a hyphen → word was split across pages. if text.endswith("-"): return True - # Handles cases like "said he," or "he said." + # Strip trailing quotes so we can inspect the "real" ending. cleaned_text = re.sub(r"['\"]+$", "", text) - # Rule 3: Ending with a comma, colon, or semicolon is also a strong signal. + # Rule 3: Ending with comma / colon / semicolon → strong signal. if cleaned_text.endswith((",", ":", ";")): return True - # Rule 4: Not ending with a standard terminal punctuation mark. This is the most common case. - if not re.search(r"[.!?]$", cleaned_text): + # Rule 4: Not ending with terminal punctuation (language-aware). + terminal_re = _LANG_TERMINAL_PUNCTUATION.get(lang, r"[.!?]") + if not re.search(terminal_re + r"$", cleaned_text): return True - # Rule 5: Ending with a common connector word (even if mistakenly followed by a period). - # e.g., "The quick brown fox jumps over the lazy dog and." 
- incomplete_endings = { - "and", - "or", - "but", - "because", - "although", - "however", - "if", - "while", - "when", - "to", - "for", - "in", - "of", - "with", - "on", - "as", - "at", - "by", - "from", - "such", - "the", - "a", - "an", - } + # Rule 5: Ending with a connector word (language-aware). + incomplete_endings = _LANG_INCOMPLETE_ENDINGS.get( + lang, _LANG_INCOMPLETE_ENDINGS.get("en", set()) + ) last_word_match = re.findall(r"\b\w+\b", cleaned_text) if last_word_match and last_word_match[-1].lower() in incomplete_endings: return True - # If no "incomplete" signals were triggered, we assume it's complete. + # No "incomplete" signals triggered → assume complete. return False @@ -355,7 +364,7 @@ def merge_text_and_mark_invalid(prev_content: dict, merged_list: list[dict]): print(f"{prev_content['text']}") # Print first 100 chars for debug -def text_merger(pdf_list: list[Optional[str]], llm: LLM) -> list[Optional[str]]: +def text_merger(pdf_list: list[Optional[str]], llm: LLM, lang: str = "en") -> list[Optional[str]]: incomplete_paragraphs = [] # for循环的逻辑可以更清晰地组织 for content in pdf_list: @@ -368,7 +377,7 @@ def text_merger(pdf_list: list[Optional[str]], llm: LLM) -> list[Optional[str]]: # The logic is now direct: "if the paragraph is likely incomplete, add it." 
text = content.get("text", "") - if is_likely_incomplete_paragraph(text): + if is_likely_incomplete_paragraph(text, lang=lang): incomplete_paragraphs.append(content) if not incomplete_paragraphs: @@ -744,7 +753,7 @@ def truncate_ocr_error_refiner( return pdf_list -def pdf_info_refiner(pdf_list: list[Optional[str]], llm: LLM) -> list[Optional[str]]: +def pdf_info_refiner(pdf_list: list[Optional[str]], llm: LLM, lang: str = "en") -> list[Optional[str]]: # Heuristic refiner for "-" error in OCR pdf_list = dash_line_refiner(pdf_list) # Heuristic refiner for OCR Error @@ -754,7 +763,7 @@ def pdf_info_refiner(pdf_list: list[Optional[str]], llm: LLM) -> list[Optional[s pdf_list = enumerate_pdf_list(pdf_list) # Then we refine the PDF information by merging incomplete paragraphs and tables - pdf_list = text_merger(pdf_list, llm) + pdf_list = text_merger(pdf_list, llm, lang=lang) pdf_list = table_merger(pdf_list, llm) # After merging, we need to re-enumerate the pdf_list diff --git a/Core/prompts/gbc_prompt.py b/Core/prompts/gbc_prompt.py index e7080ac..c29a936 100644 --- a/Core/prompts/gbc_prompt.py +++ b/Core/prompts/gbc_prompt.py @@ -515,3 +515,43 @@ class SecEXPSelection(BaseModel): --- Provided Analyses from Different Sources --- {partial_answers_str} """ + + +# ── Language-aware prompt helpers ───────────────────────────────────────────── + +_LANG_NAMES = { + "en": "English", + "id": "Bahasa Indonesia", + "zh": "Chinese", + "ja": "Japanese", + "ko": "Korean", + "ms": "Malay", + "th": "Thai", + "vi": "Vietnamese", + "de": "German", + "fr": "French", + "es": "Spanish", + "pt": "Portuguese", + "ar": "Arabic", +} + + +def get_lang_instruction(lang: str) -> str: + """Return a short instruction telling the LLM which language to respond in. + + Returns an empty string for English (default) to keep prompts minimal. + """ + if not lang or lang == "en": + return "" + lang_name = _LANG_NAMES.get(lang, lang) + return f" Respond in {lang_name}." 
+ + +def get_iter_generation_sys_prompt(lang: str = "en") -> str: + """V2 iterative generation system prompt, with optional language instruction.""" + return ITER_GENERATION_SYS_PROMPT + get_lang_instruction(lang) + + +def get_synthesis_sys_prompt(lang: str = "en") -> str: + """V2 synthesis system prompt, with optional language instruction.""" + return SYNTHESIS_SYS_PROMPT.rstrip() + get_lang_instruction(lang) + "\n" diff --git a/Core/prompts/kg_prompt.py b/Core/prompts/kg_prompt.py index 96eb466..448c186 100644 --- a/Core/prompts/kg_prompt.py +++ b/Core/prompts/kg_prompt.py @@ -139,7 +139,7 @@ class EntityExtractionResult(BaseModel): - relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity Format each relationship as ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) -3. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter. +3. Return output as a single list of all the entities and relationships identified in steps 1 and 2. Preserve entity names exactly as they appear in the source text (do not translate them). Use **{record_delimiter}** as the list delimiter. 4. When finished, output {completion_delimiter} diff --git a/Core/prompts/outline_prompt.py b/Core/prompts/outline_prompt.py index 0240881..61f4698 100644 --- a/Core/prompts/outline_prompt.py +++ b/Core/prompts/outline_prompt.py @@ -12,6 +12,43 @@ class OutlineExtraction(BaseModel): outline: List[OutlineExtractionOutput] +# ── Language-specific supplements for outline extraction ────────────────────── + +_OUTLINE_LANG_SUPPLEMENTS = { + "id": """ +### Additional Guidance — Indonesian Legal Documents + +This document is written in **Bahasa Indonesia** and follows Indonesian legal drafting conventions. 
+Use the following standard hierarchy when determining heading levels: + +| Pattern | Meaning | Recommended Level | +|---|---|---| +| **BAB** (+ Roman numeral) | Chapter | `level: 1` (top-level section) | +| **Bagian** (+ ordinal word: Kesatu, Kedua …) | Part | `level: 2` | +| **Paragraf** (+ number) | Sub-section | `level: 3` | +| **Pasal** (+ number) | Article | `level: 3` or `level: 4` | + +**Key rules for Indonesian legal documents:** +- The document title (e.g., "UNDANG-UNDANG …", "PERATURAN …") is **always `level: 0`**. +- BAB headings use **Roman numerals** (BAB I, BAB II, …) and are major divisions (`level: 1`). +- Bagian headings use **Indonesian ordinal words** (Bagian Kesatu, Bagian Kedua, …) — they subdivide a BAB (`level: 2`). +- Paragraf headings use **Arabic numerals** (Paragraf 1, Paragraf 2, …) — they subdivide a Bagian (`level: 3`). +- Pasal headings use **Arabic numerals** (Pasal 1, Pasal 2, …) — they are articles within the nearest parent section. +- Preserve all heading text exactly as it appears; do **not** translate. +""", +} + + +def get_outline_prompt(lang: str = "en") -> str: + """Return the outline extraction prompt, optionally augmented with + language-specific guidance (e.g. Indonesian legal hierarchy).""" + supplement = _OUTLINE_LANG_SUPPLEMENTS.get(lang, "") + if supplement: + # Insert the supplement just before the final instruction line + return OUTLINE_EXTRACTION_PROMPT + supplement + return OUTLINE_EXTRACTION_PROMPT + + # 2219 tokens OUTLINE_EXTRACTION_PROMPT = """ You are an expert in document structure analysis. Your task is to generate a structured outline based on a given list of text segments. 
diff --git a/Core/provider/embedding.py b/Core/provider/embedding.py index 580d1b3..9e8ead9 100644 --- a/Core/provider/embedding.py +++ b/Core/provider/embedding.py @@ -453,6 +453,7 @@ def __init__( device: str = "auto", max_length: int = 8192, api_base: str = None, + api_key: str = "empty", ): self.model_name = model_name @@ -486,7 +487,7 @@ def __init__( elif self.backend == "ollama": self.device = "ollama_service" elif self.backend == "openai": - self.client = openai.OpenAI(api_key="empty", base_url=api_base) + self.client = openai.OpenAI(api_key=api_key, base_url=api_base) else: raise ValueError( f"Unsupported backend: '{self.backend}'. Choose 'local' or 'ollama'." @@ -615,13 +616,18 @@ def embed_texts(self, texts: List[str]) -> np.ndarray: elif self.backend == "openai": BATCH_SIZE = 8 - n = len(texts) + # Sanitize: replace empty/whitespace-only texts with a placeholder + sanitized_texts = [ + t if t and t.strip() else "" + for t in texts + ] + n = len(sanitized_texts) all_embeddings = [] num_batches = math.ceil(n / BATCH_SIZE) for i in tqdm( range(0, n, BATCH_SIZE), desc="Embedding texts", total=num_batches ): - chunk = texts[i : i + BATCH_SIZE] + chunk = sanitized_texts[i : i + BATCH_SIZE] response = self.client.embeddings.create( model=self.model_name, input=chunk, diff --git a/Core/provider/extract_pdf_info_docling.py b/Core/provider/extract_pdf_info_docling.py new file mode 100644 index 0000000..989e11d --- /dev/null +++ b/Core/provider/extract_pdf_info_docling.py @@ -0,0 +1,253 @@ +"""Docling-based document parser adapter for BookRAG. + +This module provides :func:`parse_doc_with_docling`, which converts a PDF (or +any Docling-supported format) into the canonical ``pdf_list`` format consumed +by the rest of the BookRAG pipeline. 
+ +``pdf_list`` schema (one dict per content block): + - ``type`` : ``"text"`` | ``"image"`` | ``"table"`` | ``"equation"`` + - ``text`` : str — text content (text / equation nodes) + - ``text_level`` : int — ``-1`` = body text; ``0`` = chapter; ``1`` = section; … + - ``page_idx`` : int — 0-indexed page number + - ``pdf_id`` : int — sequential 0-based index matching position in list + - ``img_path`` : str — absolute path to saved PNG (image / table nodes) + - ``image_caption`` : list[str] + - ``image_footnote``: list[str] + - ``table_caption`` : list[str] + - ``table_footnote``: list[str] + - ``table_body`` : str — markdown table string + - ``middle_json`` : dict — raw Docling item metadata (non-critical) +""" + +from __future__ import annotations + +import json +import logging +import os +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from Core.configs.docling_config import DoclingConfig + +log = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Public entry point +# --------------------------------------------------------------------------- + +def parse_doc_with_docling( + pdf_path: str, + output_dir: str, + cfg: "DoclingConfig", +) -> list[dict]: + """Parse *pdf_path* with Docling and return a BookRAG-compatible ``pdf_list``. + + Args: + pdf_path: Path to the PDF (or other supported format) to parse. + output_dir: Root save directory; images are written to + ``/docling/images/``. + cfg: :class:`~Core.configs.docling_config.DoclingConfig` instance + that controls OCR engine, resolution, etc. + + Returns: + A flat list of dicts matching the ``pdf_list`` schema. The item at + position *i* always satisfies ``item["pdf_id"] == i``. 
+ """ + # ------------------------------------------------------------------ setup + from docling.datamodel.base_models import InputFormat + from docling.datamodel.pipeline_options import PdfPipelineOptions + from docling.document_converter import DocumentConverter, PdfFormatOption + from docling_core.types.doc import PictureItem, SectionHeaderItem, TableItem, TextItem + + img_dir = os.path.join(output_dir, "docling", "images") + os.makedirs(img_dir, exist_ok=True) + + pipeline_options = _build_pipeline_options(cfg) + + converter = DocumentConverter( + format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)} + ) + + log.info(f"[Docling] Converting '{pdf_path}' …") + conv_res = converter.convert(pdf_path) + doc = conv_res.document + doc_stem = Path(pdf_path).stem + + # ---------------------------------------------------------------- iterate + pdf_list: list[dict] = [] + pdf_id = 0 # 0-based; MUST equal position in list + pic_counter = 0 + tbl_counter = 0 + + for element, _level in doc.iterate_items(): + page_idx = _get_page_idx(element) + + if isinstance(element, SectionHeaderItem): + # SectionHeaderItem.level is 1-indexed (1 = chapter, 2 = section …) + heading_level = getattr(element, "level", 1) + pdf_list.append({ + "type": "text", + "text": element.text or "", + "text_level": max(0, heading_level - 1), # convert to 0-indexed + "page_idx": page_idx, + "pdf_id": pdf_id, + "middle_json": {"docling_label": str(element.label)}, + }) + + elif isinstance(element, TableItem): + tbl_counter += 1 + img_path = _save_element_image( + element, doc, img_dir, f"{doc_stem}-table-{tbl_counter}.png" + ) + captions, footnotes = _extract_captions_footnotes(element) + pdf_list.append({ + "type": "table", + "text": "", + "text_level": -1, + "page_idx": page_idx, + "pdf_id": pdf_id, + "img_path": img_path, + "table_caption": captions, + "table_footnote": footnotes, + "table_body": element.export_to_markdown(), + "middle_json": {"docling_label": "table"}, + 
}) + + elif isinstance(element, PictureItem): + pic_counter += 1 + img_path = _save_element_image( + element, doc, img_dir, f"{doc_stem}-picture-{pic_counter}.png" + ) + captions, footnotes = _extract_captions_footnotes(element) + pdf_list.append({ + "type": "image", + "text": "", + "text_level": -1, + "page_idx": page_idx, + "pdf_id": pdf_id, + "img_path": img_path, + "image_caption": captions, + "image_footnote": footnotes, + "middle_json": {"docling_label": "picture"}, + }) + + elif isinstance(element, TextItem): + # Covers TextItem, ListItem, CodeItem, FormulaItem, FootnoteItem, etc. + label_str = str(getattr(element, "label", "text")).lower() + is_formula = "formula" in label_str or "equation" in label_str + text = getattr(element, "text", "") or "" + if not text.strip(): + continue # skip empty elements — don't increment pdf_id + pdf_list.append({ + "type": "equation" if is_formula else "text", + "text": text, + "text_level": -1, + "page_idx": page_idx, + "pdf_id": pdf_id, + "middle_json": {"docling_label": label_str}, + }) + + else: + # Any remaining element types (e.g. 
page headers, key-value pairs) + text = getattr(element, "text", "") or "" + if not text.strip(): + continue + pdf_list.append({ + "type": "text", + "text": text, + "text_level": -1, + "page_idx": page_idx, + "pdf_id": pdf_id, + "middle_json": {"docling_label": str(getattr(element, "label", "unknown"))}, + }) + + pdf_id += 1 # advance only when an item was appended + + log.info(f"[Docling] Extracted {len(pdf_list)} content blocks from '{doc_stem}'.") + return pdf_list + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _build_pipeline_options(cfg: "DoclingConfig"): + """Construct :class:`PdfPipelineOptions` from *cfg*.""" + from docling.datamodel.pipeline_options import PdfPipelineOptions + + opts = PdfPipelineOptions() + opts.do_ocr = True + opts.do_table_structure = True + opts.generate_picture_images = True + opts.images_scale = cfg.images_scale + + ocr_opts = _build_ocr_options(cfg) + if ocr_opts is not None: + opts.ocr_options = ocr_opts + + return opts + + +def _build_ocr_options(cfg: "DoclingConfig"): + """Return an OCR-options object matching *cfg.ocr_engine*, or *None*.""" + engine = cfg.ocr_engine.lower() + force = cfg.force_full_page_ocr + lang = cfg.lang + + try: + if engine == "easyocr": + from docling.datamodel.pipeline_options import EasyOcrOptions + return EasyOcrOptions(force_full_page_ocr=force, lang=[lang]) + elif engine == "tesseract": + from docling.datamodel.pipeline_options import TesseractCliOcrOptions + return TesseractCliOcrOptions(force_full_page_ocr=force, lang=lang) + elif engine == "rapidocr": + from docling.datamodel.pipeline_options import RapidOcrOptions + return RapidOcrOptions(force_full_page_ocr=force) + except ImportError as exc: + log.warning( + f"[Docling] Could not import OCR options for engine '{engine}': {exc}. " + "Falling back to Docling's default OCR settings." 
+ ) + return None + + +def _get_page_idx(element) -> int: + """Extract 0-indexed page number from a Docling element's provenance.""" + try: + if element.prov: + return max(0, element.prov[0].page_no - 1) # Docling page_no is 1-indexed + except (AttributeError, IndexError): + pass + return 0 + + +def _save_element_image(element, doc, img_dir: str, filename: str) -> str: + """Save element image as PNG and return its absolute path (or '' on failure).""" + img_path = os.path.join(img_dir, filename) + try: + pil_image = element.get_image(doc) + if pil_image is not None: + pil_image.save(img_path, "PNG") + return img_path + except Exception as exc: + log.warning(f"[Docling] Could not save image '{filename}': {exc}") + return "" + + +def _extract_captions_footnotes(element) -> tuple[list[str], list[str]]: + """Pull caption and footnote text lists from a Docling element.""" + captions: list[str] = [] + footnotes: list[str] = [] + for ref in getattr(element, "captions", []): + text = getattr(ref, "text", None) or getattr(ref, "__str__", lambda: "")() + if text and text.strip(): + captions.append(text.strip()) + for ref in getattr(element, "footnotes", []): + text = getattr(ref, "text", None) or getattr(ref, "__str__", lambda: "")() + if text and text.strip(): + footnotes.append(text.strip()) + return captions, footnotes + diff --git a/Core/provider/llm.py b/Core/provider/llm.py index f353edc..1f293aa 100644 --- a/Core/provider/llm.py +++ b/Core/provider/llm.py @@ -109,7 +109,10 @@ def get_completion( "max_tokens": get_max_output_tokens(messages, self.max_tokens), "frequency_penalty": self.frequency_penalty, "presence_penalty": self.presence_penalty, - "extra_body": {"chat_template_kwargs": {"enable_thinking": False}}, + "extra_body": { + "enable_thinking": False, # DashScope + "chat_template_kwargs": {"enable_thinking": False}, # vLLM + }, } if json_response: parameters["response_format"] = {"type": "json_object"} @@ -167,7 +170,8 @@ def get_json_completion( 
messages=messages, response_format=schema, extra_body={ - "chat_template_kwargs": {"enable_thinking": think_mode}, + "enable_thinking": think_mode, # DashScope + "chat_template_kwargs": {"enable_thinking": think_mode}, # vLLM }, ) diff --git a/Core/provider/rerank.py b/Core/provider/rerank.py index d3aae30..a0a8aa8 100644 --- a/Core/provider/rerank.py +++ b/Core/provider/rerank.py @@ -28,6 +28,7 @@ def __init__( torch_dtype: torch.dtype = torch.bfloat16, backend: str = "local", api_base: str = None, + api_key: str = None, ): """ 初始化Reranker Provider。 @@ -38,12 +39,14 @@ def __init__( max_length (int): 模型的最大序列长度。 use_flash_attention (bool): 是否尝试使用Flash Attention 2以提升性能。 torch_dtype (torch.dtype): 模型加载时使用的数据类型,如 torch.bfloat16。 - backend (str): 后端类型,支持 'local' 和 'vllm'。 + backend (str): 后端类型,支持 'local', 'vllm', 'jina'。 api_base (str): 如果使用 'vllm' 后端,必须提供API基础URL。 + api_key (str): API key for cloud backends (e.g., Jina). """ self.model_name = model_name self.max_length = max_length self.backend = backend.lower() + self.api_key = api_key # ========================================================== # vLLM 后端逻辑 @@ -97,9 +100,38 @@ def __init__( log.info("Reranker model loaded successfully.") + # ========================================================== + # Jina Reranker API 后端 + # ========================================================== + elif self.backend == "jina": + if not api_key: + raise ValueError("api_key must be provided for the 'jina' backend.") + self.rerank_url = api_base or "https://api.jina.ai/v1/rerank" + self.session = requests.Session() + self.session.headers.update({ + "Content-Type": "application/json", + "Authorization": f"Bearer {self.api_key}", + }) + log.info(f"Using Jina reranker backend. Model: {self.model_name}, Endpoint: {self.rerank_url}") + + # ========================================================== + # OpenAI-compatible chat completions backend (e.g., Starcore) + # Uses logprobs on yes/no tokens for scoring. 
+ # ========================================================== + elif self.backend == "openai": + if not api_base: + raise ValueError("api_base must be provided for the 'openai' backend.") + self.chat_url = f"{api_base.rstrip('/')}/chat/completions" + self.session = requests.Session() + self.session.headers.update({ + "Content-Type": "application/json", + "Authorization": f"Bearer {self.api_key}" if self.api_key else "", + }) + log.info(f"Using OpenAI-compatible reranker backend. Model: {self.model_name}, Endpoint: {self.chat_url}") + else: raise ValueError( - f"Unsupported backend: {self.backend}. Choose 'local' or 'vllm'." + f"Unsupported backend: {self.backend}. Choose 'local', 'vllm', 'jina', or 'openai'." ) self._define_prompt_template() @@ -111,7 +143,7 @@ def clean_cache(self): torch.cuda.empty_cache() log.info("Cache cleaned.") else: - log.info(f"{self.backend} backend requires no local cache cleaning.") + log.debug(f"{self.backend} backend requires no local cache cleaning.") def close(self) -> None: """ @@ -132,10 +164,10 @@ def close(self) -> None: gc.collect() log.info("Local reranker resources released.") - elif self.backend == "vllm": + elif self.backend in ("vllm", "jina", "openai"): if hasattr(self, "session"): - self.session.close() # 关闭 requests session - log.info("vLLM backend session closed.") + self.session.close() + log.info(f"{self.backend} backend session closed.") log.info("TextRerankerProvider closed.") @@ -162,6 +194,61 @@ def _define_prompt_template(self): self.suffix, add_special_tokens=False ) + def _extract_score_from_logprobs(self, data: dict) -> float: + """ + Extract relevance score from OpenAI chat completion logprobs. + Looks for 'yes'/'no' token probabilities and returns P(yes). + Falls back to 0.5 if logprobs are unavailable. 
+ """ + try: + choices = data.get("choices", []) + if not choices: + return 0.5 + + logprobs_data = choices[0].get("logprobs") + if not logprobs_data: + # No logprobs; try to parse from content + content = choices[0].get("message", {}).get("content", "").lower().strip() + if "yes" in content: + return 0.9 + elif "no" in content: + return 0.1 + return 0.5 + + content_logprobs = logprobs_data.get("content", []) + if not content_logprobs: + return 0.5 + + # Search through top_logprobs for yes/no tokens + top_logprobs = content_logprobs[0].get("top_logprobs", []) + + yes_logprob = None + no_logprob = None + for entry in top_logprobs: + token = entry.get("token", "").lower().strip() + if token == "yes": + yes_logprob = entry["logprob"] + elif token == "no": + no_logprob = entry["logprob"] + + if yes_logprob is not None and no_logprob is not None: + # Compute P(yes) using softmax over yes/no logprobs + import math as _math + max_lp = max(yes_logprob, no_logprob) + yes_exp = _math.exp(yes_logprob - max_lp) + no_exp = _math.exp(no_logprob - max_lp) + return yes_exp / (yes_exp + no_exp) + elif yes_logprob is not None: + return _math.exp(yes_logprob) if yes_logprob > -5 else 0.5 + elif no_logprob is not None: + return 1.0 - (_math.exp(no_logprob) if no_logprob > -5 else 0.5) + + return 0.5 + + except Exception as e: + log.warning(f"Failed to extract score from logprobs: {e}") + return 0.5 + def _format_instruction( self, query: str, doc: str, instruction: Optional[str] ) -> str: @@ -333,6 +420,127 @@ def rerank( raise e + elif self.backend == "jina": + # Jina Reranker API: simple REST call, no prompt template needed. + # The instruction is prepended to the query for context. 
+ if instruction: + full_query = f"{instruction}\n\n{query}" + else: + full_query = query + + all_results = [] + num_docs = len(documents) + num_batches = math.ceil(num_docs / batch_size) + + try: + for i in tqdm( + range(0, num_docs, batch_size), + desc="Reranking Batches (Jina)", + total=num_batches, + disable=num_docs < batch_size, + ): + batch_docs = documents[i : i + batch_size] + payload = { + "model": self.model_name, + "query": full_query, + "documents": batch_docs, + "top_n": len(batch_docs), + } + + response = self.session.post(self.rerank_url, json=payload) + response.raise_for_status() + + data = response.json() + results = data.get("results") + + if results is None or not isinstance(results, list): + log.error( + f"Unexpected response from Jina reranker: {data}" + ) + raise ValueError("Failed to parse 'results' from Jina response.") + + # Map results back to global indices + for r in results: + r["global_index"] = i + r.get("index", 0) + all_results.extend(results) + + # Sort by global index to return scores in original document order + all_results.sort(key=lambda r: r.get("global_index", 0)) + + # Jina scores can be negative; normalize to [0, 1] using sigmoid + all_scores = [ + 1.0 / (1.0 + math.exp(-r["relevance_score"])) + for r in all_results + ] + + return all_scores + + except requests.exceptions.RequestException as e: + log.error(f"Error calling Jina reranker API: {e}") + raise e + + elif self.backend == "openai": + # OpenAI-compatible chat completions backend. + # Uses logprobs on yes/no tokens to compute relevance scores. 
+ if instruction is None: + instruction = "Given a web search query, retrieve relevant passages that answer the query" + + all_scores = [] + num_docs = len(documents) + num_batches = math.ceil(num_docs / batch_size) + + try: + for i in tqdm( + range(0, num_docs, batch_size), + desc="Reranking Batches (OpenAI)", + total=num_batches, + disable=num_docs < batch_size, + ): + batch_docs = documents[i : i + batch_size] + batch_scores = [] + + for doc in batch_docs: + # Format as Qwen3-Reranker expects + prompt = ( + f": {instruction}\n" + f": {query}\n" + f": {doc}" + ) + system_msg = ( + 'Judge whether the Document meets the requirements based on ' + 'the Query and the Instruct provided. Note that the answer ' + 'can only be "yes" or "no".' + ) + + payload = { + "model": self.model_name, + "messages": [ + {"role": "system", "content": system_msg}, + {"role": "user", "content": prompt}, + ], + "max_tokens": 1, + "logprobs": True, + "top_logprobs": 20, + "temperature": 0.0, + "chat_template_kwargs": {"enable_thinking": False}, + } + + response = self.session.post(self.chat_url, json=payload) + response.raise_for_status() + data = response.json() + + # Extract score from logprobs + score = self._extract_score_from_logprobs(data) + batch_scores.append(score) + + all_scores.extend(batch_scores) + + return all_scores + + except requests.exceptions.RequestException as e: + log.error(f"Error calling OpenAI reranker API: {e}") + raise e + elif self.backend == "local": # 1. 
创建所有的查询-文档对 pairs = [ diff --git a/Core/provider/vlm.py b/Core/provider/vlm.py index 8c003a6..8779740 100644 --- a/Core/provider/vlm.py +++ b/Core/provider/vlm.py @@ -257,7 +257,11 @@ def generate( ) -> str: content = self._prepare_messages(prompt_or_memory, images) completion = self.client.chat.completions.create( - model=self.model_name, messages=content, temperature=self.temperature + model=self.model_name, messages=content, temperature=self.temperature, + extra_body={ + "enable_thinking": False, # DashScope + "chat_template_kwargs": {"enable_thinking": False}, # vLLM + }, ) if completion.usage: @@ -299,6 +303,10 @@ def generate_json( model=self.model_name, messages=messages, response_format={"type": "json_object"}, # Use modern JSON mode + extra_body={ + "enable_thinking": False, # DashScope + "chat_template_kwargs": {"enable_thinking": False}, # vLLM + }, ) if completion.usage: diff --git a/Core/rag/gbc_answer.py b/Core/rag/gbc_answer.py index 2a88634..3101180 100644 --- a/Core/rag/gbc_answer.py +++ b/Core/rag/gbc_answer.py @@ -13,6 +13,8 @@ VLM_GENERATION_USER_PROMPT, SYNTHESIS_SYS_PROMPT, SYNTHESIS_USER_PROMPT, + get_iter_generation_sys_prompt, + get_synthesis_sys_prompt, ) from Core.utils.utils import num_tokens, TextProcessor from Core.utils.table_utils import table2text @@ -23,9 +25,10 @@ class AnswerAgent: - def __init__(self, llm: LLM, vlm: VLM): + def __init__(self, llm: LLM, vlm: VLM, lang: str = "en"): self.llm = llm self.vlm = vlm + self.lang = lang or "en" def _prepare_evidence( self, retrieved_nodes: List[Dict] @@ -59,6 +62,8 @@ def _build_prompts( """ Builds chunked and formatted prompts for both LLM and VLM. """ + sys_prompt = get_iter_generation_sys_prompt(self.lang) + # 1. 
Build VLM prompts for image-based evidence image_prompts = [] for node in image_nodes: @@ -69,7 +74,7 @@ def _build_prompts( node_content = node.get("content", "") content = f"An image in Page: {page}, Caption: {node_content}" vlm_prompt = ( - f"{ITER_GENERATION_SYS_PROMPT.strip()}\n\n" + f"{sys_prompt.strip()}\n\n" f"{VLM_GENERATION_USER_PROMPT.format(question=query, content=content).strip()}" ) if img_path: @@ -92,7 +97,7 @@ def _build_prompts( ) + graph_prompt_part ) - system_prompt_tokens = num_tokens(ITER_GENERATION_SYS_PROMPT) + system_prompt_tokens = num_tokens(sys_prompt) content_limit = ( self.llm.config.max_tokens - system_prompt_tokens - base_prompt_tokens - 400 ) # 400 as buffer @@ -148,7 +153,7 @@ def _build_prompts( ) gen_memory = Memory() gen_memory.add( - Message(role="system", content=ITER_GENERATION_SYS_PROMPT) + Message(role="system", content=sys_prompt) ) gen_memory.add(Message(role="user", content=user_prompt)) text_prompts.append(gen_memory) @@ -173,7 +178,7 @@ def _build_prompts( + graph_prompt_part ) gen_memory = Memory() - gen_memory.add(Message(role="system", content=ITER_GENERATION_SYS_PROMPT)) + gen_memory.add(Message(role="system", content=sys_prompt)) gen_memory.add(Message(role="user", content=user_prompt)) text_prompts.append(gen_memory) @@ -237,11 +242,12 @@ def _synthesize_from_chunks( ) log.info("Synthesizing the final answer from partial results...") + synth_sys = get_synthesis_sys_prompt(self.lang) synthesis_user_prompt = SYNTHESIS_USER_PROMPT.format( user_question=query, partial_answers_str=partial_answers_str ) synthesis_memory = Memory() - synthesis_memory.add(Message(role="system", content=SYNTHESIS_SYS_PROMPT)) + synthesis_memory.add(Message(role="system", content=synth_sys)) synthesis_memory.add(Message(role="user", content=synthesis_user_prompt)) try: @@ -493,11 +499,12 @@ def answer_global_question( ) log.info("Synthesizing the final answer from partial results...") + synth_sys = get_synthesis_sys_prompt(self.lang) 
synthesis_user_prompt = SYNTHESIS_USER_PROMPT.format( user_question=original_query, partial_answers_str=partial_answers_str ) synthesis_memory = Memory() - synthesis_memory.add(Message(role="system", content=SYNTHESIS_SYS_PROMPT)) + synthesis_memory.add(Message(role="system", content=synth_sys)) synthesis_memory.add(Message(role="user", content=synthesis_user_prompt)) try: diff --git a/Core/rag/gbc_rag.py b/Core/rag/gbc_rag.py index 30ca656..395ad8b 100644 --- a/Core/rag/gbc_rag.py +++ b/Core/rag/gbc_rag.py @@ -1,14 +1,11 @@ from collections import defaultdict -from typing import Any, List, Tuple, Dict, Optional - -from regex import F +from typing import Any, List, Dict, Optional from Core.Index.Tree import TreeNode, NodeType from Core.rag.base_rag import BaseRAG from Core.provider.llm import LLM from Core.provider.vlm import VLM from Core.provider.rerank import TextRerankerProvider -from Core.provider.embedding import MMRerankerProvider from Core.configs.rag.gbc_config import GBCRAGConfig from Core.Index.GBCIndex import GBC from Core.prompts.gbc_prompt import ( @@ -29,6 +26,11 @@ SubStep, filter_tree_nodes, ) +from Core.utils.ontology_utils import ( + find_best_graph_ontology_node, + normalize_entity_name, + normalize_entity_type, +) import json @@ -52,6 +54,7 @@ def __init__( vlm: VLM, config: GBCRAGConfig, gbc_index: GBC, + lang: str = "en", ): super().__init__( llm, @@ -71,19 +74,21 @@ def __init__( device=self.cfg.reranker_config.device, backend=self.cfg.reranker_config.backend, api_base=self.cfg.reranker_config.api_base, + api_key=self.cfg.reranker_config.api_key, ) # GBC RAG config self.threshold_e = self.cfg.sim_threshold_e self.select_depth = self.cfg.select_depth self.max_retry = self.cfg.max_retry + self.lang = lang or "en" + # Agents self.planner = TaskPlanner(llm=self.llm) - self.answer = AnswerAgent(llm=self.llm, vlm=self.vlm) + self.answer = AnswerAgent(llm=self.llm, vlm=self.vlm, lang=self.lang) self.retriever = Retriever( varient=self.varient, 
reranker=self.reranker, - # mm_reranker=self.mm_reranker, embedder=self.embedder, alpha=self.cfg.alpha, topk_ent=self.cfg.topk_ent, @@ -95,37 +100,42 @@ def _get_entity_embed_text(self, entity: QuestionEntity) -> str: return f"Name: {entity.entity_name}\nType: {entity.entity_type}" def _entity_map( - self, entities: List[str], force_one: bool = False + self, entities: List[QuestionEntity], force_one: bool = False ) -> Dict[str, List[str]]: """ Maps entities to their corresponding IDs in the GBC index. Use vdb to find the entity in GBC index. """ entities_str = [self._get_entity_embed_text(entity) for entity in entities] - Qent_GBCent_map = defaultdict(list) + query_to_gbc_entity_map = defaultdict(list) res_list = [] for ent_str in entities_str: query_res = self.gbc_index.entity_vdb.search(query_text=ent_str, top_k=2) + if not query_res: + continue min_distance = query_res[0]["distance"] if query_res else float("inf") - retrieve_name = query_res[0]["metadata"].get("entity_name") - retrieve_type = query_res[0]["metadata"].get("entity_type") + metadata = query_res[0].get("metadata") or {} + retrieve_name = metadata.get("entity_name") + retrieve_type = metadata.get("entity_type") + if not retrieve_name: + continue node_name = self.gbc_index.GraphIndex.get_node_name_from_str( retrieve_name, retrieve_type ) if min_distance < self.threshold_e: - Qent_GBCent_map[ent_str].append(node_name) + query_to_gbc_entity_map[ent_str].append(node_name) log.info(f"Entity '{ent_str}' mapped to GBC entity: {node_name}") else: res_list.append((ent_str, node_name, min_distance)) - if force_one and len(Qent_GBCent_map) == 0 and len(res_list) > 0: + if force_one and len(query_to_gbc_entity_map) == 0 and len(res_list) > 0: # force map the closest entity if no entity is mapped res_list = sorted(res_list, key=lambda x: x[2]) ent_str, node_name, min_distance = res_list[0] - Qent_GBCent_map[ent_str].append(node_name) + query_to_gbc_entity_map[ent_str].append(node_name) log.info(f"Force map entity 
'{ent_str}' to GBC entity: {node_name}") - return Qent_GBCent_map + return query_to_gbc_entity_map def _get_query_entity(self, query: str) -> Dict[str, List[str]]: """ @@ -137,8 +147,11 @@ def _get_query_entity(self, query: str) -> Dict[str, List[str]]: retrieval_node_names = set() retrieval_nodes = [] for ent_info in retrieval_ents: - ent_name = ent_info["metadata"].get("entity_name") - ent_type = ent_info["metadata"].get("entity_type") + metadata = ent_info.get("metadata") or {} + ent_name = metadata.get("entity_name") + ent_type = metadata.get("entity_type") + if not ent_name: + continue node_dict = { "entity_name": ent_name, "entity_type": ent_type, @@ -176,28 +189,45 @@ def _get_query_entity(self, query: str) -> Dict[str, List[str]]: log.info("Use the question as the entity.") res_entities = [Entity(entity_name=query, entity_type="Question")] - Qent_GBCent_map = defaultdict(list) + query_to_gbc_entity_map = defaultdict(list) remain_ents = [] for res_ent in res_entities: - res_ent.entity_name = res_ent.entity_name.lower() - res_ent.entity_type = res_ent.entity_type.upper() - res_ent.entity_type = res_ent.entity_type.replace(" ", "_") + res_ent.entity_name = normalize_entity_name(res_ent.entity_name) + res_ent.entity_type = normalize_entity_type(res_ent.entity_type) + ent_str = self._get_entity_embed_text(res_ent) + + ontology_cfg = getattr(self.gbc_index.config, "ontology", None) + ontology_node_name = None + if ontology_cfg and ontology_cfg.use_query_resolution: + ontology_node_name = find_best_graph_ontology_node( + graph=self.gbc_index.GraphIndex, + entity_name=res_ent.entity_name, + entity_type=res_ent.entity_type, + threshold=ontology_cfg.mapping_threshold, + ) + if ontology_node_name: + query_to_gbc_entity_map[ent_str].append(ontology_node_name) + log.info( + f"Entity '{ent_str}' mapped to ontology-backed GBC entity: {ontology_node_name}" + ) + continue + ent_node_name = self.gbc_index.GraphIndex.get_node_name_from_entity(res_ent) if ent_node_name in 
retrieval_node_names: - Qent_GBCent_map[ent_node_name].append(ent_node_name) + query_to_gbc_entity_map[ent_str].append(ent_node_name) log.info( f"Entity '{ent_node_name}' mapped to GBC entity: {ent_node_name}" ) else: remain_ents.append(res_ent) - should_force_one = (len(Qent_GBCent_map) == 0) + should_force_one = len(query_to_gbc_entity_map) == 0 if remain_ents: remain_map = self._entity_map(remain_ents, force_one=should_force_one) for k, v in remain_map.items(): - Qent_GBCent_map[k].extend(v) + query_to_gbc_entity_map[k].extend(v) - return Qent_GBCent_map + return query_to_gbc_entity_map def link_tree_node(self, entities_map: Dict[str, List[str]]) -> List[dict]: """ @@ -214,7 +244,7 @@ def link_tree_node(self, entities_map: Dict[str, List[str]]) -> List[dict]: return [] for node_name in all_map_nodenames: - tree_node_set = self.gbc_index.GraphIndex.NodeName2TreeNodes(node_name) + tree_node_set = self.gbc_index.GraphIndex.node_name_to_tree_nodes(node_name) for node_id in tree_node_set: tree_node_cnt[node_id].append(node_name) @@ -266,7 +296,7 @@ def prep_SecSel_prompt( query, link_nodes: List[TreeNode] = None, remain_nodes: List[TreeNode] = None, - sec_entity_map: Dict[int, List[str]] = None, + sec_entity_map: Optional[Dict[int, List[str]]] = None, ) -> str: """ Prepare the prompt for section selection. 
@@ -274,7 +304,7 @@ def prep_SecSel_prompt( """ def prep_nodes_json( - nodes: List[TreeNode], sec_entity_map: Dict[int, List[str]] = None + nodes: List[TreeNode], sec_entity_map: Optional[Dict[int, List[str]]] = None ) -> str: node_infos = [] for node in nodes: @@ -429,8 +459,8 @@ def get_GBC_info(self, iter_context: SubStep) -> None: log.info(f"After skyline filtering, select {len(tree_node_ids)} TreeNodes") - Graph_data = self.gbc_index.GraphIndex.get_subgraph_data(res_entities) - iter_context.iteration_graph_nodes = Graph_data.get("nodes", []) + graph_data = self.gbc_index.GraphIndex.get_subgraph_data(res_entities) + iter_context.iteration_graph_nodes = graph_data.get("nodes", []) tree_data = self.gbc_index.TreeIndex.get_nodes_data(tree_node_ids) self._process_retrieved_nodes(tree_data, iter_context) @@ -450,10 +480,10 @@ def _retrieve( iter_context: IterationStep, Iteration context for the current step. """ - Qent_GBCent_map = self._get_query_entity(query) - iter_context.gbc_entity_map = Qent_GBCent_map + query_to_gbc_entity_map = self._get_query_entity(query) + iter_context.gbc_entity_map = query_to_gbc_entity_map - tree_nodes = self.link_tree_node(Qent_GBCent_map) + tree_nodes = self.link_tree_node(query_to_gbc_entity_map) iter_context.linked_tree_nodes = tree_nodes # 3. Use LLM to select the most relevant section or supplementary sections @@ -566,7 +596,8 @@ def process_analysis(self, context: GBCRAGContext, query_analysis: PlanResult): context.final_answer = "I'm sorry, I cannot process this query." 
def _create_augmented_prompt(self, query: str) -> str: - pass + """Current GBC flow builds prompts via answer agents, so return the raw query.""" + return query def generation(self, query: str, query_output_dir: str): context = GBCRAGContext(query=query) @@ -621,6 +652,4 @@ def _save_retrieval_res(self, context: GBCRAGContext, query_output_dir: str): def close(self): self.embedder.close() self.reranker.close() - # if hasattr(self, 'mm_reranker'): - # self.mm_reranker.close() return super().close() diff --git a/Core/utils/entity_resolution_utils.py b/Core/utils/entity_resolution_utils.py new file mode 100644 index 0000000..d61aa02 --- /dev/null +++ b/Core/utils/entity_resolution_utils.py @@ -0,0 +1,18 @@ +from Core.Index.Graph import Entity +from Core.configs.entity_resolution_config import EntityResolutionConfig + + +def should_resolve_entity_globally( + entity: Entity, resolution_cfg: EntityResolutionConfig +) -> bool: + if not resolution_cfg.canonical_only: + return True + return bool(entity.canonical_id or entity.entity_role == "canonical") + + +def build_global_entity_metadata(entity: Entity, tenant_id: str, doc_id: str) -> dict: + metadata = entity.to_vdb_metadata() + metadata["tenant_id"] = tenant_id or "" + metadata["doc_id"] = doc_id or "" + metadata["canonical_name"] = entity.entity_name + return metadata \ No newline at end of file diff --git a/Core/utils/ontology_utils.py b/Core/utils/ontology_utils.py new file mode 100644 index 0000000..7ea71a9 --- /dev/null +++ b/Core/utils/ontology_utils.py @@ -0,0 +1,158 @@ +from difflib import SequenceMatcher +from typing import TYPE_CHECKING, Iterable, Optional, Tuple + +if TYPE_CHECKING: + from Core.Index.Graph import Graph + + +def normalize_entity_name(value: str) -> str: + return " ".join(str(value or "").strip().split()).lower() + + +def normalize_entity_type(value: str) -> str: + return str(value or "").strip().upper().replace(" ", "_") + + +def dedupe_terms(values: Iterable[str]) -> list[str]: + deduped: 
list[str] = [] + for value in values: + normalized = normalize_entity_name(value) + if normalized and normalized not in deduped: + deduped.append(normalized) + return deduped + + +def types_compatible(left: str, right: str) -> bool: + left_norm = normalize_entity_type(left) + right_norm = normalize_entity_type(right) + if not left_norm or not right_norm: + return True + if left_norm in {"QUESTION", "UNKNOWN"}: + return True + return left_norm == right_norm + + +def entity_name_similarity(left: str, right: str) -> float: + left_norm = normalize_entity_name(left) + right_norm = normalize_entity_name(right) + if not left_norm or not right_norm: + return 0.0 + if left_norm == right_norm: + return 1.0 + return SequenceMatcher(None, left_norm, right_norm).ratio() + + +def find_best_ontology_match( + entity, ontology_cfg +) -> Tuple[Optional[object], float]: + if not getattr(ontology_cfg, "enabled", False): + return None, 0.0 + + best_match = None + best_score = 0.0 + for candidate in getattr(ontology_cfg, "entities", []): + if getattr(candidate, "status", "active") == "deprecated": + continue + if not types_compatible(entity.entity_type, candidate.entity_type): + continue + candidate_terms = dedupe_terms( + [candidate.entity_name, *candidate.aliases, *getattr(candidate, "keywords", [])] + ) + score = max(entity_name_similarity(entity.entity_name, alias) for alias in candidate_terms) + if score > best_score: + best_match = candidate + best_score = score + + threshold = getattr(ontology_cfg, "mapping_threshold", 1.0) + if best_match is None or best_score < threshold: + return None, best_score + return best_match, best_score + + +def align_entities_to_ontology(entities, relationships, ontology_cfg): + if not getattr(ontology_cfg, "enabled", False) or not getattr(ontology_cfg, "entities", None): + return entities, relationships + + aligned_entities = [] + original_name_map: dict[str, str] = {} + allow_provisional_entities = getattr(ontology_cfg, "allow_provisional_entities", 
True) + for entity in entities: + original_name = entity.entity_name + matched_entity, confidence = find_best_ontology_match(entity, ontology_cfg) + if matched_entity is not None: + description = matched_entity.description or entity.description + if matched_entity.description and entity.description: + entity_desc = entity.description.strip() + onto_desc = matched_entity.description.strip() + if entity_desc and entity_desc not in onto_desc: + description = f"{onto_desc}\n\nMention detail: {entity_desc}" + aligned = entity.model_copy( + update={ + "entity_name": matched_entity.entity_name, + "entity_type": matched_entity.entity_type, + "description": description, + "entity_id": matched_entity.entity_id, + "canonical_id": matched_entity.entity_id, + "entity_role": "canonical", + "aliases": dedupe_terms( + [entity.entity_name, matched_entity.entity_name, *matched_entity.aliases] + ), + "mapping_confidence": confidence, + "ontology_source": matched_entity.ontology_source, + } + ) + elif allow_provisional_entities: + aligned = entity.model_copy( + update={ + "entity_name": normalize_entity_name(entity.entity_name), + "entity_type": normalize_entity_type(entity.entity_type), + "entity_role": entity.entity_role or "provisional", + "aliases": dedupe_terms([entity.entity_name, *entity.aliases]), + "mapping_confidence": entity.mapping_confidence, + } + ) + else: + continue + original_name_map[original_name] = aligned.entity_name + aligned_entities.append(aligned) + + aligned_relationships = [] + for relationship in relationships: + if ( + relationship.src_entity_name not in original_name_map + or relationship.tgt_entity_name not in original_name_map + ): + continue + aligned_relationship = relationship.model_copy(deep=True) + aligned_relationship.src_entity_name = original_name_map.get( + relationship.src_entity_name, relationship.src_entity_name + ) + aligned_relationship.tgt_entity_name = original_name_map.get( + relationship.tgt_entity_name, relationship.tgt_entity_name + 
) + aligned_relationships.append(aligned_relationship) + + return aligned_entities, aligned_relationships + + +def find_best_graph_ontology_node( + graph: "Graph", entity_name: str, entity_type: str, threshold: float = 1.0 +) -> Optional[str]: + best_node_name = None + best_score = 0.0 + for node_name in graph.get_all_nodes(): + entity = graph.get_entity_by_node_name(node_name) + if entity.entity_role != "canonical" and not entity.canonical_id: + continue + if not types_compatible(entity_type, entity.entity_type): + continue + score = max( + entity_name_similarity(entity_name, alias) + for alias in dedupe_terms([entity.entity_name, *entity.aliases]) + ) + if score > best_score: + best_score = score + best_node_name = node_name + if best_score < threshold: + return None + return best_node_name \ No newline at end of file diff --git a/MCP.md b/MCP.md new file mode 100644 index 0000000..b71a380 --- /dev/null +++ b/MCP.md @@ -0,0 +1,668 @@ +# BookRAG MCP Server — Implementation Guide + +> **Model Context Protocol (MCP)** is an open standard by Anthropic that lets AI assistants (Claude Desktop, Cursor, Windsurf, etc.) connect directly to external tools and data sources through a unified interface. This document describes how to expose the BookRAG API as an MCP server so that AI agents can query books, inspect knowledge-graph entities, and manage documents — without any HTTP REST calls. + +--- + +## Table of Contents + +1. [What is MCP?](#1-what-is-mcp) +2. [Why BookRAG Maps Perfectly to MCP](#2-why-bookrag-maps-perfectly-to-mcp) +3. [Architecture](#3-architecture) +4. [MCP Primitives Mapping](#4-mcp-primitives-mapping) +5. [Multi-Tenancy Strategy](#5-multi-tenancy-strategy) +6. [Installation](#6-installation) +7. [File Structure](#7-file-structure) +8. [Implementation: `mcp_server.py`](#8-implementation-mcp_serverpy) +9. [Mounting to the Existing FastAPI App](#9-mounting-to-the-existing-fastapi-app) +10. 
[Claude Desktop & Cursor Configuration](#10-claude-desktop--cursor-configuration) +11. [Long-Running Operations (Indexing)](#11-long-running-operations-indexing) +12. [Testing](#12-testing) +13. [Transport Options](#13-transport-options) + +--- + +## 1. What is MCP? + +MCP defines three primitive types a server can expose: + +| Primitive | Who controls it | Description | BookRAG example | +|---|---|---|---| +| **Resource** | Application | Read-only contextual data, URI-addressable | Entity list, document status | +| **Tool** | Model (LLM) | Callable functions that take actions | Query a book, rename an entity | +| **Prompt** | User | Reusable prompt templates | "Ask about book", "Find duplicates" | + +MCP is **not** a replacement for REST. It is a parallel interface optimised for AI-agent consumption — same underlying service layer, different transport. + +--- + +## 2. Why BookRAG Maps Perfectly to MCP + +BookRAG's three-layer architecture is already MCP-ready: + +``` +Transport (HTTP REST) → api/routers/ ← keep as-is for users/web UI +Business logic → api/services/ ← shared, zero changes needed +Data stores → FalkorDB, MongoDB, ChromaDB +``` + +Adding MCP means writing a **thin new `mcp_server.py`** that calls the same `api/services/` functions — identical to how the FastAPI routers call them today. + +Key reasons conversion is straightforward: +- All business logic is already **async Python** (`asyncio`) +- Services accept plain Python args — no HTTP concepts leak into them +- Per-document locking, thread pools, and FalkorDB persistence are all inside `api/services/` +- No changes needed to `Core/` indexing pipeline + +--- + +## 3. 
Architecture + +``` +┌─────────────────────────────────────────────────────┐ +│ AI Agents / Claude Desktop │ +│ (Claude, Cursor, Windsurf, custom) │ +└────────────────────┬────────────────────────────────┘ + │ MCP (stdio or streamable-http) + ▼ + ┌─────────────────┐ + │ mcp_server.py │ ← NEW (thin adapter) + └────────┬────────┘ + │ + ┌────────────▼────────────┐ + │ api/services/ │ ← SHARED (unchanged) + │ entity_editor.py │ + │ chat.py │ + │ indexing.py │ + │ entity_resolution.py │ + └──┬──────────┬──────────┘ + │ │ + ┌──────▼──┐ ┌────▼──────┐ + │FalkorDB │ │ MongoDB │ + │ChromaDB │ │ uploads/ │ + └─────────┘ └───────────┘ + +┌─────────────────────────────────────────────────────┐ +│ Web / Mobile Users │ +└─────────────────┬───────────────────────────────────┘ + │ HTTP REST (JSON) + ▼ + ┌─────────────────┐ + │ api/routers/ │ ← EXISTING FastAPI (unchanged) + └─────────────────┘ +``` + +Both interfaces use **the same service layer** — changes made through MCP are instantly visible through REST and vice versa. + +--- + +## 4. MCP Primitives Mapping + +### Resources (read-only data) + +| URI pattern | Description | Backed by | +|---|---|---| +| `bookrag://documents/{tenant_id}` | List all documents for the tenant | `db.list_documents()` | +| `bookrag://documents/{tenant_id}/{doc_id}` | Single document status | `db.get_document()` | +| `bookrag://entities/{tenant_id}/{doc_id}` | All NER entities for a document | `entity_editor.list_entities()` | + +Resources are **application-controlled**: the AI client decides when to read them as context, without the model explicitly calling a tool. 
+ +### Tools (callable by the model) + +| Tool name | Maps to | Description | +|---|---|---| +| `query_documents` | `chat.handle_query()` | Ask a question against one or more indexed books | +| `rename_entity` | `entity_editor.rename_entity()` | Rename an entity node in the knowledge graph | +| `merge_entities` | `entity_editor.merge_entities()` | Merge ≥ 2 entity nodes into a canonical node | +| `split_entity` | `entity_editor.split_entity()` | Split 1 entity into ≥ 2 new nodes | +| `suggest_merge_candidates` | `entity_editor.suggest_merges()` | Find likely duplicate entities | +| `get_document_status` | `db.get_document()` | Check indexing status of a document | +| `index_document` | `indexing.run_indexing()` | Trigger indexing for an uploaded PDF | + +### Prompts (user-invoked templates) + +| Prompt name | Description | +|---|---| +| `ask_about_book` | Template: "Given document `{doc_id}`, answer: `{question}`" | +| `find_entity_duplicates` | Template: "Review these merge suggestions and decide which to apply" | +| `summarise_entities` | Template: "List the most important entities in `{doc_id}` and explain their roles" | + +--- + +## 5. Multi-Tenancy Strategy + +MCP has **no built-in JWT authentication**. Every service call needs a `tenant_id` and `user_id`. Three options: + +### Option A — Environment-Variable Injection (Recommended) + +Each tenant runs their own MCP server process. The `tenant_id` and `user_id` are injected via environment variables at launch time. + +``` +BOOKRAG_TENANT_ID=acme BOOKRAG_USER_ID=alice python mcp_server.py +``` + +Inside `mcp_server.py`: + +```python +import os +TENANT_ID = os.environ["BOOKRAG_TENANT_ID"] # required — fail fast if missing +USER_ID = os.environ["BOOKRAG_USER_ID"] +``` + +**Pros**: Simple, no auth complexity, works with Claude Desktop `env` block. +**Cons**: One process per tenant — fine for small deployments. 
+ +### Option B — Tool-Argument Injection + +`tenant_id` and `user_id` are required arguments on every tool. The AI model must supply them. + +**Pros**: Single process for all tenants. +**Cons**: Verbose; model must always pass credentials; no real security boundary. + +### Option C — OAuth 2.0 (MCP 1.1+) + +MCP's newer spec supports OAuth 2.0 flows. Suitable for a SaaS product where the MCP server is hosted remotely and multiple organisations connect to it. + +**Pros**: Proper per-user auth, scalable. +**Cons**: Requires implementing an OAuth server; significant complexity. + +> **Recommendation for BookRAG**: Start with **Option A** (env-var injection). It matches exactly how Claude Desktop is configured and requires minimal code. + +--- + +## 6. Installation + +Add the MCP SDK to the project's virtual environment: + +```bash +# Using pip (existing .venv) +pip install "mcp[cli]" + +# Or with uv +uv add "mcp[cli]" +``` + +The `[cli]` extra installs the `mcp` command-line tool needed for the development inspector. + +--- + +## 7. File Structure + +Only **one new file** is needed at the repo root: + +``` +BookRAG/ +├── mcp_server.py ← NEW — MCP adapter (thin layer over api/services/) +├── api/ +│ ├── main.py ← existing FastAPI app (unchanged) +│ ├── services/ ← shared business logic (unchanged) +│ ├── routers/ ← existing REST endpoints (unchanged) +│ └── ... +├── Core/ ← GBC indexing pipeline (unchanged) +└── config/ + └── gbc.yaml ← existing config (unchanged) +``` + +Alternatively, for a remote/production deployment where MCP is mounted directly onto the FastAPI app, no new file is needed — see [Section 9](#9-mounting-to-the-existing-fastapi-app). + +--- + +## 8. Implementation: `mcp_server.py` + +Below is the complete implementation skeleton. It uses the **FastMCP** high-level API (`from mcp.server.fastmcp import FastMCP`) which is analogous to FastAPI's `APIRouter`. + +```python +"""BookRAG MCP Server. 
+ +Usage (local / Claude Desktop): + BOOKRAG_TENANT_ID=acme BOOKRAG_USER_ID=alice python mcp_server.py + +Usage (dev inspector): + BOOKRAG_TENANT_ID=acme BOOKRAG_USER_ID=alice mcp dev mcp_server.py +""" +import json +import os + +from mcp.server.fastmcp import FastMCP, Context + +# ── Tenant identity (injected via environment) ───────────────────────────── +TENANT_ID = os.environ.get("BOOKRAG_TENANT_ID", "default") +USER_ID = os.environ.get("BOOKRAG_USER_ID", "agent") +CONFIG_PATH = os.environ.get("BOOKRAG_CONFIG_PATH", "config/gbc.yaml") + +# ── Service imports (lazy, same as routers do) ────────────────────────────── +import api.services.entity_editor as entity_svc +import api.services.chat as chat_svc +import api.services.indexing as index_svc +import api.db.mongodb as db + +from api.dependencies import MONGO_URI, MONGO_DB_PREFIX + +mcp = FastMCP("bookrag", instructions=( + "BookRAG gives you access to a hierarchical RAG knowledge base built from PDF books. " + "Use query_documents to ask questions. Use entity tools to inspect and curate the " + "knowledge graph extracted from each book." 
+)) + + +# ════════════════════════════════════════════════════════════════════════════ +# RESOURCES (read-only data — application-controlled) +# ════════════════════════════════════════════════════════════════════════════ + +@mcp.resource("bookrag://documents/{tenant_id}") +async def list_documents_resource(tenant_id: str) -> str: + """Return the list of all documents for the given tenant as JSON.""" + docs = await db.list_documents(MONGO_URI, MONGO_DB_PREFIX, tenant_id, user_id=None) + return json.dumps(docs, default=str) + + +@mcp.resource("bookrag://documents/{tenant_id}/{doc_id}") +async def get_document_resource(tenant_id: str, doc_id: str) -> str: + """Return indexing status and metadata for a single document as JSON.""" + doc = await db.get_document(MONGO_URI, MONGO_DB_PREFIX, tenant_id, doc_id) + return json.dumps(doc, default=str) if doc else json.dumps({"error": "not found"}) + + +@mcp.resource("bookrag://entities/{tenant_id}/{doc_id}") +async def get_entities_resource(tenant_id: str, doc_id: str) -> str: + """Return all NER entities for a document as a JSON array.""" + entities = await entity_svc.list_entities(tenant_id, doc_id, CONFIG_PATH) + return json.dumps(entities, default=str) + + +# ════════════════════════════════════════════════════════════════════════════ +# TOOLS (callable by the model) +# ════════════════════════════════════════════════════════════════════════════ + +@mcp.tool() +async def query_documents( + question: str, + doc_ids: list[str], + session_id: str = "", + cross_doc: bool = False, +) -> str: + """Query one or more indexed books with a natural-language question. + + Args: + question: The question to ask. + doc_ids: List of document IDs to query (must be 'ready' status). + session_id: Optional session ID to continue a conversation thread. + cross_doc: If True, query all docs in parallel and merge answers. 
+ """ + result = await chat_svc.handle_query( + query=question, + tenant_id=TENANT_ID, + user_id=USER_ID, + doc_ids=doc_ids, + session_id=session_id or None, + config_path=CONFIG_PATH, + cross_doc=cross_doc, + ) + return result.get("answer", str(result)) + + +@mcp.tool() +async def get_document_status(doc_id: str) -> str: + """Check the indexing status of a document (pending/indexing/ready/error).""" + doc = await db.get_document(MONGO_URI, MONGO_DB_PREFIX, TENANT_ID, doc_id) + if not doc: + return f"Document '{doc_id}' not found." + return json.dumps({ + "doc_id": doc["doc_id"], + "filename": doc.get("filename", ""), + "status": doc.get("status", "unknown"), + "error": doc.get("error"), + }) + + +@mcp.tool() +async def list_entities(doc_id: str) -> str: + """Return all NER entities extracted from the knowledge graph of a document.""" + entities = await entity_svc.list_entities(TENANT_ID, doc_id, CONFIG_PATH) + return json.dumps(entities, default=str) + + +@mcp.tool() +async def rename_entity( + doc_id: str, + entity_name: str, + entity_type: str, + new_entity_name: str, + new_entity_type: str = "", + new_description: str = "", +) -> str: + """Rename an entity node in the knowledge graph. + + Args: + doc_id: Document the entity belongs to. + entity_name: Current entity name (exact match). + entity_type: Current entity type (e.g. PERSON, ORG). + new_entity_name: New name for the entity. + new_entity_type: New type (leave blank to keep current). + new_description: New description (leave blank to keep current). 
+ """ + updated = await entity_svc.rename_entity( + tenant_id=TENANT_ID, doc_id=doc_id, config_path=CONFIG_PATH, + entity_name=entity_name, entity_type=entity_type, + new_entity_name=new_entity_name, + new_entity_type=new_entity_type or entity_type, + new_description=new_description or None, + user_id=USER_ID, + ) + return json.dumps(updated, default=str) + + +@mcp.tool() +async def merge_entities( + doc_id: str, + source_entities: list[dict], + canonical_name: str, + canonical_type: str, + canonical_description: str = "", +) -> str: + """Merge two or more entity nodes into a single canonical entity. + + Args: + doc_id: Document containing the entities. + source_entities: List of {"entity_name": ..., "entity_type": ...} dicts. + canonical_name: Name of the resulting merged entity. + canonical_type: Type of the resulting merged entity. + canonical_description: Optional description for the canonical entity. + """ + updated = await entity_svc.merge_entities( + tenant_id=TENANT_ID, doc_id=doc_id, config_path=CONFIG_PATH, + source_entities=source_entities, + canonical_name=canonical_name, + canonical_type=canonical_type, + canonical_desc=canonical_description, + user_id=USER_ID, + ) + return json.dumps(updated, default=str) + + +@mcp.tool() +async def suggest_merge_candidates( + doc_id: str, + min_score: float = 0.80, + top_k: int = 20, + use_embeddings: bool = False, +) -> str: + """Return a ranked list of entity pairs that may be duplicates. + + Args: + doc_id: Document to analyse. + min_score: Minimum similarity score (0.0 – 1.0). Default 0.80. + top_k: Maximum number of suggestions to return. + use_embeddings: If True, also run embedding-based similarity (slower). 
+ """ + suggestions = await entity_svc.suggest_merges( + tenant_id=TENANT_ID, doc_id=doc_id, config_path=CONFIG_PATH, + min_score=min_score, top_k=top_k, use_embeddings=use_embeddings, + ) + return json.dumps(suggestions, default=str) + + +@mcp.tool() +async def index_document( + doc_id: str, + pdf_path: str, + ctx: Context, +) -> str: + """Trigger GBC index build for an already-uploaded PDF. + + This is a long-running operation. Progress is reported via MCP notifications. + + Args: + doc_id: The document ID (must already exist in MongoDB). + pdf_path: Absolute path to the PDF file on the server. + """ + await ctx.report_progress(0, 100, "Starting indexing...") + try: + await index_svc.run_indexing(TENANT_ID, doc_id, pdf_path, CONFIG_PATH) + await ctx.report_progress(100, 100, "Indexing complete.") + return f"Document '{doc_id}' indexed successfully." + except Exception as exc: + return f"Indexing failed: {exc}" + + +# ════════════════════════════════════════════════════════════════════════════ +# PROMPTS (user-invoked templates) +# ════════════════════════════════════════════════════════════════════════════ + +@mcp.prompt() +def ask_about_book(doc_id: str, question: str) -> str: + """Generate a prompt to ask a question about a specific indexed book.""" + return ( + f"You have access to the BookRAG knowledge base for document '{doc_id}'.\n\n" + f"Please use the `query_documents` tool with doc_ids=['{doc_id}'] to answer:\n\n" + f"{question}" + ) + + +@mcp.prompt() +def find_entity_duplicates(doc_id: str) -> str: + """Generate a prompt to review and resolve duplicate entities.""" + return ( + f"You are reviewing the knowledge graph for document '{doc_id}'.\n\n" + f"1. Call `suggest_merge_candidates(doc_id='{doc_id}', min_score=0.80)` to get candidates.\n" + f"2. Review each pair. For genuine duplicates, call `merge_entities`.\n" + f"3. Report a summary of what was merged and why." 
+ ) + + +# ════════════════════════════════════════════════════════════════════════════ +# ENTRY POINT +# ════════════════════════════════════════════════════════════════════════════ + +if __name__ == "__main__": + # Default: stdio transport for Claude Desktop / local use + mcp.run(transport="stdio") +``` + +--- + +## 9. Mounting to the Existing FastAPI App + +For **remote / production** deployments where you want one process serving both REST and MCP, mount the MCP server directly inside `api/main.py`: + +```python +# api/main.py (addition only — all existing code unchanged) +from mcp.server.fastmcp import FastMCP + +# Import the mcp instance from your server module +from mcp_server import mcp # the FastMCP instance defined above + +# Mount under /mcp — accessible at http://host:8000/mcp +app.mount("/mcp", mcp.streamable_http_app()) +``` + +The MCP endpoint is then reachable at `http://your-server:8000/mcp` using the **streamable-http** transport. AI clients connect to this URL rather than launching a subprocess. + +> **Note**: When mounted to FastAPI, the `mcp_server.py` `if __name__ == "__main__"` block is never executed. Uvicorn/Gunicorn drives everything. + +--- + +## 10. Claude Desktop & Cursor Configuration + +### Claude Desktop + +Edit `~/Library/Application Support/Claude/claude_desktop_config.json` (macOS), `~/.config/claude/claude_desktop_config.json` (Linux) or `%APPDATA%\Claude\claude_desktop_config.json` (Windows): + +```json +{ + "mcpServers": { + "bookrag-acme": { + "command": "/path/to/BookRAG/.venv/bin/python", + "args": ["/path/to/BookRAG/mcp_server.py"], + "env": { + "BOOKRAG_TENANT_ID": "acme", + "BOOKRAG_USER_ID": "alice", + "BOOKRAG_CONFIG_PATH": "/path/to/BookRAG/config/gbc.yaml", + "BOOKRAG_UPLOAD_DIR": "/path/to/BookRAG/uploads", + "BOOKRAG_INDEX_DIR": "/path/to/BookRAG/indices", + "BOOKRAG_FALKORDB_HOST": "localhost", + "BOOKRAG_FALKORDB_PORT": "6379", + "BOOKRAG_MONGO_URI": "mongodb://localhost:27017" + } + } + } +} +``` + +> **Note**: `BOOKRAG_SECRET_KEY` must also be present in the `env` block. `mcp_server.py` imports `api.dependencies`, which raises `RuntimeError` at import time when that variable is unset. + +Restart Claude Desktop. The BookRAG tools will appear in the 🔧 tools panel.
+ +To support multiple tenants in Claude Desktop, add multiple entries with different `BOOKRAG_TENANT_ID` / `BOOKRAG_USER_ID` values: + +```json +{ + "mcpServers": { + "bookrag-acme": { "command": "...", "env": { "BOOKRAG_TENANT_ID": "acme", ... } }, + "bookrag-beta": { "command": "...", "env": { "BOOKRAG_TENANT_ID": "beta", ... } } + } +} +``` + +### Cursor / Windsurf + +In your project's `.cursor/mcp.json` (or Windsurf equivalent): + +```json +{ + "mcpServers": { + "bookrag": { + "command": ".venv/bin/python", + "args": ["mcp_server.py"], + "env": { + "BOOKRAG_TENANT_ID": "dev", + "BOOKRAG_USER_ID": "cursor-agent", + "BOOKRAG_CONFIG_PATH": "config/gbc.yaml" + } + } + } +} +``` + +### Remote (Streamable HTTP) Client Config + +For clients that support the streamable-http transport (custom agents, LangChain, PydanticAI): + +```python +from mcp.client.streamable_http import streamable_http_client +from mcp import ClientSession + +async with streamable_http_client("http://your-server:8000/mcp") as (read, write, _): + async with ClientSession(read, write) as session: + await session.initialize() + result = await session.call_tool("query_documents", { + "question": "Who is the main antagonist?", + "doc_ids": ["doc-abc123"], + }) + print(result.content[0].text) +``` + +--- + +## 11. Long-Running Operations (Indexing) + +MCP tools are **request/response** — the client waits for the tool to return. Indexing a large PDF can take minutes. The `index_document` tool handles this with progress reporting: + +```python +@mcp.tool() +async def index_document(doc_id: str, pdf_path: str, ctx: Context) -> str: + await ctx.report_progress(0, 100, "Starting indexing...") + await index_svc.run_indexing(TENANT_ID, doc_id, pdf_path, CONFIG_PATH) + await ctx.report_progress(100, 100, "Done.") + return f"Document '{doc_id}' indexed successfully." +``` + +`ctx.report_progress(current, total, message)` sends MCP progress notifications that Claude Desktop displays as a progress bar. 
The client tool call remains open until the function returns. + +For very long operations (> 5 min), the recommended pattern is: +1. Launch indexing as a background task (already done in `run_indexing`) +2. Return immediately with `"Indexing started. Call get_document_status('{doc_id}') to check progress."` +3. The model can poll `get_document_status` in subsequent turns. + +--- + +## 12. Testing + +### Interactive MCP Inspector (recommended first step) + +```bash +cd /path/to/BookRAG +BOOKRAG_TENANT_ID=dev BOOKRAG_USER_ID=test \ + mcp dev mcp_server.py +``` + +This opens a web UI at `http://localhost:5173` where you can: +- Browse all registered Resources, Tools, and Prompts +- Call any tool interactively and inspect the JSON response +- No Claude Desktop or Cursor needed + +### Quick smoke-test with the MCP Python client + +```python +# test_mcp_client.py +import asyncio +from mcp import ClientSession, StdioServerParameters +from mcp.client.stdio import stdio_client + +async def main(): + params = StdioServerParameters( + command=".venv/bin/python", + args=["mcp_server.py"], + env={ + "BOOKRAG_TENANT_ID": "test", + "BOOKRAG_USER_ID": "tester", + "BOOKRAG_CONFIG_PATH": "config/gbc.yaml", + }, + ) + async with stdio_client(params) as (read, write): + async with ClientSession(read, write) as session: + await session.initialize() + + tools = await session.list_tools() + print("Tools:", [t.name for t in tools.tools]) + + resources = await session.list_resources() + print("Resources:", [r.uri for r in resources.resources]) + +asyncio.run(main()) +``` + +Run with: + +```bash +python test_mcp_client.py +``` + +--- + +## 13. 
Transport Options + +| Transport | Use case | How to run | +|---|---|---| +| **stdio** | Local: Claude Desktop, Cursor, dev | `mcp.run(transport="stdio")` (default) | +| **streamable-http** | Remote: hosted server, custom agents | `app.mount("/mcp", mcp.streamable_http_app())` | +| **SSE** | Legacy remote (older MCP clients) | `mcp.run(transport="sse")` | + +For most BookRAG deployments: +- **Development / single user**: stdio via Claude Desktop config +- **Team / production**: mount streamable-http on the existing FastAPI app at `/mcp` + +--- + +## Summary + +| Step | Action | +|---|---| +| 1 | `pip install "mcp[cli]"` | +| 2 | Create `mcp_server.py` (copy skeleton from Section 8) | +| 3 | Set env vars: `BOOKRAG_TENANT_ID`, `BOOKRAG_USER_ID`, `BOOKRAG_CONFIG_PATH`, and `BOOKRAG_SECRET_KEY` (required — `api/dependencies.py`, which the skeleton imports, raises at import time if it is unset) | +| 4 | Test with `mcp dev mcp_server.py` | +| 5 | Add to Claude Desktop config (Section 10) | +| 6 | *(Optional)* Mount to FastAPI for remote access (Section 9) | + +**Zero changes** to `Core/`, `api/services/`, `api/routers/`, or any existing behaviour are required.
+ diff --git a/api/__init__.py b/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/api/db/__init__.py b/api/db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/api/db/mongodb.py b/api/db/mongodb.py new file mode 100644 index 0000000..cddb116 --- /dev/null +++ b/api/db/mongodb.py @@ -0,0 +1,317 @@ +"""Async MongoDB client and CRUD helpers using Motor.""" +import hashlib +import logging +import os +from typing import List, Optional +from datetime import datetime, timezone + +import pymongo +from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase + +log = logging.getLogger(__name__) + +_client: Optional[AsyncIOMotorClient] = None + + +def get_client(uri: str) -> AsyncIOMotorClient: + global _client + if _client is None: + _client = AsyncIOMotorClient(uri) + return _client + + +def get_system_db(uri: str, system_db: str) -> AsyncIOMotorDatabase: + return get_client(uri)[system_db] + + +def get_tenant_db(uri: str, db_prefix: str, tenant_id: str) -> AsyncIOMotorDatabase: + return get_client(uri)[f"{db_prefix}_{tenant_id}"] + + +async def close_client(): + global _client + if _client: + _client.close() + _client = None + + +async def ensure_indexes(uri: str, system_db: str, db_prefix: str, tenant_ids: List[str]): + """Create MongoDB indexes for all known tenant databases. + + Called once at startup. Indexes are idempotent — ``create_index`` is a no-op + if the index already exists. 
+ """ + client = get_client(uri) + + # System DB indexes + sdb = client[system_db] + await sdb["tenants"].create_index("tenant_id", unique=True) + log.info(f"Ensured indexes on system db '{system_db}'") + + # Per-tenant DB indexes + for tid in tenant_ids: + tdb = client[f"{db_prefix}_{tid}"] + await tdb["users"].create_index("username", unique=True) + await tdb["users"].create_index("user_id", unique=True) + await tdb["documents"].create_index("doc_id", unique=True) + await tdb["permissions"].create_index( + [("user_id", pymongo.ASCENDING), ("doc_id", pymongo.ASCENDING)], + unique=True, + ) + await tdb["sessions"].create_index("session_id", unique=True) + await tdb["sessions"].create_index("user_id") + await tdb["entity_edits"].create_index("doc_id") + await tdb["entity_edits"].create_index("ts") + # Refresh token revocation index + await tdb["refresh_tokens"].create_index("token_hash", unique=True) + await tdb["refresh_tokens"].create_index("user_id") + await tdb["refresh_tokens"].create_index("expires_at", expireAfterSeconds=0) + log.info(f"Ensured indexes on {len(tenant_ids)} tenant database(s)") + + +# ── Tenant CRUD ────────────────────────────────────────────────────────────── + +async def create_tenant(uri: str, system_db: str, tenant_data: dict) -> str: + db = get_system_db(uri, system_db) + tenant_data["created_at"] = datetime.now(timezone.utc) + result = await db["tenants"].insert_one(tenant_data) + return str(result.inserted_id) + + +async def get_tenant(uri: str, system_db: str, tenant_id: str) -> Optional[dict]: + db = get_system_db(uri, system_db) + return await db["tenants"].find_one({"tenant_id": tenant_id}) + + +# ── User CRUD ───────────────────────────────────────────────────────────────── + +async def create_user(uri: str, db_prefix: str, tenant_id: str, user_data: dict) -> str: + db = get_tenant_db(uri, db_prefix, tenant_id) + user_data["created_at"] = datetime.now(timezone.utc) + result = await db["users"].insert_one(user_data) + return 
str(result.inserted_id) + + +async def get_user_by_username(uri: str, db_prefix: str, tenant_id: str, username: str) -> Optional[dict]: + db = get_tenant_db(uri, db_prefix, tenant_id) + return await db["users"].find_one({"username": username}) + + +# ── Document CRUD ───────────────────────────────────────────────────────────── + +async def create_document(uri: str, db_prefix: str, tenant_id: str, doc_data: dict) -> str: + db = get_tenant_db(uri, db_prefix, tenant_id) + doc_data["created_at"] = datetime.now(timezone.utc) + doc_data["status"] = "pending" + result = await db["documents"].insert_one(doc_data) + return str(result.inserted_id) + + +async def update_document_status(uri: str, db_prefix: str, tenant_id: str, doc_id: str, status: str, error: str = None): + db = get_tenant_db(uri, db_prefix, tenant_id) + update = {"$set": {"status": status, "updated_at": datetime.now(timezone.utc)}} + if error: + update["$set"]["error"] = error + await db["documents"].update_one({"doc_id": doc_id}, update) + + +async def get_document(uri: str, db_prefix: str, tenant_id: str, doc_id: str) -> Optional[dict]: + db = get_tenant_db(uri, db_prefix, tenant_id) + return await db["documents"].find_one({"doc_id": doc_id}) + + +async def get_document_raw_path(uri: str, db_prefix: str, tenant_id: str, doc_id: str) -> Optional[str]: + """Return the raw PDF path stored at upload time, or None if not found.""" + doc = await get_document(uri, db_prefix, tenant_id, doc_id) + return doc.get("pdf_path") if doc else None + + +async def list_documents( + uri: str, db_prefix: str, tenant_id: str, user_id: str, + limit: int = 50, offset: int = 0, +) -> tuple[list[dict], int]: + """Return paginated docs accessible to *user_id*, sorted by document_date desc. + + Returns ``(docs, total_count)``. Documents without ``document_date`` + fall back to ``created_at``; documents with neither sort last. 
+ """ + db = get_tenant_db(uri, db_prefix, tenant_id) + perm_cursor = db["permissions"].find({"user_id": user_id}) + doc_ids = [p["doc_id"] async for p in perm_cursor] + filt = {"doc_id": {"$in": doc_ids}} + total = await db["documents"].count_documents(filt) + cursor = ( + db["documents"] + .find(filt) + .sort([("document_date", pymongo.DESCENDING), ("created_at", pymongo.DESCENDING)]) + .skip(offset) + .limit(limit) + ) + docs = [d async for d in cursor] + return docs, total + + +async def delete_document(uri: str, db_prefix: str, tenant_id: str, doc_id: str): + """Delete a document plus its permission and entity-edit records (sessions are not touched).""" + db = get_tenant_db(uri, db_prefix, tenant_id) + await db["documents"].delete_one({"doc_id": doc_id}) + await db["permissions"].delete_many({"doc_id": doc_id}) + await db["entity_edits"].delete_many({"doc_id": doc_id}) + log.info(f"Deleted document '{doc_id}' and related records from tenant '{tenant_id}'") + + +# ── Permission CRUD ─────────────────────────────────────────────────────────── + +async def grant_permission(uri: str, db_prefix: str, tenant_id: str, user_id: str, doc_id: str, role: str = "reader"): + db = get_tenant_db(uri, db_prefix, tenant_id) + await db["permissions"].update_one( + {"user_id": user_id, "doc_id": doc_id}, + {"$set": {"role": role, "updated_at": datetime.now(timezone.utc)}}, + upsert=True, + ) + + +async def get_accessible_doc_ids(uri: str, db_prefix: str, tenant_id: str, user_id: str) -> List[str]: + db = get_tenant_db(uri, db_prefix, tenant_id) + cursor = db["permissions"].find({"user_id": user_id}) + return [p["doc_id"] async for p in cursor] + + +async def get_permission(uri: str, db_prefix: str, tenant_id: str, user_id: str, doc_id: str) -> Optional[dict]: + """Return the permission record for a user+doc pair, or None.""" + db = get_tenant_db(uri, db_prefix, tenant_id) + return await db["permissions"].find_one({"user_id": user_id, "doc_id": doc_id}) + + +# ── Session / Message CRUD
──────────────────────────────────────────────────── + +async def create_session(uri: str, db_prefix: str, tenant_id: str, session_data: dict) -> str: + db = get_tenant_db(uri, db_prefix, tenant_id) + session_data["created_at"] = datetime.now(timezone.utc) + result = await db["sessions"].insert_one(session_data) + return str(result.inserted_id) + + +# Max messages per session — prevents unbounded array growth (16 MB doc limit). +# Each user+assistant turn = 2 messages, so 200 = ~100 turns. +_SESSION_MSG_CAP = int(os.environ.get("BOOKRAG_SESSION_MSG_CAP", "200")) + + +async def append_message(uri: str, db_prefix: str, tenant_id: str, session_id: str, message: dict): + db = get_tenant_db(uri, db_prefix, tenant_id) + message["ts"] = datetime.now(timezone.utc) + await db["sessions"].update_one( + {"session_id": session_id}, + {"$push": {"messages": {"$each": [message], "$slice": -_SESSION_MSG_CAP}}}, + ) + + +async def get_session(uri: str, db_prefix: str, tenant_id: str, session_id: str) -> Optional[dict]: + db = get_tenant_db(uri, db_prefix, tenant_id) + return await db["sessions"].find_one({"session_id": session_id}) + + +async def list_sessions( + uri: str, db_prefix: str, tenant_id: str, user_id: str, + limit: int = 50, offset: int = 0, +) -> tuple[list[dict], int]: + """Return paginated sessions for *user_id*, newest first.""" + db = get_tenant_db(uri, db_prefix, tenant_id) + filt = {"user_id": user_id} + total = await db["sessions"].count_documents(filt) + cursor = ( + db["sessions"] + .find(filt, {"messages": 0}) # exclude messages array for listing + .sort("created_at", pymongo.DESCENDING) + .skip(offset) + .limit(limit) + ) + sessions = [s async for s in cursor] + return sessions, total + + +async def delete_session(uri: str, db_prefix: str, tenant_id: str, session_id: str): + """Delete a session and all its messages.""" + db = get_tenant_db(uri, db_prefix, tenant_id) + await db["sessions"].delete_one({"session_id": session_id}) + + +async def 
recover_stale_indexing(uri: str, db_prefix: str, tenant_ids: List[str]) -> int: + """Reset docs stuck in 'indexing' status to 'error' — called at startup. + + If the server crashed mid-indexing, those docs will never complete. + Returns the total number of documents recovered across all tenants. + """ + client = get_client(uri) + recovered = 0 + for tid in tenant_ids: + tdb = client[f"{db_prefix}_{tid}"] + result = await tdb["documents"].update_many( + {"status": "indexing"}, + {"$set": { + "status": "error", + "error": "Indexing interrupted by server restart — please re-upload", + "updated_at": datetime.now(timezone.utc), + }}, + ) + recovered += result.modified_count + return recovered + + +# ── Entity Edit Audit Log ───────────────────────────────────────────────────── + +async def log_entity_edit(uri: str, db_prefix: str, tenant_id: str, edit_record: dict): + """Write a lightweight audit entry for any entity edit operation. + + ``edit_record`` should contain at minimum: + - ``operation``: one of "rename" | "merge" | "split" + - ``doc_id``: document scope of the edit + - ``user_id``: who performed the edit + - ``before``: snapshot of the entity/entities before the change + - ``after``: snapshot of the entity/entities after the change + """ + db = get_tenant_db(uri, db_prefix, tenant_id) + edit_record["ts"] = datetime.now(timezone.utc) + await db["entity_edits"].insert_one(edit_record) + + + +# ── Refresh Token Management ──────────────────────────────────────────────── + +def _hash_token(token: str) -> str: + """SHA-256 hash of the raw refresh token — we never store plaintext.""" + return hashlib.sha256(token.encode()).hexdigest() + + +async def store_refresh_token( + uri: str, db_prefix: str, tenant_id: str, + user_id: str, token: str, expires_at: datetime, +): + """Store a hashed refresh token so it can be revoked later.""" + db = get_tenant_db(uri, db_prefix, tenant_id) + await db["refresh_tokens"].insert_one({ + "token_hash": _hash_token(token), + "user_id": 
user_id, + "created_at": datetime.now(timezone.utc), + "expires_at": expires_at, + }) + + +async def is_refresh_token_valid(uri: str, db_prefix: str, tenant_id: str, token: str) -> bool: + """Return True if the token hash exists (i.e. has NOT been revoked).""" + db = get_tenant_db(uri, db_prefix, tenant_id) + doc = await db["refresh_tokens"].find_one({"token_hash": _hash_token(token)}) + return doc is not None + + +async def revoke_refresh_token(uri: str, db_prefix: str, tenant_id: str, token: str): + """Revoke a single refresh token by removing it.""" + db = get_tenant_db(uri, db_prefix, tenant_id) + await db["refresh_tokens"].delete_one({"token_hash": _hash_token(token)}) + + +async def revoke_all_refresh_tokens(uri: str, db_prefix: str, tenant_id: str, user_id: str): + """Revoke all refresh tokens for a user (e.g. password change).""" + db = get_tenant_db(uri, db_prefix, tenant_id) + result = await db["refresh_tokens"].delete_many({"user_id": user_id}) + log.info(f"Revoked {result.deleted_count} refresh tokens for user {user_id}") diff --git a/api/dependencies.py b/api/dependencies.py new file mode 100644 index 0000000..a71feaf --- /dev/null +++ b/api/dependencies.py @@ -0,0 +1,192 @@ +"""FastAPI dependency injection: JWT verification, DB handles, permission checks.""" +import os +import logging +import time +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor +from typing import Optional +from fastapi import Depends, HTTPException, Request, status +from fastapi.security import OAuth2PasswordBearer +from jose import JWTError, jwt +from passlib.context import CryptContext + +from api.db import mongodb as db + +log = logging.getLogger(__name__) + +# ── Config (read from env with sensible defaults) ───────────────────────────── +_secret = os.getenv("BOOKRAG_SECRET_KEY", "") +if not _secret: + raise RuntimeError( + "BOOKRAG_SECRET_KEY environment variable is not set. 
" + "Generate a secure key with: python -c \"import secrets; print(secrets.token_urlsafe(64))\" " + "and export it before starting the server." + ) +SECRET_KEY = _secret +ALGORITHM = "HS256" +ACCESS_TOKEN_EXPIRE_MINUTES = int(os.getenv("BOOKRAG_TOKEN_EXPIRE", "60")) +REFRESH_TOKEN_EXPIRE_DAYS = int(os.getenv("BOOKRAG_REFRESH_TOKEN_DAYS", "7")) + +MONGO_URI = os.getenv("BOOKRAG_MONGO_URI", "mongodb://localhost:27017") +MONGO_DB_PREFIX = os.getenv("BOOKRAG_MONGO_PREFIX", "bookrag") +MONGO_SYSTEM_DB = os.getenv("BOOKRAG_MONGO_SYSTEM_DB", "bookrag_system") + +FALKORDB_HOST = os.getenv("BOOKRAG_FALKORDB_HOST", "localhost") +FALKORDB_PORT = int(os.getenv("BOOKRAG_FALKORDB_PORT", "6379")) +FALKORDB_USERNAME = os.getenv("BOOKRAG_FALKORDB_USERNAME", "") +FALKORDB_PASSWORD = os.getenv("BOOKRAG_FALKORDB_PASSWORD", "") + +UPLOAD_DIR = os.getenv("BOOKRAG_UPLOAD_DIR", "./uploads") +INDEX_SAVE_DIR = os.getenv("BOOKRAG_INDEX_DIR", "./indices") + +# ── Shared thread pool ─────────────────────────────────────────────────────── +# Single GPU-aware pool shared by chat, indexing, entity_editor, entity_resolution. +# Size is tunable via env var — default 4 workers (matches original chat pool). 
+THREAD_POOL = ThreadPoolExecutor( + max_workers=int(os.getenv("BOOKRAG_THREAD_POOL_SIZE", "4")), + thread_name_prefix="bookrag", +) + +# ── Auth helpers ────────────────────────────────────────────────────────────── +pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto") +oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/auth/login") + + +def hash_password(password: str) -> str: + return pwd_context.hash(password) + + +def verify_password(plain: str, hashed: str) -> bool: + return pwd_context.verify(plain, hashed) + + +def create_access_token(data: dict) -> str: + from datetime import datetime, timedelta, timezone + payload = data.copy() + payload["exp"] = datetime.now(timezone.utc) + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES) + payload["type"] = "access" + return jwt.encode(payload, SECRET_KEY, algorithm=ALGORITHM) + + +def create_refresh_token(data: dict) -> str: + """Create a long-lived refresh token (default 7 days).""" + from datetime import datetime, timedelta, timezone + payload = data.copy() + payload["exp"] = datetime.now(timezone.utc) + timedelta(days=REFRESH_TOKEN_EXPIRE_DAYS) + payload["type"] = "refresh" + return jwt.encode(payload, SECRET_KEY, algorithm=ALGORITHM) + + +def decode_refresh_token(token: str) -> dict: + """Decode and validate a refresh token. 
async def get_current_user(token: str = Depends(oauth2_scheme)) -> dict:
    """FastAPI dependency: turn a bearer token into the caller's identity.

    Returns:
        ``{"user_id", "tenant_id", "role"}`` taken from the ``sub``,
        ``tenant_id`` and ``role`` claims (``role`` defaults to ``"user"``).

    Raises:
        HTTPException: 401 with a ``WWW-Authenticate: Bearer`` header when the
            token cannot be decoded, is a refresh token, or lacks ``sub`` or
            ``tenant_id``.
    """
    unauthorized = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Could not validate credentials",
        headers={"WWW-Authenticate": "Bearer"},
    )

    try:
        claims = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
    except JWTError:
        raise unauthorized

    # Refresh tokens must never be accepted where an access token is expected.
    # NOTE(review): this is a deny-list; any other future ``type`` value would
    # still be accepted here — confirm whether ``== "access"`` is intended.
    if claims.get("type") == "refresh":
        raise unauthorized

    subject = claims.get("sub")
    tenant = claims.get("tenant_id")
    if not (subject and tenant):
        raise unauthorized

    return {
        "user_id": subject,
        "tenant_id": tenant,
        "role": claims.get("role", "user"),
    }
_LOGIN_RPM = int(os.getenv("BOOKRAG_LOGIN_RPM", "10"))   # login attempts per minute per IP
_QUERY_RPM = int(os.getenv("BOOKRAG_QUERY_RPM", "30"))   # chat queries per minute per user
_WINDOW = 60.0  # seconds


class _RateBucket:
    """Sliding-window request counter, one window per key.

    Each key maps to the ``time.monotonic()`` timestamps of its requests
    accepted during the last ``_WINDOW`` seconds. Keys that have been idle
    for a whole window are evicted by a sweep that runs at most once per
    window, so memory stays proportional to the number of recently active
    keys rather than to every key ever seen.
    """

    __slots__ = ("_hits", "_next_sweep")

    def __init__(self):
        self._hits: dict[str, list[float]] = defaultdict(list)
        # Monotonic time at which the next idle-key sweep becomes due.
        self._next_sweep: float = time.monotonic() + _WINDOW

    def _evict_idle(self, cutoff: float) -> None:
        """Drop every key whose newest timestamp is not later than *cutoff*."""
        # Timestamps are appended in increasing order, so hits[-1] is the newest.
        idle = [key for key, hits in self._hits.items() if not hits or hits[-1] <= cutoff]
        for key in idle:
            del self._hits[key]

    def check(self, key: str, limit: int) -> bool:
        """Record a request for *key* and report whether it is allowed.

        Args:
            key: Identity being limited (client IP, ``tenant:user``, ...).
            limit: Maximum accepted requests per ``_WINDOW`` seconds.

        Returns:
            True if the request fits in the window (and was recorded),
            False if the key already has ``limit`` requests in the window.
        """
        now = time.monotonic()
        cutoff = now - _WINDOW

        # Without this sweep a key is only pruned when that same key is seen
        # again, so one-off keys (e.g. rotating client IPs) accumulate forever.
        if now >= self._next_sweep:
            self._evict_idle(cutoff)
            self._next_sweep = now + _WINDOW

        # Prune expired timestamps for this key
        window = [t for t in self._hits[key] if t > cutoff]
        self._hits[key] = window
        if len(window) >= limit:
            return False
        window.append(now)
        return True


_login_bucket = _RateBucket()
_query_bucket = _RateBucket()
class _JSONFormatter(logging.Formatter):
    """Formatter that renders every log record as one JSON object per line."""

    def format(self, record: logging.LogRecord) -> str:
        """Serialize *record* to a JSON string.

        The object always carries ``ts``, ``level``, ``logger`` and ``msg``.
        ``request_id`` is added when the record has that attribute, and
        ``exc`` (the formatted traceback) when real exception info is
        attached. Values that are not JSON-serializable fall back to ``str``.
        """
        payload = dict(
            ts=self.formatTime(record, self.datefmt),
            level=record.levelname,
            logger=record.name,
            msg=record.getMessage(),
        )

        try:
            payload["request_id"] = record.request_id
        except AttributeError:
            # No request context was attached to this record.
            pass

        exc_info = record.exc_info
        if exc_info and exc_info[0] is not None:
            payload["exc"] = self.formatException(exc_info)

        return json.dumps(payload, default=str)
import contextvars

# Request ID of the request currently being handled; "-" outside any request.
_request_id_ctx: contextvars.ContextVar[str] = contextvars.ContextVar("request_id", default="-")


class _RequestIDFilter(logging.Filter):
    """Stamp every record with the current request ID; never drops a record."""

    def filter(self, record: logging.LogRecord) -> bool:
        """Set ``record.request_id`` from the context variable and keep the record."""
        record.request_id = _request_id_ctx.get("-")  # type: ignore[attr-defined]
        return True


_request_id_filter = _RequestIDFilter()
# Kept for records logged directly on the root logger.
logging.root.addFilter(_request_id_filter)
# A filter on a *logger* is only consulted for records created on that logger;
# records propagated from descendant loggers (every ``getLogger(__name__)`` in
# the app) bypass it. Filters on *handlers* see every record the handler
# emits, so the root handlers must carry the filter as well.
for _root_handler in logging.root.handlers:
    _root_handler.addFilter(_request_id_filter)
for all known tenants + tenant_ids: list[str] = [] + try: + sdb = db.get_system_db(MONGO_URI, MONGO_SYSTEM_DB) + tenant_ids = [t["tenant_id"] async for t in sdb["tenants"].find({}, {"tenant_id": 1})] + await db.ensure_indexes(MONGO_URI, MONGO_SYSTEM_DB, MONGO_DB_PREFIX, tenant_ids) + except Exception as exc: + log.warning(f"MongoDB index creation skipped: {exc}") + + # Recover docs stuck in "indexing" status from a previous crash + if tenant_ids: + try: + recovered = await db.recover_stale_indexing(MONGO_URI, MONGO_DB_PREFIX, tenant_ids) + if recovered: + log.warning(f"Recovered {recovered} stale indexing document(s) → status='error'") + except Exception as exc: + log.warning(f"Stale indexing recovery skipped: {exc}") + + yield + + # Graceful shutdown: finish running tasks, don't cancel them + log.info("BookRAG API shutting down — draining thread pool...") + THREAD_POOL.shutdown(wait=True, cancel_futures=False) + await db.close_client() + log.info("BookRAG API shut down cleanly.") + + +app = FastAPI( + title="BookRAG API", + description="Multi-tenant, multi-document chatbot powered by GBC-RAG", + version="1.0.0", + lifespan=lifespan, +) + +# Request-ID middleware (must be added before CORS) +app.add_middleware(_RequestIDMiddleware) + +# CORS — set BOOKRAG_CORS_ORIGINS to a comma-separated list of allowed origins +_cors_raw = os.getenv("BOOKRAG_CORS_ORIGINS", "http://localhost:3000,http://localhost:8000") +_cors_origins = [o.strip() for o in _cors_raw.split(",") if o.strip()] +if "*" in _cors_origins: + log.warning( + "CORS allow_origins contains '*'. This is insecure with credentials=True. " + "Set BOOKRAG_CORS_ORIGINS to explicit origins in production." 
def _ping_falkordb() -> None:
    """Open a short-lived FalkorDB connection, ping it, and always close it.

    Synchronous on purpose: ``health`` runs it in a worker thread. Any
    connection or ping error propagates to the caller.
    """
    from api.dependencies import FALKORDB_HOST, FALKORDB_PORT, FALKORDB_USERNAME, FALKORDB_PASSWORD
    import falkordb

    conn_kwargs = {"host": FALKORDB_HOST, "port": FALKORDB_PORT}
    if FALKORDB_USERNAME:
        conn_kwargs["username"] = FALKORDB_USERNAME
    if FALKORDB_PASSWORD:
        conn_kwargs["password"] = FALKORDB_PASSWORD

    fdb = falkordb.FalkorDB(**conn_kwargs)
    try:
        fdb.connection.ping()
    finally:
        # A new client is built for every probe; release its connection so
        # repeated health checks do not accumulate open sockets.
        fdb.connection.close()


@app.get("/health")
async def health():
    """Deep health check — verify MongoDB and FalkorDB connectivity.

    Returns:
        A dict with ``service``, one entry per probed backend (``"ok"`` or
        ``"error: <reason>"``) and ``status``: ``"ok"`` when every probe
        succeeded, ``"degraded"`` otherwise. The FalkorDB probe only runs
        when ``BOOKRAG_FALKORDB_HOST`` is set in the environment.
    """
    import asyncio

    checks: dict = {"service": "BookRAG API"}

    # MongoDB ping
    try:
        client = db.get_client(MONGO_URI)
        await client.admin.command("ping")
        checks["mongodb"] = "ok"
    except Exception as exc:
        # NOTE(review): the raw exception text is returned to the caller of
        # this endpoint — confirm that is acceptable for public deployments.
        checks["mongodb"] = f"error: {exc}"

    # FalkorDB ping (optional — only if host is configured)
    fdb_host = os.getenv("BOOKRAG_FALKORDB_HOST", "")
    if fdb_host:
        try:
            # The client is blocking; keep it off the event loop so a slow or
            # unreachable FalkorDB cannot stall every other request.
            await asyncio.to_thread(_ping_falkordb)
            checks["falkordb"] = "ok"
        except Exception as exc:
            checks["falkordb"] = f"error: {exc}"

    overall = "ok" if all(v == "ok" for k, v in checks.items() if k != "service") else "degraded"
    checks["status"] = overall
    return checks
class RegisterRequest(BaseModel):
    # Payload for creating a user inside an existing tenant.
    username: str = Field(..., min_length=3, max_length=_SHORT_STR)
    password: str = Field(..., min_length=_PASSWORD_MIN, max_length=_PASSWORD_MAX)
    tenant_id: str = Field(..., min_length=1, max_length=_SHORT_STR)

    @field_validator("password")
    @classmethod
    def password_complexity(cls, v: str) -> str:
        # Requirements are checked in this order; the first unmet one is reported.
        requirements = (
            (str.isupper, "Password must contain at least one uppercase letter"),
            (str.islower, "Password must contain at least one lowercase letter"),
            (str.isdigit, "Password must contain at least one digit"),
        )
        for has_class, message in requirements:
            if not any(map(has_class, v)):
                raise ValueError(message)
        return v
class DocumentResponse(BaseModel):
    # API representation of one uploaded document and its indexing state.
    # Only doc_id, filename and status are required; every other field
    # defaults to None when the caller does not supply it.
    doc_id: str = Field(..., description="Unique document identifier")
    filename: str = Field(..., description="Original filename")
    status: str = Field(..., description="Indexing status: pending | indexing | ready | error")
    error: Optional[str] = Field(default=None, description="Error message if status is 'error'")
    created_at: Optional[datetime] = Field(default=None, description="Upload timestamp (UTC)")
    # Supplied by the uploader, as opposed to created_at which is the upload time.
    document_date: Optional[datetime] = Field(
        default=None,
        description="User-provided original authoring/publishing date of the document. "
                    "Used for temporal awareness in cross-document RAG.",
    )
    # Free-form string: no validation of the language code happens in this model.
    document_lang: Optional[str] = Field(
        default=None,
        description="ISO 639-1 language code (e.g. 'en', 'id') or 'auto' for auto-detection. "
                    "Used for legal heading detection and language-aware text processing.",
    )
class MergeEntitiesRequest(BaseModel):
    # Request to fold several entities into one canonical entity.
    # The "≥ 2" requirement was previously only a comment; it is now enforced
    # at validation time so a 0- or 1-element merge is rejected by the model.
    source_entities: List[EntityRef] = Field(..., min_length=2)   # entities to merge (≥ 2)
    canonical_entity_name: str
    canonical_entity_type: str
    canonical_description: str = ""


class NewEntitySpec(BaseModel):
    # One entity to be created by a split.
    entity_name: str
    entity_type: str
    description: str = ""
    source_ids: List[int] = Field(default_factory=list)


class SplitEntityRequest(BaseModel):
    # Request to replace one entity with several new ones.
    entity_name: str
    entity_type: str
    # A split into fewer than two entities is not a split; enforced here.
    new_entities: List[NewEntitySpec] = Field(..., min_length=2)  # ≥ 2 new entities
    # NOTE(review): the allowed values below are not validated by this model —
    # confirm whether the service layer rejects anything else.
    edge_mode: str = "duplicate"               # "duplicate" | "none"
@router.post("/register", status_code=status.HTTP_201_CREATED)
async def register(req: RegisterRequest):
    """Create a user account inside an existing tenant.

    Raises:
        HTTPException: 404 when the tenant does not exist, 409 when the
            username is already present in that tenant.
    """
    tenant_id, username = req.tenant_id, req.username

    # The tenant must already exist.
    if not await db.get_tenant(MONGO_URI, MONGO_SYSTEM_DB, tenant_id):
        raise HTTPException(status_code=404, detail=f"Tenant '{tenant_id}' not found")

    # Usernames are unique per tenant.
    if await db.get_user_by_username(MONGO_URI, MONGO_DB_PREFIX, tenant_id, username):
        raise HTTPException(status_code=409, detail="Username already registered")

    new_user = dict(
        username=username,
        hashed_password=hash_password(req.password),
        tenant_id=tenant_id,
        role="user",
        user_id=username,  # use username as user_id for simplicity
    )
    await db.create_user(MONGO_URI, MONGO_DB_PREFIX, tenant_id, new_user)
    return {"message": "User registered successfully"}
@router.post("/refresh", response_model=TokenResponse)
async def refresh_token(req: RefreshRequest):
    """Rotate a refresh token into a fresh access + refresh pair.

    The presented token is decoded, checked against the revocation store,
    revoked (single-use rotation), and replaced by a newly stored token.

    Raises:
        HTTPException: 401 when the token fails decoding (raised by
            ``decode_refresh_token``) or is no longer valid in the store.
    """
    presented = req.refresh_token
    old_claims = decode_refresh_token(presented)
    tenant_id = old_claims.get("tenant_id", "")
    user_id = old_claims.get("sub", "")

    # NOTE(review): the validity check and the revocation below are two
    # separate calls — confirm the DB layer prevents two concurrent requests
    # from both rotating the same token.
    still_valid = await db.is_refresh_token_valid(MONGO_URI, MONGO_DB_PREFIX, tenant_id, presented)
    if not still_valid:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Refresh token has been revoked",
        )

    # Single-use rotation: the presented token dies here.
    await db.revoke_refresh_token(MONGO_URI, MONGO_DB_PREFIX, tenant_id, presented)

    # The role is copied from the old token, not re-read from the user record.
    new_claims = {
        "sub": user_id,
        "tenant_id": tenant_id,
        "role": old_claims.get("role", "user"),
    }
    issued_access = create_access_token(new_claims)
    issued_refresh = create_refresh_token(new_claims)

    refresh_expiry = datetime.now(timezone.utc) + timedelta(days=REFRESH_TOKEN_EXPIRE_DAYS)
    await db.store_refresh_token(
        MONGO_URI, MONGO_DB_PREFIX, tenant_id, user_id, issued_refresh, refresh_expiry,
    )

    return TokenResponse(access_token=issued_access, refresh_token=issued_refresh)
@router.post("/query", response_model=ChatQueryResponse)
async def query(req: ChatQueryRequest, current_user: dict = Depends(rate_limit_query)):
    """Answer a chat query over the documents the caller may read.

    ``req.doc_ids`` is first narrowed to the caller's accessible documents;
    the rate-limit dependency supplies the authenticated user.

    Raises:
        HTTPException: 403 when no accessible document remains.
    """
    tenant_id = current_user["tenant_id"]
    user_id = current_user["user_id"]

    permitted = await filter_accessible_docs(user_id, tenant_id, req.doc_ids)
    if not permitted:
        raise HTTPException(status_code=403, detail="No accessible documents for this query")

    call_args = {
        "query": req.query,
        "tenant_id": tenant_id,
        "user_id": user_id,
        "doc_ids": permitted,
        "session_id": req.session_id,
        "config_path": CONFIG_PATH,
        "cross_doc": req.cross_doc,
    }
    outcome = await handle_query(**call_args)
    return ChatQueryResponse(**outcome)
@router.get("/sessions/{session_id}/messages", response_model=SessionMessagesResponse)
async def get_messages(
    session_id: str,
    limit: int = Query(default=100, ge=1, le=500, description="Max messages to return"),
    offset: int = Query(default=0, ge=0, description="Number of messages to skip"),
    current_user: dict = Depends(get_current_user),
):
    """Return one page of a session's messages plus the unpaginated total.

    Raises:
        HTTPException: 404 when the session does not exist, 403 when the
            caller neither owns it nor has the ``admin`` role.
    """
    tenant_id = current_user["tenant_id"]
    user_id = current_user["user_id"]

    session = await db.get_session(MONGO_URI, MONGO_DB_PREFIX, tenant_id, session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    is_owner = session.get("user_id") == user_id
    if not (is_owner or current_user["role"] == "admin"):
        raise HTTPException(status_code=403, detail="Access denied")

    history = session.get("messages", [])
    page = history[slice(offset, offset + limit)]

    rendered = []
    for entry in page:
        rendered.append(
            MessageResponse(
                role=entry["role"],
                content=entry["content"],
                ts=str(entry.get("ts", "")),
            )
        )
    return SessionMessagesResponse(session_id=session_id, messages=rendered, total=len(history))
async def _save_and_register_file(
    file: UploadFile,
    tenant_id: str,
    user_id: str,
    tenant_upload_dir: str,
) -> dict:
    """Save one uploaded file to the tenant upload dir and return doc metadata.

    The upload is streamed to ``<tenant_upload_dir>/<doc_id>_<basename>`` in
    1 MB chunks. If anything goes wrong while streaming (size limit, read or
    write error, cancellation) the partially written file is removed before
    the exception propagates.

    Returns:
        A dict with ``doc_id``, ``filename`` (as sent by the client),
        ``tenant_id``, ``uploaded_by``, ``pdf_path``, ``created_at`` (UTC)
        and ``status`` set to ``"pending"``.

    Raises:
        HTTPException: 400 if the upload carries no filename; 413 if the
            file exceeds ``_MAX_UPLOAD_BYTES``.
    """
    from datetime import timezone

    if not file.filename:
        # os.path.basename(None) would raise TypeError; fail with a clear error.
        raise HTTPException(status_code=400, detail="Uploaded file has no filename")

    doc_id = str(uuid.uuid4())
    # Preserve original filename; prefix with doc_id to avoid collisions
    safe_name = os.path.basename(file.filename)
    pdf_path = os.path.join(tenant_upload_dir, f"{doc_id}_{safe_name}")

    total_size = 0
    try:
        async with aiofiles.open(pdf_path, "wb") as out:
            while chunk := await file.read(1024 * 1024):  # 1 MB chunks
                total_size += len(chunk)
                if total_size > _MAX_UPLOAD_BYTES:
                    raise HTTPException(
                        status_code=413,
                        detail=f"File exceeds maximum upload size of {_MAX_UPLOAD_BYTES // (1024*1024)} MB",
                    )
                await out.write(chunk)
    except BaseException:
        # BaseException so a cancelled request (client disconnect) is covered
        # too. The ``async with`` has already closed the handle at this point.
        try:
            os.remove(pdf_path)
        except OSError:
            pass
        raise

    now = datetime.now(timezone.utc)
    return {
        "doc_id": doc_id,
        "filename": file.filename,
        "tenant_id": tenant_id,
        "uploaded_by": user_id,
        "pdf_path": pdf_path,
        "created_at": now,
        "status": "pending",
    }
+ + An optional ``document_lang`` (ISO 639-1 code like ``en``, ``id``) can + be provided to hint the document language for legal heading detection + and text processing. Omit for automatic detection. + """ + tenant_id = current_user["tenant_id"] + user_id = current_user["user_id"] + + # Parse optional document_date + parsed_doc_date: Optional[datetime] = None + if document_date: + try: + parsed_doc_date = datetime.fromisoformat(document_date.replace("Z", "+00:00")) + except ValueError: + raise HTTPException(status_code=422, detail="document_date must be a valid ISO-8601 date string") + + tenant_upload_dir = os.path.join(UPLOAD_DIR, tenant_id) + os.makedirs(tenant_upload_dir, exist_ok=True) + + uploaded: List[DocumentResponse] = [] + failed: List[dict] = [] + + for file in files: + if not file.filename.lower().endswith(".pdf"): + failed.append({"filename": file.filename, "error": "Only PDF files are supported"}) + continue + try: + doc_data = await _save_and_register_file(file, tenant_id, user_id, tenant_upload_dir) + except Exception as exc: + log.error(f"Failed to save file '{file.filename}': {exc}") + failed.append({"filename": file.filename, "error": str(exc)}) + continue + + # Attach document_date if provided + if parsed_doc_date: + doc_data["document_date"] = parsed_doc_date + + # Attach document_lang if provided + if document_lang: + doc_data["document_lang"] = document_lang + + # Register document in MongoDB + await db.create_document(MONGO_URI, MONGO_DB_PREFIX, tenant_id, doc_data) + # Auto-grant uploader owner access + await db.grant_permission(MONGO_URI, MONGO_DB_PREFIX, tenant_id, user_id, doc_data["doc_id"], "owner") + # Enqueue background indexing + background_tasks.add_task( + run_indexing, tenant_id, doc_data["doc_id"], doc_data["pdf_path"], CONFIG_PATH, + document_date=parsed_doc_date, + document_lang=document_lang, + ) + uploaded.append(DocumentResponse( + doc_id=doc_data["doc_id"], filename=file.filename, status="pending", + 
document_date=parsed_doc_date, + document_lang=document_lang, + )) + + return BatchUploadResponse(uploaded=uploaded, failed=failed) + + +@router.get("", response_model=List[DocumentResponse]) +async def list_documents( + limit: int = Query(default=50, ge=1, le=200, description="Max documents to return"), + offset: int = Query(default=0, ge=0, description="Number of documents to skip"), + current_user: dict = Depends(get_current_user), +): + """List documents accessible to the current user, sorted by document_date descending.""" + tenant_id = current_user["tenant_id"] + user_id = current_user["user_id"] + docs, _total = await db.list_documents(MONGO_URI, MONGO_DB_PREFIX, tenant_id, user_id, limit=limit, offset=offset) + return [ + DocumentResponse( + doc_id=d["doc_id"], + filename=d.get("filename", ""), + status=d.get("status", "unknown"), + error=d.get("error"), + created_at=d.get("created_at"), + document_date=d.get("document_date"), + ) + for d in docs + ] + + +@router.get("/{doc_id}", response_model=DocumentResponse) +async def get_document_status(doc_id: str, current_user: dict = Depends(get_current_user)): + """Get indexing status for a specific document.""" + tenant_id = current_user["tenant_id"] + user_id = current_user["user_id"] + + if not await check_doc_access(user_id, tenant_id, doc_id): + raise HTTPException(status_code=403, detail="Access denied to this document") + + doc = await db.get_document(MONGO_URI, MONGO_DB_PREFIX, tenant_id, doc_id) + if not doc: + raise HTTPException(status_code=404, detail="Document not found") + return DocumentResponse( + doc_id=doc["doc_id"], + filename=doc.get("filename", ""), + status=doc.get("status", "unknown"), + error=doc.get("error"), + created_at=doc.get("created_at"), + document_date=doc.get("document_date"), + ) + + +@router.delete("/{doc_id}", status_code=204) +async def delete_document(doc_id: str, current_user: dict = Depends(get_current_user)): + """Delete a document and all associated indexes, VDB data, and 
FalkorDB graph. + + Requires the requesting user to be the document owner or an admin. + """ + tenant_id = current_user["tenant_id"] + user_id = current_user["user_id"] + + # Only owner or admin can delete + if current_user["role"] != "admin": + perm = await db.get_permission(MONGO_URI, MONGO_DB_PREFIX, tenant_id, user_id, doc_id) + if not perm or perm.get("role") != "owner": + raise HTTPException(status_code=403, detail="Only document owners or admins can delete documents") + + doc = await db.get_document(MONGO_URI, MONGO_DB_PREFIX, tenant_id, doc_id) + if not doc: + raise HTTPException(status_code=404, detail="Document not found") + + # Clean up filesystem: uploaded PDF + pdf_path = doc.get("pdf_path", "") + if pdf_path and os.path.isfile(pdf_path): + try: + os.remove(pdf_path) + except OSError: + log.warning(f"Could not remove uploaded PDF: {pdf_path}") + + # Clean up filesystem: index directory + index_dir = os.path.join(INDEX_SAVE_DIR, tenant_id, doc_id) + if os.path.isdir(index_dir): + try: + shutil.rmtree(index_dir) + except OSError: + log.warning(f"Could not remove index directory: {index_dir}") + + # Clean up FalkorDB graph (best-effort) + try: + from api.dependencies import FALKORDB_HOST, FALKORDB_PORT, FALKORDB_USERNAME, FALKORDB_PASSWORD + if os.getenv("BOOKRAG_FALKORDB_HOST", ""): + import falkordb + conn_kwargs = {"host": FALKORDB_HOST, "port": FALKORDB_PORT} + if FALKORDB_USERNAME: + conn_kwargs["username"] = FALKORDB_USERNAME + if FALKORDB_PASSWORD: + conn_kwargs["password"] = FALKORDB_PASSWORD + fdb = falkordb.FalkorDB(**conn_kwargs) + graph_name = f"bookrag:{tenant_id}:doc:{doc_id}" + try: + g = fdb.select_graph(graph_name) + g.delete() + log.info(f"Deleted FalkorDB graph '{graph_name}'") + except Exception: + pass # Graph may not exist + except Exception as exc: + log.warning(f"FalkorDB cleanup skipped: {exc}") + + # Clean up MongoDB records + await db.delete_document(MONGO_URI, MONGO_DB_PREFIX, tenant_id, doc_id) + + 
@router.get("/{doc_id}/raw")
async def download_raw_document(doc_id: str, current_user: dict = Depends(get_current_user)):
    """Stream back the original uploaded PDF file.

    The stored path is resolved and checked against ``UPLOAD_DIR`` *before*
    the filesystem is probed, and the resolved path is the one that is
    served, so the file that was validated is the file that is returned.

    Raises:
        HTTPException: 403 when the user has no access to the document or
            the stored path resolves outside ``UPLOAD_DIR``; 404 when no
            path is recorded or the file no longer exists.
    """
    tenant_id = current_user["tenant_id"]
    user_id = current_user["user_id"]

    if not await check_doc_access(user_id, tenant_id, doc_id):
        raise HTTPException(status_code=403, detail="Access denied to this document")

    raw_path = await db.get_document_raw_path(MONGO_URI, MONGO_DB_PREFIX, tenant_id, doc_id)
    if not raw_path:
        raise HTTPException(status_code=404, detail="Raw document file not found")

    # Prevent path-traversal: resolved path must be inside UPLOAD_DIR.
    # commonpath compares whole path components, so "/uploads-evil" is not
    # mistaken for a child of "/uploads".
    resolved = os.path.realpath(raw_path)
    upload_root = os.path.realpath(UPLOAD_DIR)
    try:
        inside_root = os.path.commonpath([resolved, upload_root]) == upload_root
    except ValueError:
        # Raised e.g. for paths on different drives — treat as outside.
        inside_root = False
    if not inside_root:
        log.warning(f"Path traversal blocked: {raw_path} resolved to {resolved}")
        raise HTTPException(status_code=403, detail="Access denied")

    if not os.path.isfile(resolved):
        raise HTTPException(status_code=404, detail="Raw document file not found")

    filename = os.path.basename(raw_path)
    return FileResponse(
        path=resolved,
        media_type="application/pdf",
        filename=filename,
    )
@router.get("/{doc_id}", response_model=EntityListResponse)
async def list_entities(doc_id: str, current_user: dict = Depends(get_current_user)):
    """List every NER entity stored for ``doc_id``.

    The caller must have access to the document (403 otherwise). Any failure
    in the entity service is logged with its traceback and surfaced as a
    generic 500 so internal details are not leaked to the client.
    """
    tenant = current_user["tenant_id"]
    requester = current_user["user_id"]
    await _require_access(tenant, requester, doc_id)

    try:
        rows = await svc.list_entities(tenant, doc_id, CONFIG_PATH)
    except Exception as exc:
        log.exception(f"list_entities failed: {exc}")
        raise HTTPException(status_code=500, detail="Internal server error")

    infos = [EntityInfo(**row) for row in rows]
    return EntityListResponse(entities=infos, total=len(rows))
@router.post("/{doc_id}/merge", response_model=EntityOperationResponse)
async def merge_entities(
    doc_id: str,
    req: MergeEntitiesRequest,
    current_user: dict = Depends(get_current_user),
):
    """Merge two or more entities into a single canonical entity.

    Raises:
        HTTPException: 403 when the user cannot access the document; 422
            when fewer than two source entities are supplied; 404 when the
            service raises ``KeyError`` (unknown entity); 500 for any other
            service failure (logged with traceback).
    """
    tenant_id = current_user["tenant_id"]
    user_id = current_user["user_id"]
    await _require_access(tenant_id, user_id, doc_id)

    if len(req.source_entities) < 2:
        raise HTTPException(status_code=422, detail="Provide at least 2 source_entities to merge")

    try:
        updated = await svc.merge_entities(
            tenant_id=tenant_id, doc_id=doc_id, config_path=CONFIG_PATH,
            source_entities=[e.model_dump() for e in req.source_entities],
            canonical_name=req.canonical_entity_name,
            canonical_type=req.canonical_entity_type,
            canonical_desc=req.canonical_description,
            user_id=user_id,
        )
    except KeyError as exc:
        # str(KeyError("msg")) is the repr of the message (wrapped in quotes);
        # use the raw argument so the client sees the message itself.
        detail = str(exc.args[0]) if exc.args else str(exc)
        raise HTTPException(status_code=404, detail=detail) from exc
    except Exception as exc:
        log.exception(f"merge_entities failed: {exc}")
        raise HTTPException(status_code=500, detail="Internal server error") from exc

    return EntityOperationResponse(
        success=True,
        message=f"Merged {len(req.source_entities)} entities → '{req.canonical_entity_name}'",
        entities=[EntityInfo(**e) for e in updated],
    )
@router.get("/{doc_id}/suggestions", response_model=SuggestMergesResponse)
async def suggest_merges(
    doc_id: str,
    min_score: float = 0.80,
    top_k: int = 50,
    use_embeddings: bool = False,
    current_user: dict = Depends(get_current_user),
):
    """Return ranked merge-candidate pairs based on string/embedding similarity.

    Args:
        doc_id: Document whose entities are compared.
        min_score: Minimum similarity, must lie in ``[0.0, 1.0]``.
        top_k: Maximum number of suggestions requested; must be >= 1.
        use_embeddings: Forwarded to the entity service unchanged.

    Raises:
        HTTPException: 403 when the user cannot access the document; 422
            when ``min_score`` or ``top_k`` is out of range; 500 when the
            entity service fails (logged with traceback).
    """
    tenant_id = current_user["tenant_id"]
    user_id = current_user["user_id"]
    await _require_access(tenant_id, user_id, doc_id)

    if not (0.0 <= min_score <= 1.0):
        raise HTTPException(status_code=422, detail="min_score must be between 0.0 and 1.0")
    # Reject zero/negative limits up front, like min_score, instead of letting
    # them reach the service where their meaning is undefined.
    if top_k < 1:
        raise HTTPException(status_code=422, detail="top_k must be a positive integer")

    try:
        raw = await svc.suggest_merges(
            tenant_id=tenant_id, doc_id=doc_id, config_path=CONFIG_PATH,
            min_score=min_score, top_k=top_k, use_embeddings=use_embeddings,
        )
    except Exception as exc:
        log.exception(f"suggest_merges failed: {exc}")
        raise HTTPException(status_code=500, detail="Internal server error") from exc

    return SuggestMergesResponse(suggestions=[
        MergeSuggestion(
            entity_a=EntityRef(**s["entity_a"]),
            entity_b=EntityRef(**s["entity_b"]),
            score=s["score"],
            method=s["method"],
        )
        for s in raw
    ])
@router.post("/{tenant_id}/permissions", status_code=201)
async def grant_permission(
    tenant_id: str,
    req: PermissionGrantRequest,
    current_user=Depends(get_current_user),
):
    """Give ``req.user_id`` the role ``req.role`` on ``req.doc_id``.

    Admins may grant on any tenant. Everyone else must belong to
    ``tenant_id`` *and* hold the ``owner`` role on the target document;
    otherwise the request is rejected with 403.
    """
    is_admin = current_user["role"] == "admin"

    if not is_admin:
        if current_user["tenant_id"] != tenant_id:
            raise HTTPException(status_code=403, detail="Access denied")

        # Non-admins need an explicit 'owner' permission on this document.
        own_perm = await db.get_permission(
            MONGO_URI, MONGO_DB_PREFIX, tenant_id,
            current_user["user_id"], req.doc_id,
        )
        is_owner = bool(own_perm) and own_perm.get("role") == "owner"
        if not is_owner:
            raise HTTPException(
                status_code=403,
                detail="Only document owners or admins can grant permissions",
            )

    await db.grant_permission(
        MONGO_URI, MONGO_DB_PREFIX, tenant_id,
        req.user_id, req.doc_id, req.role,
    )
    return {"message": f"Permission granted: {req.user_id} → {req.doc_id} ({req.role})"}
@lru_cache(maxsize=4)
def _get_vlm(config_path: str):
    """Return the cached VLM for *config_path*, or ``None`` if none is configured.

    The result (including ``None``) is memoised per config path by
    ``lru_cache``, so at most one VLM wrapper is built per config file.
    """
    from Core.provider.vlm import VLM

    system_cfg = _get_system_config(config_path)
    # A config without a ``vlm`` attribute, or with a falsy one, means "no VLM".
    vlm_section = getattr(system_cfg, "vlm", None)
    return VLM(vlm_section) if vlm_section else None
def _get_gbc_index(tenant_id: str, doc_id: str, config_path: str):
    """Load or return the cached GBC index for a specific document.

    On a cache miss the index is loaded via ``GBC.load_gbc_index`` from
    ``INDEX_SAVE_DIR/<tenant_id>/<doc_id>`` and stored in the module cache
    under ``"<tenant_id>:<doc_id>"``.

    The system config returned by ``_get_system_config`` is a process-wide
    cached object. It is copied before the per-document fields are set:
    this function runs concurrently in the thread pool (one call per
    document for cross-document queries), and writing tenant/doc/path onto
    the shared instance would let one thread load another document's index
    and cache it under the wrong key.
    """
    import copy

    from Core.configs.falkordb_config import FalkorDBConfig
    from Core.Index.GBCIndex import GBC

    cache_key = f"{tenant_id}:{doc_id}"
    cached = _gbc_cache.get(cache_key)
    if cached is not None:
        log.debug(f"GBC cache hit: {cache_key}")
        return cached

    log.info(f"GBC cache miss: {cache_key} — loading from disk")
    # Shallow copy: only top-level attributes are reassigned below, so the
    # shared cached config is never written to.
    # NOTE(review): nested sections (llm, graph, ...) are still shared with
    # the cached config — confirm nothing downstream mutates them.
    cfg = copy.copy(_get_system_config(config_path))
    cfg.tenant_id = tenant_id
    cfg.doc_id = doc_id
    cfg.save_path = os.path.join(INDEX_SAVE_DIR, tenant_id, doc_id)

    fdb_host = os.getenv("BOOKRAG_FALKORDB_HOST", "")
    if fdb_host:
        cfg.falkordb = FalkorDBConfig(
            host=FALKORDB_HOST, port=FALKORDB_PORT,
            username=FALKORDB_USERNAME, password=FALKORDB_PASSWORD,
        )

    gbc_index = GBC.load_gbc_index(cfg)
    _gbc_cache.put(cache_key, gbc_index)
    return gbc_index
def _filter_relevant_history(
    query: str,
    messages: List[dict],
    recent_turns: int = _RECENT_TURNS,
    max_old: int = _MAX_OLD_MSGS,
    threshold: float = _JACCARD_THRESH,
) -> List[dict]:
    """Return the subset of prior messages relevant to *query*.

    Two-tier strategy:
    1. **Recency** — always keep the last ``recent_turns`` user+assistant pairs.
    2. **Relevance** — for older messages, keep those whose content has
       Jaccard token-overlap >= *threshold* with the query (capped at *max_old*,
       keeping the most recent matches).

    ``recent_turns <= 0`` keeps no messages by recency and ``max_old <= 0``
    keeps no older messages. Both limits can be set to 0 through the
    environment, and a plain ``seq[-0:]`` slice would return the *whole*
    sequence, so the split is computed with explicit indices.

    Returns:
        Relevant older messages followed by the recent ones, in their
        original chronological order.
    """
    if not messages:
        return []

    recent_count = max(recent_turns, 0) * 2  # 2 messages per turn (user + assistant)
    split_at = max(len(messages) - recent_count, 0)
    older, recent = messages[:split_at], messages[split_at:]

    if max_old > 0:
        relevant_older = [
            m for m in older
            # ``or ""`` also covers a stored ``content`` of None.
            if _jaccard_similarity(query, m.get("content") or "") >= threshold
        ][-max_old:]
    else:
        relevant_older = []

    return relevant_older + recent
async def handle_query(
    query: str,
    tenant_id: str,
    user_id: str,
    doc_ids: List[str],
    session_id: Optional[str],
    config_path: str,
    cross_doc: bool = False,
) -> dict:
    """Route query to appropriate retrieval mode and store in session.

    Memory strategy
    ---------------
    1. Load existing session messages from MongoDB (before appending the new one).
    2. Filter to relevant history using :func:`_filter_relevant_history`
       (two-tier: recency + Jaccard token-overlap).
    3. If non-empty history, rewrite *query* into a self-contained standalone
       question via the LLM (:func:`_rewrite_query_sync`).
    4. Pass the (possibly rewritten) query to the RAG pipeline.

    Retrieval
    ---------
    At most the first five ``doc_ids`` are queried. With ``cross_doc`` set,
    or more than one document, each document is queried in parallel in the
    thread pool and the per-document answers are concatenated with
    date/language annotations; otherwise the single document is queried.
    With no documents a fixed "no accessible documents" answer is returned.

    Returns:
        Dict with ``answer``, ``session_id``, ``doc_ids_used`` (the ids that
        were actually queried) and ``rewritten_query`` (``None`` when the
        query was not changed).
    """
    loop = asyncio.get_running_loop()

    # ── Session bootstrap ──────────────────────────────────────────────────────
    if not session_id:
        session_id = str(uuid.uuid4())
        await db.create_session(MONGO_URI, MONGO_DB_PREFIX, tenant_id, {
            "session_id": session_id,
            "user_id": user_id,
            "doc_ids": doc_ids,
            "messages": [],
        })
        prior_messages: List[dict] = []
    else:
        # Load history BEFORE appending the current user message
        # NOTE(review): the session's owner is not compared with user_id here —
        # confirm the caller verifies session ownership.
        session = await db.get_session(MONGO_URI, MONGO_DB_PREFIX, tenant_id, session_id)
        prior_messages = session.get("messages", []) if session else []

    # ── History filtering + query rewriting ───────────────────────────────────
    relevant_history = _filter_relevant_history(query, prior_messages)
    if relevant_history:
        log.debug(
            f"Session {session_id}: {len(relevant_history)} relevant history messages "
            f"out of {len(prior_messages)} total — rewriting query."
        )
        effective_query = await loop.run_in_executor(
            _executor, _rewrite_query_sync, query, relevant_history, config_path
        )
    else:
        effective_query = query

    # ── Persist user message (original, for readability) ──────────────────────
    await db.append_message(MONGO_URI, MONGO_DB_PREFIX, tenant_id, session_id,
                            {"role": "user", "content": query})

    # ── RAG retrieval ─────────────────────────────────────────────────────────
    # Single source of truth for which documents are touched (cap to avoid
    # GPU overload); reused for metadata, querying and the response.
    target_docs: List[str] = list(doc_ids[:5]) if doc_ids else []

    # ── Fetch per-doc metadata (dates + languages) — best-effort ──────────
    doc_dates: dict[str, str] = {}
    doc_langs: dict[str, str] = {}
    for did in target_docs:
        try:
            doc_record = await db.get_document(MONGO_URI, MONGO_DB_PREFIX, tenant_id, did)
        except Exception as exc:
            # Non-fatal: one failed lookup must not drop the other documents' metadata.
            log.debug(f"Metadata lookup failed for document {did}: {exc}")
            continue
        if not doc_record:
            continue
        ddate = doc_record.get("document_date") or doc_record.get("created_at")
        if ddate:
            doc_dates[did] = str(ddate)[:10]  # YYYY-MM-DD
        dlang = doc_record.get("document_lang")
        if dlang and dlang != "auto":
            doc_langs[did] = dlang

    if not target_docs:
        answer = "No accessible documents found for your query."
    elif cross_doc or len(doc_ids) > 1:
        # Parallel per-doc queries, answers synthesised into one response
        answers = await asyncio.gather(*[
            loop.run_in_executor(
                _executor, _query_single_doc_sync,
                effective_query, tenant_id, did, config_path,
                doc_langs.get(did, "en"),
            )
            for did in target_docs
        ])

        # Build answer with temporal + language context
        parts = []
        for did, ans in zip(target_docs, answers):
            date_str = f" (dated {doc_dates[did]})" if did in doc_dates else ""
            lang_str = f" [lang: {doc_langs[did]}]" if did in doc_langs else ""
            parts.append(f"[Document: {did}{date_str}{lang_str}]\n{ans}")

        # Prepend contextual notes when metadata is present
        notes = []
        if doc_dates:
            notes.append(
                "NOTE: The answers below come from multiple documents with different dates. "
                "When documents contain contradictory or overlapping information, "
                "prefer the information from the more recently dated document."
            )
        unique_langs = set(doc_langs.values())
        if len(unique_langs) > 1:
            notes.append(
                "NOTE: The answers below come from documents in different languages. "
                "Each answer is in its document's language; synthesise accordingly."
            )
        if notes:
            answer = "\n\n".join(notes) + "\n\n" + "\n\n---\n\n".join(parts)
        else:
            answer = "\n\n---\n\n".join(parts)
    else:
        doc_id = target_docs[0]
        lang = doc_langs.get(doc_id, "en")
        answer = await loop.run_in_executor(
            _executor, _query_single_doc_sync,
            effective_query, tenant_id, doc_id, config_path, lang,
        )

    # ── Persist assistant message ──────────────────────────────────────────────
    await db.append_message(MONGO_URI, MONGO_DB_PREFIX, tenant_id, session_id,
                            {"role": "assistant", "content": answer})

    return {
        "answer": answer,
        "session_id": session_id,
        "doc_ids_used": target_docs,
        "rewritten_query": effective_query if effective_query != query else None,
    }
def _load_graph_sync(tenant_id: str, doc_id: str, config_path: str):
    """Load the document's graph from its JSON files for in-memory editing.

    The graph is always read from disk (``falkordb_cfg=None`` at load time).
    When the ``BOOKRAG_FALKORDB_HOST`` environment variable is non-empty, a
    FalkorDB config is attached to the loaded graph afterwards.

    Returns ``(graph, save_path, falkordb_cfg | None)``.
    """
    from Core.configs.system_config import load_system_config
    from Core.configs.falkordb_config import FalkorDBConfig
    from Core.Index.Graph import Graph

    system_cfg = load_system_config(config_path)
    index_dir = os.path.join(INDEX_SAVE_DIR, tenant_id, doc_id)
    if system_cfg.graph.refine_type == "basic":
        graph_variant = "basic"
    else:
        graph_variant = None

    # JSON-only load so the full graph is available in memory.
    loaded = Graph.load_from_dir(
        load_dir=index_dir,
        variant=graph_variant,
        tenant_id=tenant_id,
        doc_id=doc_id,
        falkordb_cfg=None,
    )

    if not os.getenv("BOOKRAG_FALKORDB_HOST", ""):
        return loaded, index_dir, None

    # FalkorDB is configured: wire the connection details onto the graph.
    fdb_settings = FalkorDBConfig(
        host=FALKORDB_HOST,
        port=FALKORDB_PORT,
        username=FALKORDB_USERNAME,
        password=FALKORDB_PASSWORD,
    )
    loaded.falkordb_cfg = fdb_settings
    loaded.tenant_id = tenant_id
    loaded.doc_id = doc_id
    loaded.use_falkordb = True
    loaded._fdb_graph_name = fdb_settings.graph_name_for_doc(tenant_id, doc_id)
    return loaded, index_dir, fdb_settings
async def _schedule_vdb_rebuild(tenant_id: str, doc_id: str, config_path: str) -> None:
    """Await a VDB rebuild, deduplicating concurrent requests for the same doc.

    If a rebuild is already in flight for this ``tenant_id:doc_id`` the call
    returns immediately without rebuilding. Otherwise the rebuild runs in
    the shared thread pool and this coroutine waits for it to finish; the
    pending marker is cleared even when the rebuild raises.

    NOTE(review): a skipped call relies on the in-flight rebuild reading the
    graph after this caller's write; that ordering is not enforced here —
    confirm callers serialise edits per document.
    """
    key = f"{tenant_id}:{doc_id}"
    async with _rebuild_pending_lock:
        if key in _rebuild_pending:
            log.debug(f"VDB rebuild already pending for {key} — skipping")
            return
        _rebuild_pending.add(key)
    try:
        # Inside a coroutine the running loop is the one to use.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(_executor, _rebuild_vdb_sync, tenant_id, doc_id, config_path)
    finally:
        async with _rebuild_pending_lock:
            _rebuild_pending.discard(key)
doc_id, config_path + ) + + +# ── Rename entity ───────────────────────────────────────────────────────────── + +def _rename_sync( + tenant_id: str, doc_id: str, config_path: str, + entity_name: str, entity_type: str, + new_entity_name: str, new_entity_type: str, new_description: Optional[str], +) -> List[dict]: + from Core.Index.Graph import Entity + + graph, _, _ = _load_graph_sync(tenant_id, doc_id, config_path) + old_entity = graph.get_entity(entity_name, entity_type) + + effective_type = new_entity_type if new_entity_type else old_entity.entity_type + effective_desc = new_description if new_description is not None else old_entity.description + + new_entity = Entity( + entity_name=new_entity_name, + entity_type=effective_type, + description=effective_desc, + source_ids=old_entity.source_ids, + ) + graph.update_entity(entity_name, entity_type, new_entity) + graph.save_graph() + + new_node = graph.get_node_name_from_str(new_entity_name, effective_type) + return [{ + "entity_name": new_entity_name, + "entity_type": effective_type, + "description": effective_desc, + "source_ids": sorted(new_entity.source_ids), + "node_name": new_node, + }] + + +async def rename_entity( + tenant_id: str, doc_id: str, config_path: str, + entity_name: str, entity_type: str, + new_entity_name: str, new_entity_type: str, new_description: Optional[str], + user_id: str, +) -> List[dict]: + async with _get_lock(tenant_id, doc_id): + loop = asyncio.get_event_loop() + result = await loop.run_in_executor( + _executor, _rename_sync, + tenant_id, doc_id, config_path, + entity_name, entity_type, new_entity_name, new_entity_type, new_description, + ) + await db.log_entity_edit(MONGO_URI, MONGO_DB_PREFIX, tenant_id, { + "operation": "rename", "doc_id": doc_id, "user_id": user_id, + "before": {"entity_name": entity_name, "entity_type": entity_type}, + "after": {"entity_name": new_entity_name, "entity_type": new_entity_type or entity_type}, + }) + await _schedule_vdb_rebuild(tenant_id, doc_id, 
config_path) + return result + + +# ── Split entity ────────────────────────────────────────────────────────────── + +def _split_sync( + tenant_id: str, doc_id: str, config_path: str, + entity_name: str, entity_type: str, + new_entities: List[dict], + edge_mode: str, +) -> List[dict]: + from Core.Index.Graph import Entity + + graph, _, _ = _load_graph_sync(tenant_id, doc_id, config_path) + old_node = graph.get_node_name_from_str(entity_name, entity_type) + if old_node not in graph.kg: + raise KeyError(f"Entity '{old_node}' not found in graph.") + + old_entity = graph.get_entity_by_node_name(old_node) + old_neighbors = list(graph.kg.neighbors(old_node)) + old_edge_data = {n: graph.kg.get_edge_data(old_node, n) for n in old_neighbors} + + created: List[dict] = [] + for spec in new_entities: + spec_name = spec["entity_name"] + spec_type = spec["entity_type"] + spec_desc = spec.get("description") or old_entity.description + spec_sids = set(spec.get("source_ids") or old_entity.source_ids) + + new_node = graph.get_node_name_from_str(spec_name, spec_type) + graph.add_kg_node(Entity( + entity_name=spec_name, entity_type=spec_type, + description=spec_desc, source_ids=spec_sids, + )) + + if edge_mode == "duplicate": + for neighbor, edata in old_edge_data.items(): + if neighbor != old_node and not graph.kg.has_edge(new_node, neighbor): + graph.kg.add_edge(new_node, neighbor, **edata) + + for tree_id in spec_sids: + graph.tree2kg[tree_id].add(new_node) + + created.append({ + "entity_name": spec_name, "entity_type": spec_type, + "description": spec_desc, "source_ids": sorted(spec_sids), "node_name": new_node, + }) + + for _, nodes in graph.tree2kg.items(): + nodes.discard(old_node) + graph.kg.remove_node(old_node) + graph.save_graph() + return created + + +async def split_entity( + tenant_id: str, doc_id: str, config_path: str, + entity_name: str, entity_type: str, + new_entities: List[dict], edge_mode: str, + user_id: str, +) -> List[dict]: + async with _get_lock(tenant_id, 
doc_id): + loop = asyncio.get_event_loop() + result = await loop.run_in_executor( + _executor, _split_sync, + tenant_id, doc_id, config_path, + entity_name, entity_type, new_entities, edge_mode, + ) + await db.log_entity_edit(MONGO_URI, MONGO_DB_PREFIX, tenant_id, { + "operation": "split", "doc_id": doc_id, "user_id": user_id, + "before": {"entity_name": entity_name, "entity_type": entity_type}, + "after": [{"entity_name": e["entity_name"], "entity_type": e["entity_type"]} for e in result], + }) + await _schedule_vdb_rebuild(tenant_id, doc_id, config_path) + return result + + +# ── Suggest merge candidates ────────────────────────────────────────────────── + +def _suggest_merges_sync( + tenant_id: str, doc_id: str, config_path: str, + min_score: float, top_k: int, use_embeddings: bool, +) -> List[dict]: + graph, _, _ = _load_graph_sync(tenant_id, doc_id, config_path) + nodes = list(graph.get_all_nodes()) + entities = [] + for node in nodes: + ent = graph.get_entity_by_node_name(node) + entities.append({"entity_name": ent.entity_name, "entity_type": ent.entity_type, "node": node}) + + suggestions: List[dict] = [] + + # Pre-group entities by type so we only compare within same-type groups + # This reduces O(n²) to O(Σ nᵢ²) where nᵢ is entity count per type + by_type: Dict[str, List[dict]] = defaultdict(list) + for ent in entities: + by_type[ent["entity_type"]].append(ent) + + # String similarity — only within same-type groups + for _etype, group in by_type.items(): + n = len(group) + for i in range(n): + for j in range(i + 1, n): + a, b = group[i], group[j] + score = SequenceMatcher(None, a["entity_name"].lower(), b["entity_name"].lower()).ratio() + if score >= min_score: + suggestions.append({ + "entity_a": {"entity_name": a["entity_name"], "entity_type": a["entity_type"]}, + "entity_b": {"entity_name": b["entity_name"], "entity_type": b["entity_type"]}, + "score": round(score, 4), + "method": "string_similarity", + }) + + # Embedding similarity (optional) + if 
use_embeddings: + try: + from Core.configs.system_config import load_system_config + from Core.Index.GBCIndex import GBC + + cfg = load_system_config(config_path) + cfg.tenant_id = tenant_id + cfg.doc_id = doc_id + cfg.save_path = os.path.join(INDEX_SAVE_DIR, tenant_id, doc_id) + gbc = GBC.load_gbc_index(cfg) + + seen_pairs: set = set() + for ent in entities: + for hit in gbc.entity_vdb.search(ent["node"], top_k=5): + sim = 1.0 - hit["distance"] + if sim < min_score: + continue + meta = hit.get("metadata", {}) + b_name = meta.get("entity_name", "") + b_type = meta.get("entity_type", "") + if (b_name == ent["entity_name"] and b_type == ent["entity_type"]) or b_type != ent["entity_type"]: + continue + pair = tuple(sorted([(ent["entity_name"], ent["entity_type"]), (b_name, b_type)])) + if pair in seen_pairs: + continue + seen_pairs.add(pair) + suggestions.append({ + "entity_a": {"entity_name": ent["entity_name"], "entity_type": ent["entity_type"]}, + "entity_b": {"entity_name": b_name, "entity_type": b_type}, + "score": round(sim, 4), + "method": "embedding_similarity", + }) + except Exception as exc: + log.warning(f"Embedding suggestions failed: {exc}") + + suggestions.sort(key=lambda s: s["score"], reverse=True) + return suggestions[:top_k] + + +async def suggest_merges( + tenant_id: str, doc_id: str, config_path: str, + min_score: float = 0.80, + top_k: int = 50, + use_embeddings: bool = False, +) -> List[dict]: + loop = asyncio.get_event_loop() + return await loop.run_in_executor( + _executor, _suggest_merges_sync, + tenant_id, doc_id, config_path, min_score, top_k, use_embeddings, + ) +# ── Merge entities ──────────────────────────────────────────────────────────── + +def _merge_sync( + tenant_id: str, doc_id: str, config_path: str, + source_entities: List[dict], + canonical_name: str, canonical_type: str, canonical_desc: str, +) -> List[dict]: + from Core.Index.Graph import Entity + + graph, _, _ = _load_graph_sync(tenant_id, doc_id, config_path) + + # Collect 
all source_ids from entities being merged + merged_source_ids: set = set() + for src in source_entities: + try: + ent = graph.get_entity(src["entity_name"], src["entity_type"]) + merged_source_ids.update(ent.source_ids) + except KeyError: + log.warning(f"Merge: source entity not found: {src}") + + canonical_node = graph.get_node_name_from_str(canonical_name, canonical_type) + + # Ensure canonical node exists (may be one of the sources or brand new) + if canonical_node not in graph.kg: + canonical_entity = Entity( + entity_name=canonical_name, + entity_type=canonical_type, + description=canonical_desc, + source_ids=merged_source_ids, + ) + graph.add_kg_node(canonical_entity) + else: + # Update description and source_ids on the existing node + existing = graph.get_entity_by_node_name(canonical_node) + merged_source_ids.update(existing.source_ids) + updated = Entity( + entity_name=canonical_name, + entity_type=canonical_type, + description=canonical_desc or existing.description, + source_ids=merged_source_ids, + ) + graph.kg.nodes[canonical_node].update(updated.model_dump()) + + # Transfer edges from each source to canonical, then remove source + for src in source_entities: + src_node = graph.get_node_name_from_str(src["entity_name"], src["entity_type"]) + if src_node == canonical_node or src_node not in graph.kg: + continue + for neighbor in list(graph.kg.neighbors(src_node)): + if neighbor == canonical_node: + continue + edge_data = graph.kg.get_edge_data(src_node, neighbor) + if not graph.kg.has_edge(canonical_node, neighbor): + graph.kg.add_edge(canonical_node, neighbor, **edge_data) + # Update tree2kg + for tree_id, nodes in graph.tree2kg.items(): + if src_node in nodes: + nodes.discard(src_node) + nodes.add(canonical_node) + graph.kg.remove_node(src_node) + + # Persist source_ids on canonical node + graph.kg.nodes[canonical_node]["source_ids"] = list(merged_source_ids) + graph.save_graph() + + canonical_ent = graph.get_entity_by_node_name(canonical_node) + 
return [{ + "entity_name": canonical_ent.entity_name, + "entity_type": canonical_ent.entity_type, + "description": canonical_ent.description, + "source_ids": sorted(canonical_ent.source_ids), + "node_name": canonical_node, + }] + + +async def merge_entities( + tenant_id: str, doc_id: str, config_path: str, + source_entities: List[dict], + canonical_name: str, canonical_type: str, canonical_desc: str, + user_id: str, +) -> List[dict]: + async with _get_lock(tenant_id, doc_id): + loop = asyncio.get_event_loop() + result = await loop.run_in_executor( + _executor, _merge_sync, + tenant_id, doc_id, config_path, + source_entities, canonical_name, canonical_type, canonical_desc, + ) + await db.log_entity_edit(MONGO_URI, MONGO_DB_PREFIX, tenant_id, { + "operation": "merge", "doc_id": doc_id, "user_id": user_id, + "before": source_entities, + "after": {"entity_name": canonical_name, "entity_type": canonical_type}, + }) + await _schedule_vdb_rebuild(tenant_id, doc_id, config_path) + return result + diff --git a/api/services/entity_resolution.py b/api/services/entity_resolution.py new file mode 100644 index 0000000..37b5360 --- /dev/null +++ b/api/services/entity_resolution.py @@ -0,0 +1,116 @@ +""" +Phase 3: Cross-document entity resolution pipeline. + +After a document is indexed, this service: +1. Loads document-level graph entities. +2. Searches the tenant-global VDB for cosine-similar canonical candidates. +3. Reuses an existing tenant-global canonical when the similarity gate is met. +4. Persists unmatched entities plus ontology metadata into the tenant-global stores. +""" +import asyncio +import logging +import os + +from api.dependencies import THREAD_POOL + +log = logging.getLogger(__name__) +_executor = THREAD_POOL + + +def _resolve_entities_sync( + tenant_id: str, + doc_id: str, + config_path: str, +): + """ + Synchronous entity resolution — runs in a thread pool after indexing. + + Steps: + 1. Load per-doc GBC index to get all new entities. + 2. 
Open (or create) global ChromaDB VDB for the tenant. + 3. For each new entity: search global VDB, LLM-verify top match if score > threshold. + 4. If verified merge: MERGE in global FalkorDB graph + update global VDB. + 5. If no match: add as new canonical entity in global VDB + global graph. + """ + from Core.configs.system_config import load_system_config + from Core.Index.GBCIndex import GBC + from Core.provider.vdb import VectorStore + from Core.utils.entity_resolution_utils import ( + build_global_entity_metadata, + should_resolve_entity_globally, + ) + from api.dependencies import INDEX_SAVE_DIR + + cfg = load_system_config(config_path) + resolution_cfg = cfg.entity_resolution + if not resolution_cfg.enabled: + log.info("Entity resolution disabled by config; skipping Phase 3 sync.") + return + + cfg.tenant_id = tenant_id + cfg.doc_id = doc_id + cfg.save_path = os.path.join(INDEX_SAVE_DIR, tenant_id, doc_id) + + falkordb_cfg = None + if resolution_cfg.sync_to_global_graph and getattr(cfg.falkordb, "host", ""): + falkordb_cfg = cfg.falkordb + + gbc = GBC.load_gbc_index(cfg) + graph = gbc.GraphIndex + embedder = gbc.embedder + + # Open global VDB for tenant + global_vdb_path = os.path.join(resolution_cfg.global_vdb_dir, tenant_id, "global_vdb") + global_vdb = VectorStore( + db_path=global_vdb_path, + embedding_model=embedder, + collection_name=resolution_cfg.collection_name, + ) + + nodes = graph.get_all_nodes() + new_canonical_texts = [] + new_canonical_meta = [] + + for node_name in nodes: + entity = graph.get_entity_by_node_name(node_name) + if not should_resolve_entity_globally(entity, resolution_cfg): + continue + + # Search global VDB for similar entity + hits = global_vdb.search(node_name, top_k=resolution_cfg.top_k) + merged = False + if hits and hits[0]["distance"] < (1.0 - resolution_cfg.similarity_threshold): + # Cosine distance is 1 - similarity; low distance = high similarity + canonical_name = hits[0]["content"] + log.info( + f"Entity 
'{entity.entity_name}' similar to canonical '{canonical_name}' " + f"(dist={hits[0]['distance']:.3f}). Merging." + ) + merged = True + + if not merged: + new_canonical_texts.append(node_name) + new_canonical_meta.append( + build_global_entity_metadata(entity, tenant_id=tenant_id, doc_id=doc_id) + ) + + if new_canonical_texts: + global_vdb.add_texts(texts=new_canonical_texts, metadatas=new_canonical_meta) + log.info(f"Added {len(new_canonical_texts)} new canonical entities to global VDB for tenant '{tenant_id}'.") + + # Push doc graph to global FalkorDB graph (idempotent MERGE) + if falkordb_cfg: + graph.save_to_global_graph(falkordb_cfg, tenant_id) + log.info(f"Global FalkorDB graph updated for tenant '{tenant_id}', doc '{doc_id}'.") + + +async def run_entity_resolution(tenant_id: str, doc_id: str, config_path: str): + """Async entry point for entity resolution — call after indexing completes.""" + loop = asyncio.get_event_loop() + try: + await loop.run_in_executor( + _executor, _resolve_entities_sync, tenant_id, doc_id, config_path + ) + except Exception as e: + log.error(f"Entity resolution failed for doc '{doc_id}': {e}", exc_info=True) + diff --git a/api/services/indexing.py b/api/services/indexing.py new file mode 100644 index 0000000..5acf04c --- /dev/null +++ b/api/services/indexing.py @@ -0,0 +1,77 @@ +"""Background indexing service: PDF → GBC Index.""" +import asyncio +import logging +import os +import shutil + +from api.db import mongodb as db +from api.dependencies import MONGO_URI, MONGO_DB_PREFIX, INDEX_SAVE_DIR, THREAD_POOL + +log = logging.getLogger(__name__) +_executor = THREAD_POOL + + +def _build_index_sync( + pdf_path: str, save_path: str, tenant_id: str, doc_id: str, + config_path: str, document_date=None, document_lang=None, +): + """Synchronous index build — runs in a thread pool.""" + from Core.configs.system_config import load_system_config + from Core.configs.falkordb_config import FalkorDBConfig + from Core.construct_index import 
construct_gbc_index + + cfg = load_system_config(config_path) + cfg.pdf_path = pdf_path + cfg.save_path = save_path + cfg.tenant_id = tenant_id + cfg.doc_id = doc_id + # Propagate document_date into the config for temporal awareness + if document_date is not None: + cfg.document_date = document_date + # Propagate document_lang into the config for language-aware processing + if document_lang is not None: + cfg.document_lang = document_lang + # FalkorDB will be used if BOOKRAG_FALKORDB_HOST is set + fdb_host = os.getenv("BOOKRAG_FALKORDB_HOST", "") + if fdb_host: + from api.dependencies import FALKORDB_HOST, FALKORDB_PORT, FALKORDB_USERNAME, FALKORDB_PASSWORD + cfg.falkordb = FalkorDBConfig( + host=FALKORDB_HOST, port=FALKORDB_PORT, + username=FALKORDB_USERNAME, password=FALKORDB_PASSWORD, + ) + construct_gbc_index(cfg) + + +async def run_indexing( + tenant_id: str, + doc_id: str, + pdf_path: str, + config_path: str, + document_date=None, + document_lang=None, +): + """Async wrapper: update status in MongoDB before/after indexing.""" + save_path = os.path.join(INDEX_SAVE_DIR, tenant_id, doc_id) + os.makedirs(save_path, exist_ok=True) + + await db.update_document_status(MONGO_URI, MONGO_DB_PREFIX, tenant_id, doc_id, "indexing") + try: + loop = asyncio.get_event_loop() + await loop.run_in_executor( + _executor, + _build_index_sync, + pdf_path, save_path, tenant_id, doc_id, config_path, document_date, + document_lang, + ) + await db.update_document_status(MONGO_URI, MONGO_DB_PREFIX, tenant_id, doc_id, "ready") + log.info(f"Indexing complete for doc '{doc_id}' in tenant '{tenant_id}'") + # Phase 3: Run entity resolution to merge into global graph + try: + from api.services.entity_resolution import run_entity_resolution + await run_entity_resolution(tenant_id, doc_id, config_path) + except Exception as er_err: + log.warning(f"Entity resolution skipped (non-fatal): {er_err}") + except Exception as e: + log.error(f"Indexing failed for doc '{doc_id}': {e}", exc_info=True) + 
await db.update_document_status(MONGO_URI, MONGO_DB_PREFIX, tenant_id, doc_id, "error", str(e)) + diff --git a/config/bm25.yaml b/config/bm25.yaml index 445aa43..d67475f 100644 --- a/config/bm25.yaml +++ b/config/bm25.yaml @@ -1,24 +1,24 @@ -# configs/default.yaml +# config/bm25.yaml pdf_path: TODO save_path: TODO llm: - model_name: Qwen/Qwen3-8B-AWQ + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://10.26.1.21:8003/v1 + api_base: http://localhost:8003/v1 backend: openai - max_tokens: 5000 + max_tokens: 8000 temperature: 0.1 frequency_penalty: 0.0 presence_penalty: 0.0 - max_workers: 8 + max_workers: 4 vlm: - model_name: Qwen2-5-VL + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://localhost:8000/v1 + api_base: http://localhost:8003/v1 temperature: 0.1 max_tokens: 6000 backend: gpt @@ -43,7 +43,7 @@ graph: extractor_type: "llm" local_model_name: "en_core_web_sm" image_description_force: True - max_gleaning: 0 + max_gleaning: 1 refine_type: "advanced" embedding_config: model_name: Qwen3-Embedding-0.6B diff --git a/config/docling.yaml b/config/docling.yaml new file mode 100644 index 0000000..212169b --- /dev/null +++ b/config/docling.yaml @@ -0,0 +1,113 @@ +# config/docling.yaml +# +# BookRAG configuration that uses Docling as the document parser instead of +# MinerU. Suitable for scanned books and multi-format documents (PDF, DOCX, +# PPTX, HTML, images). MinerU settings are still required for the rest of +# the system; they are simply unused at parse time when parser=docling. +# +# Usage: +# python main.py -c config/docling.yaml -d Scripts/cfg/Qasper.yaml index --stage tree + +pdf_path: TODO +save_path: TODO + +# ── Parser selection ──────────────────────────────────────────────────────── +# Set to "docling" to use Docling. Change to "mineru" to fall back to MinerU. 
+parser: docling + +docling: + # OCR engine: "easyocr" (default) | "tesseract" | "rapidocr" + ocr_engine: easyocr + # Force OCR on every page even when selectable text is present. + # Set to true for scanned documents where text layer is absent or unreliable. + force_full_page_ocr: true + # Image render scale (1.0 ≈ 72 DPI). 2.0 gives crisper figure/table crops. + images_scale: 2.0 + # Language hint for the OCR engine (ISO 639-1). + lang: en + +# ── MinerU (kept for completeness; unused when parser=docling) ────────────── +mineru: + backend: pipeline + method: auto + lang: en + +# ── LLM (used for outline extraction, refinement, summaries) ──────────────── +llm: + model_name: TODO + api_key: TODO + api_base: TODO + backend: openai + max_tokens: 5000 + temperature: 0.1 + frequency_penalty: 0.0 + presence_penalty: 0.0 + max_workers: 8 + +# ── VLM (used for image/table node summaries when tree.node_summary=true) ─── +vlm: + model_name: TODO + api_key: TODO + api_base: TODO + temperature: 0.1 + max_tokens: 6000 + backend: ollama + +# ── Tree index ─────────────────────────────────────────────────────────────── +index: + chunk_size: 512 + overlap: 50 + +tree: + node_keywords: true + node_summary: true + +# ── Knowledge graph ────────────────────────────────────────────────────────── +graph: + extractor_type: "llm" + local_model_name: "en_core_web_sm" + image_description_force: true + max_gleaning: 0 + refine_type: "advanced" + embedding_config: + model_name: TODO + backend: openai + max_length: TODO + device: TODO + api_base: TODO + reranker_config: + model_name: TODO + max_length: 4096 + device: TODO + backend: vllm + api_base: TODO + +# ── Vector database ────────────────────────────────────────────────────────── +vdb: + mm_embedding: true + vdb_dir_name: "Tree_vdb" + collection_name: "TreeVDB" + embedding_config: + model_name: TODO + device: TODO + +rag_force_reprocess: true + +# ── RAG ────────────────────────────────────────────────────────────────────── +rag: + 
strategy: gbc + varient: standard + topk: 10 + sim_threshold: 0.3 + select_depth: 2 + max_retry: 2 + reranker_config: + model_name: TODO + max_length: TODO + device: TODO + backend: vllm + api_base: TODO + mm_reranker_config: + model_name: TODO + device: TODO + diff --git a/config/gbc.yaml b/config/gbc.yaml index 39cd4fd..12092fa 100644 --- a/config/gbc.yaml +++ b/config/gbc.yaml @@ -2,23 +2,24 @@ pdf_path: TODO save_path: TODO +parser: docling llm: - model_name: Qwen/Qwen3-8B-AWQ - api_key: openai - api_base: http://localhost:8003/v1 + model_name: qwen3.5 + api_key: env + api_base: https://starcore-llm.starcore.co.id/v1/gpt backend: openai - max_tokens: 5000 + max_tokens: 8000 temperature: 0.1 frequency_penalty: 0.0 presence_penalty: 0.0 - max_workers: 8 + max_workers: 4 vlm: - model_name: Qwen2-5-VL - api_key: openai - api_base: http://localhost:8000/v1 + model_name: /home/user/qwen-model/Qwen2-VL-2B-Instruct + api_key: env + api_base: https://starcore-llm.starcore.co.id/v1/vl temperature: 0.1 max_tokens: 6000 backend: gpt @@ -41,20 +42,22 @@ graph: extractor_type: "llm" local_model_name: "en_core_web_sm" image_description_force: True - max_gleaning: 0 + max_gleaning: 1 refine_type: "advanced" embedding_config: - model_name: Qwen3-Embedding-0.6B + model_name: text-embedding-v3 backend: openai + api_key: env max_length: 4096 - device: "cuda:2" - api_base: "http://localhost:8007/v1" + device: "cpu" + api_base: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" reranker_config: - model_name: Qwen3-Reranker-4B + model_name: /home/user/qwen-model/Qwen3-Reranker-4B max_length: 4096 - device: "cuda:2" - backend: vllm - api_base: "http://localhost:8011/v1" + device: "cpu" + backend: openai + api_base: "https://starcore-llm.starcore.co.id/v1/reranker/v1" + api_key: env vdb: @@ -65,6 +68,36 @@ vdb: model_name: Alibaba-NLP/gme-Qwen2-VL-2B-Instruct device: "cuda:1" +ontology: + enabled: false + mapping_threshold: 1.0 + allow_provisional_entities: true + 
use_query_resolution: true + entities: + - ontology_id: "product:bookrag" + canonical_name: "bookrag" + entity_type: "PRODUCT" + description: "The BookRAG application and retrieval pipeline." + aliases: + - "book rag" + - "book-rag" + +entity_resolution: + enabled: false + similarity_threshold: 0.85 + top_k: 1 + global_vdb_dir: "./indices" + collection_name: "global_kg_collection" + canonical_only: false + sync_to_global_graph: false + +falkordb: + host: "r-6jissuruar.instance-yc6en3ndn.hc-dx5io0svq.asia-south1.gcp.f2e0a955bb84.cloud" + port: 60480 + username: "falkordb" + password: "3YJ36xM3piI3" + graph_prefix: "bookrag" + rag_force_reprocess: True rag: @@ -75,11 +108,12 @@ rag: select_depth: 2 max_retry: 2 reranker_config: - model_name: Qwen3-Reranker-4B + model_name: /home/user/qwen-model/Qwen3-Reranker-4B max_length: 4096 - device: "cuda:7" - backend: vllm - api_base: "http://localhost:8011/v1" + device: "cpu" + backend: openai + api_base: "https://starcore-llm.starcore.co.id/v1/reranker/v1" + api_key: env mm_reranker_config: model_name: Alibaba-NLP/gme-Qwen2-VL-2B-Instruct device: "cuda:2" diff --git a/config/gbc_wo_er.yaml b/config/gbc_wo_er.yaml index a3261db..ec94a16 100644 --- a/config/gbc_wo_er.yaml +++ b/config/gbc_wo_er.yaml @@ -1,24 +1,24 @@ -# configs/default.yaml +# config/gbc_wo_er.yaml pdf_path: TODO save_path: TODO llm: - model_name: Qwen/Qwen3-8B-AWQ + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai api_base: http://localhost:8003/v1 backend: openai - max_tokens: 5000 + max_tokens: 8000 temperature: 0.1 frequency_penalty: 0.0 presence_penalty: 0.0 - max_workers: 8 + max_workers: 4 vlm: - model_name: Qwen2-5-VL + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://localhost:8000/v1 + api_base: http://localhost:8003/v1 temperature: 0.1 max_tokens: 6000 backend: gpt @@ -41,7 +41,7 @@ graph: extractor_type: "llm" local_model_name: "en_core_web_sm" image_description_force: True - max_gleaning: 0 + max_gleaning: 1 refine_type: 
"basic" embedding_config: model_name: Qwen3-Embedding-0.6B diff --git a/config/gbc_wo_graph.yaml b/config/gbc_wo_graph.yaml index a0f042c..49b325a 100644 --- a/config/gbc_wo_graph.yaml +++ b/config/gbc_wo_graph.yaml @@ -1,24 +1,24 @@ -# configs/default.yaml +# config/gbc_wo_graph.yaml pdf_path: TODO save_path: TODO llm: - model_name: Qwen/Qwen3-8B-AWQ + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai api_base: http://localhost:8003/v1 backend: openai - max_tokens: 5000 + max_tokens: 8000 temperature: 0.1 frequency_penalty: 0.0 presence_penalty: 0.0 - max_workers: 8 + max_workers: 4 vlm: - model_name: Qwen2-5-VL + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://localhost:8000/v1 + api_base: http://localhost:8003/v1 temperature: 0.1 max_tokens: 6000 backend: gpt @@ -41,7 +41,7 @@ graph: extractor_type: "llm" local_model_name: "en_core_web_sm" image_description_force: True - max_gleaning: 0 + max_gleaning: 1 refine_type: "advanced" embedding_config: model_name: Qwen3-Embedding-0.6B diff --git a/config/gbc_wo_plan.yaml b/config/gbc_wo_plan.yaml index aa41b64..dbec96c 100644 --- a/config/gbc_wo_plan.yaml +++ b/config/gbc_wo_plan.yaml @@ -1,24 +1,24 @@ -# configs/default.yaml +# config/gbc_wo_plan.yaml pdf_path: TODO save_path: TODO llm: - model_name: Qwen/Qwen3-8B-AWQ + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai api_base: http://localhost:8003/v1 backend: openai - max_tokens: 5000 + max_tokens: 8000 temperature: 0.1 frequency_penalty: 0.0 presence_penalty: 0.0 - max_workers: 8 + max_workers: 4 vlm: - model_name: Qwen2-5-VL + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://localhost:8000/v1 + api_base: http://localhost:8003/v1 temperature: 0.1 max_tokens: 6000 backend: gpt @@ -41,7 +41,7 @@ graph: extractor_type: "llm" local_model_name: "en_core_web_sm" image_description_force: True - max_gleaning: 0 + max_gleaning: 1 refine_type: "advanced" embedding_config: model_name: Qwen3-Embedding-0.6B diff --git 
a/config/gbc_wo_selector.yaml b/config/gbc_wo_selector.yaml index b6e1125..70b6e1c 100644 --- a/config/gbc_wo_selector.yaml +++ b/config/gbc_wo_selector.yaml @@ -1,24 +1,24 @@ -# configs/default.yaml +# config/gbc_wo_selector.yaml pdf_path: TODO save_path: TODO llm: - model_name: Qwen/Qwen3-8B-AWQ + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai api_base: http://localhost:8003/v1 backend: openai - max_tokens: 5000 + max_tokens: 8000 temperature: 0.1 frequency_penalty: 0.0 presence_penalty: 0.0 - max_workers: 8 + max_workers: 4 vlm: - model_name: Qwen2-5-VL + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://localhost:8000/v1 + api_base: http://localhost:8003/v1 temperature: 0.1 max_tokens: 6000 backend: gpt @@ -41,7 +41,7 @@ graph: extractor_type: "llm" local_model_name: "en_core_web_sm" image_description_force: True - max_gleaning: 0 + max_gleaning: 1 refine_type: "advanced" embedding_config: model_name: Qwen3-Embedding-0.6B diff --git a/config/gbc_wo_text.yaml b/config/gbc_wo_text.yaml index 3550c46..ca4c939 100644 --- a/config/gbc_wo_text.yaml +++ b/config/gbc_wo_text.yaml @@ -1,24 +1,24 @@ -# configs/default.yaml +# config/gbc_wo_text.yaml pdf_path: TODO save_path: TODO llm: - model_name: Qwen/Qwen3-8B-AWQ + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai api_base: http://localhost:8003/v1 backend: openai - max_tokens: 5000 + max_tokens: 8000 temperature: 0.1 frequency_penalty: 0.0 presence_penalty: 0.0 - max_workers: 8 + max_workers: 4 vlm: - model_name: Qwen2-5-VL + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://localhost:8000/v1 + api_base: http://localhost:8003/v1 temperature: 0.1 max_tokens: 6000 backend: gpt @@ -41,7 +41,7 @@ graph: extractor_type: "llm" local_model_name: "en_core_web_sm" image_description_force: True - max_gleaning: 0 + max_gleaning: 1 refine_type: "advanced" embedding_config: model_name: Qwen3-Embedding-0.6B diff --git a/config/graph.yaml b/config/graph.yaml index 
b659c61..cb3cd32 100644 --- a/config/graph.yaml +++ b/config/graph.yaml @@ -1,24 +1,24 @@ -# configs/default.yaml +# config/graph.yaml pdf_path: TODO save_path: TODO llm: - model_name: Qwen/Qwen3-8B-AWQ + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://10.26.1.21:8003/v1 + api_base: http://localhost:8003/v1 backend: openai - max_tokens: 5000 + max_tokens: 8000 temperature: 0.1 frequency_penalty: 0.0 presence_penalty: 0.0 - max_workers: 8 + max_workers: 4 vlm: - model_name: Qwen2-5-VL + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://localhost:8000/v1 + api_base: http://localhost:8003/v1 temperature: 0.1 max_tokens: 6000 backend: gpt @@ -41,7 +41,7 @@ graph: extractor_type: "llm" local_model_name: "en_core_web_sm" image_description_force: True - max_gleaning: 0 + max_gleaning: 1 refine_type: "advanced" embedding_config: model_name: Qwen3-Embedding-0.6B @@ -62,7 +62,7 @@ vdb: collection_name: "TreeVDB" embedding_config: model_name: Alibaba-NLP/gme-Qwen2-VL-2B-Instruct - device: "cuda:6" + device: "cuda:1" rag_force_reprocess: True diff --git a/config/mm.yaml b/config/mm.yaml index fab9c46..15aff2e 100644 --- a/config/mm.yaml +++ b/config/mm.yaml @@ -1,24 +1,24 @@ -# configs/default.yaml +# config/mm.yaml pdf_path: TODO save_path: TODO llm: - model_name: Qwen/Qwen3-8B-AWQ + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai api_base: http://localhost:8003/v1 backend: openai - max_tokens: 5000 + max_tokens: 8000 temperature: 0.1 frequency_penalty: 0.0 presence_penalty: 0.0 - max_workers: 8 + max_workers: 4 vlm: - model_name: Qwen2-5-VL + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://localhost:8000/v1 + api_base: http://localhost:8003/v1 temperature: 0.1 max_tokens: 6000 backend: gpt @@ -41,17 +41,20 @@ graph: extractor_type: "llm" local_model_name: "en_core_web_sm" image_description_force: True - max_gleaning: 0 + max_gleaning: 1 refine_type: "advanced" embedding_config: - model_name: 
Qwen/Qwen3-Embedding-0.6B - backend: local - max_length: 8192 - device: "cuda:7" + model_name: Qwen3-Embedding-0.6B + backend: openai + max_length: 4096 + device: "cuda:2" + api_base: "http://localhost:8007/v1" reranker_config: - model_name: Qwen/Qwen3-Reranker-4B - max_length: 8192 - device: "cuda:7" + model_name: Qwen3-Reranker-4B + max_length: 4096 + device: "cuda:2" + backend: vllm + api_base: "http://localhost:8011/v1" vdb: mm_embedding: True @@ -59,7 +62,7 @@ vdb: collection_name: "TreeVDB" embedding_config: model_name: Alibaba-NLP/gme-Qwen2-VL-2B-Instruct - device: "cuda:3" + device: "cuda:1" rag_force_reprocess: True rag: diff --git a/config/pdf_vanilla.yaml b/config/pdf_vanilla.yaml index 1d7edfb..98dfec8 100644 --- a/config/pdf_vanilla.yaml +++ b/config/pdf_vanilla.yaml @@ -1,24 +1,24 @@ -# configs/default.yaml +# config/pdf_vanilla.yaml pdf_path: TODO save_path: TODO llm: - model_name: Qwen/Qwen3-8B-AWQ + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai api_base: http://localhost:8003/v1 backend: openai - max_tokens: 5000 + max_tokens: 8000 temperature: 0.1 frequency_penalty: 0.0 presence_penalty: 0.0 - max_workers: 8 + max_workers: 4 vlm: - model_name: Qwen2-5-VL + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://localhost:8000/v1 + api_base: http://localhost:8003/v1 temperature: 0.1 max_tokens: 6000 backend: gpt @@ -43,7 +43,7 @@ graph: extractor_type: "llm" local_model_name: "en_core_web_sm" image_description_force: True - max_gleaning: 0 + max_gleaning: 1 refine_type: "advanced" embedding_config: model_name: Qwen3-Embedding-0.6B diff --git a/config/raptor.yaml b/config/raptor.yaml index c6730e4..e6158c1 100644 --- a/config/raptor.yaml +++ b/config/raptor.yaml @@ -1,24 +1,24 @@ -# configs/default.yaml +# config/raptor.yaml pdf_path: TODO save_path: TODO llm: - model_name: Qwen/Qwen3-8B-AWQ + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://10.26.1.21:8003/v1 + api_base: http://localhost:8003/v1 
backend: openai - max_tokens: 5000 + max_tokens: 8000 temperature: 0.1 frequency_penalty: 0.0 presence_penalty: 0.0 - max_workers: 8 + max_workers: 4 vlm: - model_name: Qwen2-5-VL + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://localhost:8000/v1 + api_base: http://localhost:8003/v1 temperature: 0.1 max_tokens: 6000 backend: gpt @@ -43,7 +43,7 @@ graph: extractor_type: "llm" local_model_name: "en_core_web_sm" image_description_force: True - max_gleaning: 0 + max_gleaning: 1 refine_type: "advanced" embedding_config: model_name: Qwen3-Embedding-0.6B diff --git a/config/tree.yaml b/config/tree.yaml index 5b14745..a719d31 100644 --- a/config/tree.yaml +++ b/config/tree.yaml @@ -1,24 +1,24 @@ -# configs/default.yaml +# config/tree.yaml pdf_path: TODO save_path: TODO llm: - model_name: Qwen/Qwen3-8B-AWQ + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://10.26.1.21:8003/v1 + api_base: http://localhost:8003/v1 backend: openai - max_tokens: 5000 + max_tokens: 8000 temperature: 0.1 frequency_penalty: 0.0 presence_penalty: 0.0 - max_workers: 8 + max_workers: 4 vlm: - model_name: Qwen2-5-VL + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://localhost:8000/v1 + api_base: http://localhost:8003/v1 temperature: 0.1 max_tokens: 6000 backend: gpt @@ -41,25 +41,28 @@ graph: extractor_type: "llm" local_model_name: "en_core_web_sm" image_description_force: True - max_gleaning: 0 + max_gleaning: 1 refine_type: "advanced" embedding_config: - model_name: Qwen/Qwen3-Embedding-0.6B - backend: local - max_length: 8192 - device: "cuda:5" + model_name: Qwen3-Embedding-0.6B + backend: openai + max_length: 4096 + device: "cuda:2" + api_base: "http://localhost:8007/v1" reranker_config: - model_name: Qwen/Qwen3-Reranker-4B - max_length: 8192 - device: "cuda:5" + model_name: Qwen3-Reranker-4B + max_length: 4096 + device: "cuda:2" + backend: vllm + api_base: "http://localhost:8011/v1" vdb: mm_embedding: True - vdb_path: 
"Tree_vdb" + vdb_dir_name: "Tree_vdb" collection_name: "TreeVDB" embedding_config: model_name: Alibaba-NLP/gme-Qwen2-VL-2B-Instruct - device: "cuda:4" + device: "cuda:1" rag_force_reprocess: True rag: diff --git a/config/vanilla.yaml b/config/vanilla.yaml index 6816e67..7a83a1d 100644 --- a/config/vanilla.yaml +++ b/config/vanilla.yaml @@ -1,24 +1,24 @@ -# configs/default.yaml +# config/vanilla.yaml pdf_path: TODO save_path: TODO llm: - model_name: Qwen/Qwen3-8B-AWQ + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://10.26.1.21:8003/v1 + api_base: http://localhost:8003/v1 backend: openai - max_tokens: 5000 + max_tokens: 8000 temperature: 0.1 frequency_penalty: 0.0 presence_penalty: 0.0 - max_workers: 8 + max_workers: 4 vlm: - model_name: Qwen2-5-VL + model_name: Qwen/Qwen3.5-35B-A3B-AWQ api_key: openai - api_base: http://localhost:8000/v1 + api_base: http://localhost:8003/v1 temperature: 0.1 max_tokens: 6000 backend: gpt @@ -43,7 +43,7 @@ graph: extractor_type: "llm" local_model_name: "en_core_web_sm" image_description_force: True - max_gleaning: 0 + max_gleaning: 1 refine_type: "advanced" embedding_config: model_name: Qwen3-Embedding-0.6B diff --git a/docs/bookrag-architecture-review.md b/docs/bookrag-architecture-review.md new file mode 100644 index 0000000..19ac848 --- /dev/null +++ b/docs/bookrag-architecture-review.md @@ -0,0 +1,883 @@ +# BookRAG Architecture Review and Comparison with Three-Layer Fixed Entity Architecture + +## 1. Purpose of This Document + +This document explains how the current BookRAG application is implemented, how its retrieval pipeline works end to end, and how it compares with the "Three-Layer Fixed Entity Architecture" (FEA) pattern for graph-based RAG. + +The goal is to answer three practical questions: + +1. What is BookRAG doing today? +2. How is it similar to and different from FEA? +3. Can FEA ideas improve BookRAG, and if so, where? + +## 2. 
Executive Summary + +BookRAG is not a pure GraphRAG system and it is not an ontology-first system. It is better described as a **hybrid hierarchical-document + knowledge-graph + vector retrieval architecture** with query planning and multimodal answer generation. + +Its strongest architectural characteristics are: + +- **Document structure first**: PDF content is converted into a hierarchical `DocumentTree`. +- **Graph derived from structure**: the knowledge graph is extracted from tree nodes, not from flat chunks alone. +- **Hybrid retrieval**: answers are grounded using tree structure, graph connectivity, and vector similarity together. +- **Query-aware orchestration**: the system classifies queries into simple, complex, and global modes. +- **Multimodal answering**: text, tables, and images can all contribute to the final answer. + +Compared with FEA, BookRAG already shares the idea that **entities bridge user questions and source evidence**. However, BookRAG does **not** currently implement a stable canonical ontology layer above extracted entities. That is the main conceptual gap. + +## 3. What the Three-Layer Fixed Entity Architecture Means + +The Three-Layer Fixed Entity Architecture is best understood as an **ontology-first graph RAG pattern** with three conceptual layers. + +It does not appear to be a mainstream standard framework in the academic GraphRAG literature. In practice, it is more useful to treat it as a **design pattern** for domain-governed graph retrieval. The three layers are: + +1. **Ontology / fixed entity layer** + Stable canonical entities, types, and controlled relationships. +2. **Document layer** + Source documents, passages, or chunks that contain evidence. +3. **Extracted entity / mention layer** + Mentions or extracted entities from documents, linked upward to canonical entities and downward to source evidence.
+ +The design goal is to reduce noisy entity duplication, improve explainability, and make graph retrieval more precise in domain-specific settings. + +## 4. High-Level BookRAG Architecture + +At a high level, BookRAG has two major operating modes: + +- **Offline indexing**: parse documents, build a structural tree, extract/refine a graph, and build vector resources. +- **Online RAG**: analyze the query, map it to graph/tree evidence, retrieve relevant substructures, and generate an answer. + +Conceptually, the pipeline is: + +`PDF -> parser -> refined PDF structure -> DocumentTree -> KG extraction/refinement -> Graph + tree-to-graph links -> vector indices -> query planning -> entity mapping -> section/subtree/subgraph retrieval -> reranking -> answer generation` + +## 5. Main Implementation Surfaces + +### 5.1 CLI entry point + +`main.py` is the main batch and single-document entry point. + +It supports: + +- `index` mode for offline construction +- `rag` mode for inference +- staged indexing via `--stage` +- dataset-driven multi-document processing +- split-based processing for parallel workers +- config snapshotting and run logging + +This makes the project operationally closer to a **pipeline system** than a demo-only chatbot. + +### 5.2 API entry point + +`api/main.py` exposes BookRAG as a FastAPI service with: + +- startup config validation +- structured JSON logging +- request ID propagation +- MongoDB lifecycle/index management +- optional FalkorDB health checking +- routers for auth, tenants, documents, chat, and entities + +This indicates the repository is designed for **multi-tenant application deployment**, not only offline experiments. + +## 6. Core Data Structures + +### 6.1 Document tree + +`Core/Index/Tree.py` defines the structural backbone: + +- `DocumentTree` +- `TreeNode` +- `NodeType` +- `MetaInfo` + +This tree stores document hierarchy such as titles, sections, text blocks, tables, and images. 
It also supports subtree extraction, ancestor navigation, depth-aware traversal, and retrieval-friendly node serialization. + +This is the first major difference from FEA: **BookRAG is tree-first, not ontology-first**. + +### 6.2 Knowledge graph + +`Core/Index/Graph.py` defines the graph layer: + +- `Entity` +- `Relationship` +- `Graph` + +The graph is backed by `networkx.Graph` in memory and can be saved or reconstructed through FalkorDB. A key implementation detail is the `tree2kg` mapping, which links structural document nodes to graph nodes. + +That mapping is crucial because BookRAG does not retrieve graph evidence in isolation. It uses the graph to lead back into the source document structure. + +### 6.3 Hybrid GBC index + +`Core/Index/GBCIndex.py` packages the main retrieval assets into a unified object: + +- tree index +- graph index +- entity vector database + +This confirms that the production architecture is **hybrid by design**, rather than a single-layer graph database solution. + +## 7. Offline Index Construction Flow + +The main offline orchestration entry is `Core/construct_index.py`, which coordinates tree building, KG construction, GBC index creation, and vector resource rebuilding. + +### 7.1 Tree construction + +`Core/pipelines/doc_tree_builder.py` builds the `DocumentTree`. + +Important behaviors include: + +- parser selection between **Docling** and **MinerU** +- cache-aware parsing +- PDF refinement +- outline extraction +- legal heading detection +- optional node summary generation + +This stage turns raw PDFs into a structured hierarchy that later retrieval can reason over. + +### 7.2 KG construction + +`Core/pipelines/kg_builder.py` extracts a graph from the tree. 
+ +Important characteristics: + +- extraction is tree-node aware +- title nodes are handled differently from non-title nodes +- graph refinement supports at least `basic` and `advanced` modes +- post-processing refines both entities and relations + +The key design principle is: **the graph is derived from the document tree rather than replacing it**. + +### 7.3 Vector resources + +After tree and graph construction, the system rebuilds vector resources, especially entity-oriented lookup resources used during query-to-graph mapping. + +This gives BookRAG three retrieval surfaces at runtime: + +1. document hierarchy +2. graph connectivity +3. semantic vector similarity + +## 8. Online Retrieval and Answer Generation Flow + +### 8.1 Inference orchestration + +`Core/inference.py` prepares dependencies, creates the configured RAG agent, runs the query dataset, stores per-query outputs, and records token cost. + +This operational layer is important because it shows BookRAG is structured as a repeatable evaluation pipeline, not only a live chat handler. + +### 8.2 Main RAG implementation + +The most representative runtime path is `Core/rag/gbc_rag.py`. + +There is also a simpler `Core/rag/graph_rag.py`, but it is not the best file for understanding the full BookRAG architecture. The `GBCRAG` path is the real reference implementation for the current system design. + +Its main logic combines: + +- `TaskPlanner` +- `Retriever` +- `AnswerAgent` + +This is significantly more advanced than a basic entity-hop graph retriever. + +### 8.3 Query understanding + +`Core/rag/gbc_plan.py` classifies queries into: + +- `simple` +- `complex` +- `global` + +Complex queries can be decomposed into sub-questions, while global queries can include filtering and aggregation logic. + +This is one of BookRAG's strongest differentiators. FEA usually describes graph organization, but BookRAG also includes **query planning as a first-class runtime component**. 
+ +### 8.4 Query entity mapping + +In `gbc_rag.py`, the system extracts or retrieves query entities, normalizes them, and maps them to graph nodes using a combination of LLM reasoning and vector retrieval. + +This is the part of the architecture that is closest in spirit to FEA: the query is translated into an entity-centered access path through the graph. + +### 8.5 Section and subtree retrieval + +BookRAG does not stop at graph hits. It maps graph nodes back to tree nodes, promotes them to relevant section ancestors, and may supplement them with LLM-based section selection. + +This is a major design choice: + +- graph nodes help identify relevant concepts +- tree structure recovers coherent document context +- subtree retrieval preserves hierarchical evidence around the hit + +This is stronger than flat chunk retrieval and also different from a pure ontology graph lookup. + +### 8.6 Graph and text reranking + +`Core/rag/gbc_retrieval.py` provides three major behaviors: + +- text reranking +- graph reranking +- skyline filtering + +The graph reranker uses query-entity similarity, graph enhancement, and personalized PageRank-style scoring. The skyline stage then combines graph and text signals instead of depending on only one scoring channel. + +An important nuance: comments referring to a "Three Layer Reranker" in retrieval code are about reranking signals, **not** the Three-Layer Fixed Entity Architecture. + +### 8.7 Answer generation + +`Core/rag/gbc_answer.py` handles final answer construction. + +It supports: + +- text evidence +- table-to-text conversion +- image-aware reasoning with VLM support +- chunked prompting under token budgets +- synthesis across partial answers +- map-reduce style answering for global questions + +This means BookRAG is not only a retriever architecture; it is also a **multimodal answer synthesis system**. + +## 9. Configuration and Storage Model + +`config/gbc.yaml` shows the default configuration shape. 
+ +Important implementation themes are: + +- `parser: docling` +- graph refine mode set to `advanced` +- separate LLM, VLM, embedding, and reranker configuration +- vector database configuration +- FalkorDB connectivity for graph persistence +- `rag.strategy: gbc` +- `topk`, `select_depth`, and retry controls for retrieval behavior + +The configuration confirms that BookRAG is intended to run as a configurable production-style system, not as a hard-coded prototype. + +## 10. Similarities Between BookRAG and FEA + +BookRAG and FEA are similar in several important ways. + +### 10.1 Entities are the bridge between questions and evidence + +In both designs, entities are central to retrieval. The user query is interpreted in terms of entities or concepts, and those entities connect the query to evidence. + +### 10.2 Evidence remains anchored to source documents + +Neither design treats the graph as sufficient by itself. Both depend on linking graph-level concepts back to grounded document evidence. + +### 10.3 Graph structure improves over naive vector-only retrieval + +Both architectures use graph relationships to improve retrieval quality, especially for multi-hop, concept-heavy, or relationship-sensitive questions. + +### 10.4 Normalization matters + +Both approaches implicitly depend on entity normalization. BookRAG does it through extraction/refinement/vector matching; FEA emphasizes doing it through a stable canonical layer. + +## 11. 
Differences Between BookRAG and FEA + +### 11.0 Compact comparison matrix + +| Dimension | BookRAG | Three-Layer FEA | +|---|---|---| +| Primary organizing principle | Document hierarchy | Canonical ontology/entities | +| Source grounding | Tree nodes and sections | Documents/chunks linked to entities | +| Graph identity | Extracted/refined entities | Fixed canonical entities + mentions | +| Runtime retrieval | Hybrid tree + graph + vector | Usually graph/entity-centered | +| Query planning | Strong | Usually out of scope | +| Multimodal answering | Present | Not a defining feature | +| Best strength | Long structured document reasoning | Precision and normalization in fixed domains | + +### 11.1 BookRAG is tree-first; FEA is ontology-first + +This is the most important difference. + +- **BookRAG** starts by building a structural document tree. +- **FEA** starts by defining a fixed conceptual entity layer. + +In BookRAG, graph meaning is downstream of document structure. In FEA, document meaning is organized around canonical domain entities. + +### 11.2 BookRAG does not yet have a fixed canonical ontology layer + +BookRAG extracts entities from documents and refines them, but it does not clearly enforce a stable ontology layer containing canonical entity definitions, allowed types, and controlled relation schemas. + +This can lead to: + +- duplicate entities across documents +- inconsistent typing +- weaker cross-document consolidation +- harder governance in domain-specific deployments + +### 11.3 BookRAG is operationally richer than FEA + +FEA is mainly a graph organization pattern. BookRAG includes several runtime capabilities that go beyond that: + +- query planning and decomposition +- section-depth selection +- subtree-induced subgraph retrieval +- skyline fusion of graph and text ranking +- multimodal answer generation +- batch CLI and multi-tenant API deployment + +So FEA is narrower and more schema-centric, while BookRAG is broader and more end-to-end. 
+ +### 11.4 BookRAG optimizes context coherence through sections + +FEA often focuses on entity-document grounding. BookRAG goes further by recovering section-level and subtree-level context from a document hierarchy. + +For long complex PDFs, this is a major advantage. + +## 12. Can FEA Improve BookRAG? + +Yes, but not as a replacement. The best use of FEA is to **strengthen BookRAG's graph and normalization layer** while preserving BookRAG's tree-first retrieval strengths. + +The right framing is: + +- **keep** BookRAG's document tree, planner, reranker, and answer synthesis +- **add** a canonical ontology/fixed-entity layer above the current extracted graph + +## 13. Recommended Improvement Areas + +### 13.1 Add a canonical entity layer above extracted entities + +Introduce a persistent canonical layer with: + +- controlled entity types +- canonical IDs +- synonym/alias tables +- relation constraints +- optional domain taxonomy + +Then map extracted entities to canonical nodes rather than treating extracted surface forms as the final graph identity. + +This would be the closest direct adoption of FEA. + +### 13.2 Separate mention nodes from canonical entity nodes + +A stronger graph pattern would be: + +- **canonical entity nodes** +- **document/tree section nodes** +- **mention or extracted instance nodes** + +This would make BookRAG more explainable and better at traceability, especially when multiple documents mention the same concept differently. + +### 13.3 Use ontology guidance during KG refinement + +The current refinement process could be improved by validating extracted entities and relationships against canonical schema rules. + +This could reduce: + +- noisy relation creation +- inconsistent labels +- entity fragmentation + +### 13.4 Improve cross-document and global retrieval + +`Graph.py` already hints at global graph support through methods for global graph save/load behavior. 
FEA-style canonical entities would make those global capabilities much stronger. + +This is especially valuable for: + +- multi-document reasoning +- tenant-wide knowledge consolidation +- repeated entities across books or regulations + +### 13.5 Keep the tree as the primary evidence surface + +Even after adopting FEA ideas, BookRAG should continue using the document tree as the main evidence recovery structure. + +This is important because the tree preserves: + +- section boundaries +- local context +- layout-aware structure +- multimodal content placement + +Replacing the tree with a graph-only ontology design would weaken BookRAG's long-document strengths. + +## 14. Recommended Target Architecture + +The best future architecture is a **four-part hybrid**: + +1. **Canonical ontology/entity layer** for stable domain identity +2. **Mention/extracted entity layer** for document-grounded extractions +3. **DocumentTree layer** for structural evidence recovery +4. **Hybrid runtime retriever** for graph + tree + vector + planner orchestration + +In other words, BookRAG should not become "FEA instead of GBC". It should become **GBC with an ontology-governed canonical entity layer**. + +## 15. Final Assessment + +### 15.1 What BookRAG already does well + +- Strong handling of complex document structure +- Hybrid retrieval rather than graph-only retrieval +- Query planning for different question types +- Section-aware evidence recovery +- Multimodal answer generation +- Practical CLI and API deployment surfaces + +### 15.2 What FEA contributes + +- canonical identity +- ontology governance +- cleaner cross-document consolidation +- stronger explainability for entity normalization +- better domain precision in regulated or specialized corpora + +### 15.3 Overall conclusion + +BookRAG is already architecturally more comprehensive than the Three-Layer Fixed Entity Architecture as a full application system. 
However, FEA offers an important improvement in one specific area: **a fixed canonical ontology/entity layer that stabilizes the graph**. + +Therefore, the recommendation is: + +- **Do not replace the current BookRAG design with FEA.** +- **Adopt FEA principles to improve entity canonicalization, ontology control, and global graph consistency.** + +That would preserve BookRAG's current strengths while addressing one of the clearest structural opportunities for improvement. + +## 16. Ontology-Driven Extension Proposal + +The user's suggestion is directionally correct: **allow users to define ontology entities with descriptions, then use those entities as the base canonical layer during indexing**. + +As an architectural recommendation, this should be implemented as a **guided open-world design**, not a closed-world design. + +That means: + +- user-defined ontology entities become the preferred canonical targets +- extracted document entities are mapped onto that ontology when confidence is high +- unmatched entities are still preserved as provisional or local entities +- the ontology can be expanded over time from repeated provisional entities + +This is the safest way to improve precision without harming recall. + +### 16.1 Why this is valuable for BookRAG + +This proposal fits BookRAG especially well because the current system already has: + +- strong document grounding through `DocumentTree` +- a KG extraction and refinement pipeline +- entity vector lookup during retrieval +- multi-document and global-graph ambitions in `Graph.py` + +What it lacks most clearly is a **stable canonical identity layer** shared across document-specific mentions. + +### 16.2 Recommended design principle + +The right principle is: + +1. **Ontology entity** = stable canonical concept defined by the user +2. **Mention entity** = what the extractor found in a specific document node +3.
**Evidence node** = the `DocumentTree` node that grounds the mention + +The ontology should be the **base layer**, but not the **only allowed layer**. + +### 16.2.1 Mermaid architecture sketch + +```mermaid +flowchart LR + O[User ontology\ncanonical entities] --> A[Ontology alignment] + D[DocumentTree nodes] --> E[KG extraction] + E --> A + A --> K[Canonical and provisional KG entities] + K --> V[Entity VDB] + Q[User query] --> R[Query entity extraction] + R --> F[Ontology-first resolution] + F --> K + K --> T[Linked tree evidence] + T --> S[BookRAG answer synthesis] +``` + +### 16.3 Why entity descriptions matter + +Descriptions are not optional metadata; they are part of the matching signal. + +They help disambiguate cases such as: + +- the same alias referring to different concepts +- domain-specific abbreviations +- entities with similar names but different roles + +For this reason, user-provided ontology descriptions should be used in: + +- indexing-time entity alignment +- query-time entity resolution +- ambiguity handling when multiple ontology entities are plausible + +## 17. Concrete Schema for User-Defined Ontology + +The schema should be tenant-scoped and explicitly designed for canonicalization. 
+ +### 17.1 Core ontology entity schema + +| Field | Required | Type | Purpose | +|---|---|---|---| +| `ontology_id` | Yes | `str` | Stable canonical identifier used across documents and queries | +| `canonical_name` | Yes | `str` | Preferred human-readable entity name | +| `entity_type` | Yes | `str` | Controlled type such as `PERSON`, `ORG`, `LAW`, `PRODUCT`, `CLAUSE` | +| `description` | Yes | `str` | Meaning/definition used for disambiguation and retrieval | +| `aliases` | Yes | `list[str]` | Synonyms, abbreviations, alternate spellings | +| `keywords` | No | `list[str]` | Supporting lexical terms for retrieval and matching | +| `parent_ids` | No | `list[str]` | Optional taxonomy or hierarchical ontology support | +| `allowed_relation_types` | No | `list[str]` | Optional whitelist of relations this entity can participate in | +| `examples` | No | `list[str]` | Example mentions or phrases found in documents | +| `status` | No | `str` | `active`, `draft`, `deprecated` | +| `tenant_id` | Yes | `str` | Scope for multi-tenant isolation | +| `metadata` | No | `dict` | Domain-specific extensions | + +### 17.2 Relation rule schema + +If the ontology is meant to guide relationship refinement, add a relation policy table. + +| Field | Required | Type | Purpose | +|---|---|---|---| +| `relation_type` | Yes | `str` | Canonical relation label | +| `src_entity_type` | Yes | `str` | Allowed source type | +| `tgt_entity_type` | Yes | `str` | Allowed target type | +| `description` | No | `str` | Semantic definition of the relation | +| `directional` | No | `bool` | Whether direction matters | +| `status` | No | `str` | `active`, `draft`, `deprecated` | + +This is useful for constraining noisy KG relations extracted from documents. + +### 17.3 Mention-to-canonical mapping record + +The ontology entity itself is not enough. The system should also persist mapping decisions. 
+ +| Field | Required | Type | Purpose | +|---|---|---|---| +| `mention_text` | Yes | `str` | Surface form found in the document | +| `mention_type` | Yes | `str` | Extracted type before canonicalization | +| `source_tree_node_id` | Yes | `int` | Tree node where the mention appeared | +| `canonical_ontology_id` | No | `str` | Resolved ontology target if matched | +| `match_method` | Yes | `str` | `alias`, `embedding`, `llm`, `rule`, `manual` | +| `match_confidence` | Yes | `float` | Confidence score for the mapping | +| `mapping_status` | Yes | `str` | `matched`, `ambiguous`, `provisional`, `rejected` | + +This record is important for explainability, debugging, and ontology curation. + +### 17.4 Minimum viable ontology payload + +At minimum, every user-provided ontology entity should include: + +- `ontology_id` +- `canonical_name` +- `entity_type` +- `description` +- `aliases` + +That is the smallest schema that is still strong enough to improve indexing quality. + +## 18. Codebase-Specific Integration Plan + +This section ties the ontology proposal to the current implementation in `Core/pipelines/kg_builder.py`, `Core/Index/Graph.py`, and `Core/rag/gbc_rag.py`. + +### 18.1 Integration into `Core/pipelines/kg_builder.py` + +Current flow in `build_knowledge_graph(tree, cfg)` is: + +1. create `Graph` +2. create `KGExtractor` +3. create `KGRefiner` +4. batch extract entities/relations from tree nodes +5. run `basic_kg_refiner(...)` or `advanced_kg_refiner(...)` +6. run `refine_entities()` and `refine_relation()` + +The ontology-aware version should become: + +1. load tenant ontology and relation rules +2. batch extract raw entities/relations from tree nodes +3. align extracted entities to ontology +4. send canonicalized entities into KG refinement +5. persist canonical, mention, and provenance links +6. refine unresolved/provisional entities separately + +#### Recommended insertion point + +The best insertion point is **between extraction and final refinement**. 
+ +Conceptually: + +- `batch_extract_titles(...)` / `batch_extract_kg(...)` +- ontology alignment step +- `advanced_kg_refiner(...)` or `basic_kg_refiner(...)` +- `refine_entities()` / `refine_relation()` + +#### Practical behavior + +For each extracted entity, the builder should: + +1. normalize the extracted name +2. attempt exact alias match against ontology +3. attempt embedding match against ontology name + description + aliases +4. use LLM disambiguation only when multiple ontology candidates remain plausible +5. mark the result as: + - matched to canonical ontology entity, + - ambiguous, + - or provisional + +#### Minimal-change implementation path + +If minimal disruption is preferred, `kg_builder.py` can first adopt a lightweight approach: + +- replace matched extracted entity names/types with canonical ontology values before passing them into the current refiner +- keep unmatched entities in the existing extracted-entity flow +- attach mapping metadata for later use + +This gives immediate value without requiring a full graph schema redesign in the first phase. + +#### Better long-term implementation path + +The stronger design is to create explicit: + +- canonical ontology nodes +- mention nodes +- mention-to-canonical links + +That better matches the architecture implied by the user proposal. + +### 18.2 Integration into `Core/Index/Graph.py` + +`Graph.py` is the most important structural change point because the current `Entity` model only stores: + +- `entity_name` +- `entity_type` +- `description` +- `source_ids` + +That is not enough to model canonical ontology entities and document-level mentions separately. + +#### Recommended graph model extension + +The graph layer should distinguish at least three node roles: + +1. `canonical` +2. `mention` +3. 
`provisional` + +The simplest extension is to add metadata fields to `Entity`, for example: + +- `entity_id` +- `entity_role` +- `canonical_id` +- `aliases` +- `mapping_confidence` +- `ontology_source` +- `status` + +An even better design is to add dedicated models such as: + +- `OntologyEntity` +- `MentionEntity` +- `OntologyRelationRule` + +but the metadata-extension approach is the least disruptive to the current code. + +#### Node identity recommendation + +Today, node identity is derived from: + +- `Name: {entity_name}` +- `Type: {entity_type}` + +That is too weak if canonical nodes and mention nodes may share the same name and type. + +The safer design is to move toward explicit IDs such as: + +- canonical node name based on `ontology_id` +- mention node name based on `doc_id + tree_node_id + mention_text + entity_type` + +This prevents collisions and makes provenance cleaner. + +#### New helper methods to add + +The graph layer would benefit from helpers such as: + +- `add_canonical_entity(...)` +- `add_mention_entity(...)` +- `link_mention_to_canonical(...)` +- `get_mentions_for_canonical(...)` +- `resolve_to_canonical(...)` + +#### `tree2kg` recommendation + +Currently `tree2kg` maps tree nodes directly to graph nodes. + +With ontology support, the preferred behavior is: + +- tree nodes link first to mention nodes +- mention nodes link to canonical nodes + +This preserves exact document provenance while still allowing canonical retrieval. + +#### Global graph recommendation + +`save_to_global_graph()` already contains the right intuition in its docstring, but the current implementation still creates a self-referential `HAS_MENTION` edge. + +With ontology support, global graph persistence should instead store: + +- one canonical node per ontology entity +- document-scoped mention nodes or mention records +- explicit `HAS_MENTION` or `MENTIONED_IN` links across them + +This would make tenant-wide graph consolidation much more meaningful.
+ +#### Phase 2 explicit mention-to-canonical node model + +Once Phase 1 normalization is stable, BookRAG should move from metadata-only canonicalization to explicit graph structure. + +Recommended node types: + +- `CanonicalEntity` +- `MentionEntity` +- `EvidenceNode` (existing `DocumentTree` node reference) + +Recommended identifiers: + +- canonical node ID = `ontology_id` +- mention node ID = `doc_id + tree_node_id + mention_text + entity_type` + +Recommended edges: + +- `MENTION_OF`: `MentionEntity -> CanonicalEntity` +- `MENTIONED_IN`: `MentionEntity -> EvidenceNode` +- `CO_OCCURS_WITH` or extracted semantic relations between mention nodes + +Recommended retrieval behavior: + +1. resolve query entity to canonical node +2. expand canonical node to mention nodes +3. collect grounded `DocumentTree` evidence from mention nodes +4. optionally project mention-level relations back into a canonical summary view + +Migration path: + +- keep current canonical metadata fields for backward compatibility +- add mention nodes during indexing without removing current canonical nodes +- update retrieval to expand canonical nodes through `MENTION_OF` +- later make global graph persistence canonical-first and mention-aware + +### 18.3 Integration into `Core/rag/gbc_rag.py` + +`gbc_rag.py` is the main query-time resolution path. + +At present: + +- `_get_query_entity(...)` retrieves candidate entities from the entity VDB and LLM extraction +- `_entity_map(...)` maps extracted query entities to current graph node names +- `link_tree_node(...)` maps graph nodes back to tree nodes +- `get_GBC_info(...)` retrieves subtree/subgraph context from those starting points + +#### Recommended query-time resolution flow + +The ontology-aware version should become: + +1. extract candidate entities from the query +2. resolve them against the ontology layer first +3. convert them into canonical node IDs +4. expand canonical nodes to mentions and linked tree nodes +5. 
run the existing BookRAG subtree/subgraph retrieval flow + +This means the ontology layer should become the **first resolver**, not the final retriever. + +#### Changes to `_get_query_entity(...)` + +This method should first query an ontology-aware index built from: + +- canonical names +- aliases +- descriptions + +Only if ontology resolution fails should it fall back to the current entity VDB logic. + +This preserves BookRAG's current flexibility while improving precision when ontology coverage exists. + +#### Changes to `_entity_map(...)` + +Instead of mapping only to extracted graph nodes, `_entity_map(...)` should prefer canonical node targets. + +That means the returned mapping should ideally point to: + +- canonical ontology node names first +- provisional/extracted nodes only as fallback + +#### Changes to `link_tree_node(...)` + +If canonical nodes are introduced, `link_tree_node(...)` should expand canonical nodes through mention links before counting supporting tree nodes. + +Conceptually: + +- canonical node -> mention nodes -> `tree2kg` / source tree nodes + +This keeps the evidence path explainable. + +#### Changes to `get_GBC_info(...)` + +The downstream retrieval logic can stay mostly the same. + +Once canonical start entities have been expanded into tree nodes and subgraphs, BookRAG's existing strengths remain valid: + +- section selection +- subtree retrieval +- skyline filtering +- multimodal answer synthesis + +This is why ontology should be added as a **resolution layer**, not as a replacement for the current runtime design. 
+ +### 18.4 Supporting configuration changes + +Although the main integration points are the three files above, the design will work better if configuration eventually adds an ontology section such as: + +- `ontology.enabled` +- `ontology.store_type` +- `ontology.path` or collection name +- `ontology.mapping_threshold` +- `ontology.allow_provisional_entities` +- `ontology.use_llm_disambiguation` + +This would keep ontology behavior explicit and tenant-configurable. + +## 19. Recommended Rollout Strategy + +To reduce risk, the best rollout is phased. + +### Phase 1: Canonical normalization only + +- load ontology during indexing +- map extracted entities to canonical names/types where confidence is high +- keep current graph structure mostly unchanged + +### Phase 2: Explicit mention and canonical nodes + +- extend `Graph.py` +- preserve mention provenance separately from canonical identity +- improve global graph persistence + +### Phase 3: Ontology-first query resolution + +- resolve user query entities against ontology before standard entity VDB +- expand canonical entities into mention and tree evidence +- surface canonical reasoning in retrieval traces + +### Phase 4: Ontology-governed relation refinement + +- validate extracted relations against ontology rules +- reject or downgrade invalid relation candidates +- improve cross-document graph consistency + +## 20. 
Final Recommendation on the User Proposal + +As an expert recommendation, the user proposal should be adopted in this refined form: + +- allow users to define ontology entities with descriptions +- use those entities as the canonical base layer during indexing +- keep extraction open enough to preserve new or unmatched entities +- resolve both indexed entities and query entities against the ontology first when possible + +In short: + +- **yes** to user-defined ontology entities as the base layer +- **yes** to using descriptions as part of matching and retrieval +- **no** to a fully closed ontology-only extraction model + +That design is the best balance between BookRAG's existing tree-first strengths and the canonical control offered by FEA-style ontology architecture. \ No newline at end of file diff --git a/docs/ontology-usage-guide.md b/docs/ontology-usage-guide.md new file mode 100644 index 0000000..45a06cf --- /dev/null +++ b/docs/ontology-usage-guide.md @@ -0,0 +1,49 @@ +## Ontology usage guide + +Use ontology entities when you want BookRAG to normalize extracted mentions onto stable canonical entities during indexing and retrieval. + +### Inline ontology config + +Set `ontology.enabled: true` and define entities directly in the config file. Each entity supports the following fields: + +- `ontology_id`: stable canonical identifier +- `canonical_name`: normalized entity name stored in the graph +- `entity_type`: type used during matching +- `aliases`: alternate spellings and surface forms +- `keywords`: optional matching hints for domain terminology + +Ontology-level options: + +- `mapping_threshold`: stricter values reduce false matches +- `allow_provisional_entities`: keep unmatched entities in the graph when `true` +- `use_query_resolution`: resolve query mentions to ontology-backed graph nodes first + +### File-backed ontology config + +You can also point `ontology.path` at a YAML or JSON file. Relative paths are resolved from the config file location.
Inline entities and file-backed entities are merged by `ontology_id`. + +### Phase 3 tenant/global resolution + +Phase 3 is now controlled by `entity_resolution` config: + +- `enabled`: turns tenant/global canonical resolution on or off +- `similarity_threshold`: vector similarity gate for reuse of an existing tenant-global entity +- `top_k`: number of nearest global candidates to inspect +- `global_vdb_dir`: directory for the tenant-global ChromaDB store +- `collection_name`: ChromaDB collection name for global entities +- `canonical_only`: only export ontology-backed/canonical entities when `true` +- `sync_to_global_graph`: also sync to tenant-global FalkorDB when `true` + +Relative `global_vdb_dir` values are resolved from the config file location. + +### Recommended starting config + +- keep `ontology.enabled: true` +- start with `mapping_threshold: 1.0` +- keep `allow_provisional_entities: true` +- keep `entity_resolution.enabled: false` until you want cross-document tenant normalization +- once enabled, start with `canonical_only: true` if your ontology is strong and curated + +### Current limitation + +Phase 3 currently uses vector similarity plus canonical metadata persistence. It is intentionally conservative and does not yet implement a full mention-node merge model in the tenant-global graph. \ No newline at end of file diff --git a/docs/research-behavior-detection.md b/docs/research-behavior-detection.md new file mode 100644 index 0000000..8dad528 --- /dev/null +++ b/docs/research-behavior-detection.md @@ -0,0 +1,2309 @@ +# Video-Based Behavior Detection System — Research Documentation + +## Research Overview + +**Project:** Real-Time Surveillance Behavior Analysis System +**Target Behaviors:** Fighting (*berkelahi*), Stealing (*mencuri*), Mass Brawls (*tawuran*) +**Date:** March 2026 +**Hardware Target:** NVIDIA RTX 3060 (12 GB VRAM) + +--- + +## 1. 
Problem Statement + +Traditional CCTV surveillance relies on human operators monitoring multiple feeds — an approach that is error-prone, expensive, and unable to scale. This research evaluates state-of-the-art real-time object detection and pose estimation models for automated detection of three critical behaviors: + +| Behavior | Indonesian Term | Detection Challenge | Typical Duration | +|----------|----------------|---------------------|------------------| +| Fighting | *Berkelahi* | Aggressive body movements between 2+ persons | 5–30 seconds | +| Stealing | *Mencuri* | Subtle hand–object interactions, concealment | 10–60 seconds | +| Mass Brawls | *Tawuran* | Dense crowds, many overlapping persons, weapons | 1–10 minutes | + +### Requirements + +- **Real-time processing:** ≥5 FPS per camera stream (acceptable for surveillance) +- **Deterministic latency:** Consistent inference time regardless of crowd density +- **Single-GPU deployment:** All models must fit within 12 GB VRAM (RTX 3060) +- **Multi-camera support:** Target 1–2 simultaneous camera streams + +--- + +## 2. Models Evaluated + +### 2.1 YOLOv26 (Ultralytics — September 2025) + +YOLOv26 is a CNN-based, NMS-free, end-to-end real-time object detector developed by Ultralytics. It represents the latest evolution of the YOLO (You Only Look Once) family. 
+ +#### Key Architectural Innovations + +| Innovation | Description | Impact | +|-----------|-------------|--------| +| **NMS-Free End-to-End** | Removes Non-Maximum Suppression post-processing entirely | Constant latency regardless of object count — critical for *tawuran* with dozens of people | +| **DFL Removal** | Eliminates Distribution Focal Loss (Softmax-heavy coordinate prediction) | 43% faster CPU inference; cleaner INT8/FP16 quantization for edge deployment | +| **MuSGD Optimizer** | Hybrid SGD + Muon optimizer (adapted from Moonshot AI's Kimi K2 LLM) | Faster training convergence, fewer epochs for fine-tuning | +| **STAL** | Small-Target-Aware Label Assignment with dynamic IoU thresholds | Better detection of small/distant persons in wide-angle CCTV | +| **RLE Pose** | Residual Log-Likelihood Estimation for keypoint localization | High-precision pose estimation under occlusion | + +#### Model Variants + +| Variant | Purpose | Key Capability | +|---------|---------|----------------| +| `yolo26{n,s,m,l,x}.pt` | Standard object detection | 80 COCO classes, NMS-free | +| `yolo26{s,m,l,x}-pose.pt` | Pose estimation | 17 keypoints per person (COCO format) | +| `yoloe-26{s,m,l,x}.pt` | Open-vocabulary detection | Zero-shot detection via text prompts | +| `yolo26{n,s,m,l,x}-seg.pt` | Instance segmentation | Pixel-level masks | + +#### Usage + +```python +from ultralytics import YOLO + +# Standard detection +model = YOLO("yolo26s.pt") +results = model("frame.jpg") + +# Pose estimation (for fighting/tawuran) +model = YOLO("yolo26s-pose.pt") +results = model("frame.jpg") + +# Open-vocabulary (for stealing — zero-shot) +model = YOLO("yoloe-26s.pt") +model.set_classes(["person concealing object", "hand reaching into bag"]) +results = model("frame.jpg") +``` + +### 2.2 RF-DETR (Roboflow — March 2025, ICLR 2026) + +RF-DETR is a real-time transformer-based object detector developed by Roboflow. 
It is the first real-time model to achieve 60+ mAP on COCO and is built on a DINOv2 Vision Transformer backbone. + +#### Key Architectural Features + +| Feature | Description | Impact | +|---------|-------------|--------| +| **DINOv2 ViT Backbone** | Self-supervised Vision Transformer with global self-attention | Superior feature extraction; better understanding of spatial relationships | +| **Deformable Attention Decoder** | Attends to relevant image regions adaptively | Better handling of occluded and small objects | +| **NMS-Free** | End-to-end detection without post-processing | Deterministic latency (same benefit as YOLOv26) | +| **Neural Architecture Search** | Automated model design optimization | Optimized accuracy–latency trade-offs at every model size | +| **Fine-tuning Focused** | Designed for transfer learning on custom datasets | Proven generalization on RF100-VL (100 diverse domains) | + +#### Model Variants + +| Size | Class | COCO AP₅₀:₉₅ | Latency (ms) | Params (M) | Resolution | License | +|------|-------|--------------|--------------|------------|------------|---------| +| Nano | `RFDETRNano` | 48.4 | 2.3 | 30.5 | 384×384 | Apache 2.0 | +| Small | `RFDETRSmall` | 53.0 | 3.5 | 32.1 | 512×512 | Apache 2.0 | +| Medium | `RFDETRMedium` | 54.7 | 4.4 | 33.7 | 576×576 | Apache 2.0 | +| Large | `RFDETRLarge` | 56.5 | 6.8 | 33.9 | 704×704 | Apache 2.0 | +| XLarge | `RFDETRXLarge` | 58.6 | 11.5 | 126.4 | 700×700 | PML 1.0 | +| 2XLarge | `RFDETR2XLarge` | 60.1 | 17.2 | 126.9 | 880×880 | PML 1.0 | + +#### Usage + +```python +from rfdetr import RFDETRMedium +from rfdetr.util.coco_classes import COCO_CLASSES + +model = RFDETRMedium() +detections = model.predict(image, threshold=0.5) +``` + +--- + +## 3. Performance Benchmarks + +All benchmarks measured on **NVIDIA T4 GPU, TensorRT FP16, batch size 1**. Latency is "Total Latency" including all post-processing. 
+ +### 3.1 Object Detection — COCO val2017 + +| Model | COCO AP₅₀ | COCO AP₅₀:₉₅ | Latency (ms) | Params (M) | Resolution | +|-------|-----------|--------------|--------------|------------|------------| +| RF-DETR-N | **67.6** | **48.4** | 2.3 | 30.5 | 384×384 | +| RF-DETR-S | **72.1** | **53.0** | 3.5 | 32.1 | 512×512 | +| RF-DETR-M | **73.6** | **54.7** | 4.4 | 33.7 | 576×576 | +| RF-DETR-L | **75.1** | **56.5** | 6.8 | 33.9 | 704×704 | +| RF-DETR-XL | **77.4** | **58.6** | 11.5 | 126.4 | 700×700 | +| RF-DETR-2XL | **78.5** | **60.1** | 17.2 | 126.9 | 880×880 | +| YOLO26-N | 55.8 | 40.3 | **1.7** | **2.6** | 640×640 | +| YOLO26-S | 64.3 | 47.7 | **2.6** | **9.4** | 640×640 | +| YOLO26-M | 69.7 | 52.5 | **4.4** | **20.1** | 640×640 | +| YOLO26-L | 71.1 | 54.1 | **5.7** | **25.3** | 640×640 | +| YOLO26-X | 74.0 | 56.9 | **9.6** | **56.9** | 640×640 | +| YOLO11-N | 52.0 | 37.4 | 2.5 | 2.6 | 640×640 | +| YOLO11-S | 59.7 | 44.4 | 3.2 | 9.4 | 640×640 | +| YOLO11-M | 64.1 | 48.6 | 5.1 | 20.1 | 640×640 | +| YOLO11-L | 64.9 | 49.9 | 6.5 | 25.3 | 640×640 | +| YOLO11-X | 66.1 | 50.9 | 10.5 | 56.9 | 640×640 | +| D-FINE-S | 67.6 | 50.6 | 3.5 | 10.2 | 640×640 | +| D-FINE-M | 72.6 | 55.0 | 5.4 | 19.2 | 640×640 | +| D-FINE-L | 74.9 | 57.2 | 7.5 | 31.0 | 640×640 | +| LW-DETR-S | 66.8 | 48.0 | 2.6 | 14.6 | 640×640 | +| LW-DETR-M | 72.0 | 52.6 | 4.4 | 28.2 | 640×640 | +| LW-DETR-L | 74.6 | 56.1 | 6.9 | 46.8 | 640×640 | + +### 3.2 Real-World Domain Adaptability — RF100-VL + +| Model | RF100-VL AP₅₀ | RF100-VL AP₅₀:₉₅ | +|-------|---------------|-------------------| +| RF-DETR-N | **85.0** | **57.7** | +| RF-DETR-S | **86.7** | **60.2** | +| RF-DETR-M | **87.4** | **61.2** | +| RF-DETR-L | **88.2** | **62.2** | +| YOLO26-S | 82.7 | 57.0 | +| YOLO26-M | 84.4 | 58.7 | +| YOLO26-L | 85.0 | 59.3 | +| YOLO26-X | 85.6 | 60.0 | +| YOLO11-N | 81.4 | 55.3 | +| YOLO11-S | 82.3 | 56.2 | +| YOLO11-M | 82.5 | 56.5 | + +### 3.3 Instance Segmentation — COCO val2017 + +| Model | COCO AP₅₀ | COCO 
AP₅₀:₉₅ | Latency (ms) | Params (M) | +|-------|-----------|--------------|--------------|------------| +| RF-DETR-Seg-N | **63.0** | **40.3** | 3.4 | 33.6 | +| RF-DETR-Seg-S | **66.2** | **43.1** | 4.4 | 33.7 | +| RF-DETR-Seg-M | **68.4** | **45.3** | 5.9 | 35.7 | +| YOLO26-N-Seg | 54.3 | 34.7 | **2.31** | **2.7** | +| YOLO26-S-Seg | 62.4 | 40.2 | **3.47** | **10.4** | +| YOLO26-M-Seg | 67.8 | 44.0 | **6.32** | **23.6** | +| YOLO11-N-Seg | 47.8 | 30.0 | 3.6 | 2.9 | +| YOLO11-S-Seg | 55.4 | 35.0 | 4.6 | 10.1 | +| YOLO11-M-Seg | 60.0 | 38.5 | 6.9 | 22.4 | + +### 3.4 Analysis Summary + +| Metric | Winner | Margin | +|--------|--------|--------| +| **COCO Accuracy (all sizes)** | RF-DETR | +1.7 to +8.1 mAP₅₀:₉₅ | +| **RF100-VL Domain Generalization** | RF-DETR | +1.2 to +3.2 mAP₅₀:₉₅ | +| **Inference Speed (most sizes)** | YOLOv26 | 16–26% faster (tied at the M size) | +| **Model Size / Memory** | YOLOv26 | 1.3x–12x fewer parameters | +| **Segmentation Accuracy** | RF-DETR | +1.3 to +5.6 mAP₅₀:₉₅ | +| **YOLOv26 vs YOLO11 (same arch)** | YOLOv26 | +2.9 to +6.0 mAP₅₀:₉₅, 26–43% faster | + +--- + +## 4. Behavior Detection Strategy + +### 4.1 Fighting (*Berkelahi*) — Pose-Based Detection ✅ Easiest + +**Primary Model:** YOLOv26-Pose + +Fighting detection relies on **skeletal keypoint analysis** across consecutive frames. Aggressive actions produce distinctive pose signatures.
+ +**Detection Signals:** +- Rapid limb acceleration (punching, kicking) +- Close proximity between two or more persons (< 1 meter) +- Asymmetric body postures (one person lunging, other retreating) +- Keypoint velocity exceeding threshold over 5–10 frame window + +**Keypoints Used (COCO 17-point format):** +- Wrists (IDs 9, 10) — punch/grab detection +- Ankles (IDs 15, 16) — kick detection +- Shoulders/Hips (IDs 5, 6, 11, 12) — body orientation and proximity + +**Why YOLOv26-Pose:** +- Built-in RLE-based keypoint estimation — no separate model needed +- NMS-free ensures consistent detection even when fighters overlap +- 2.6 ms latency allows real-time analysis at 30+ FPS +- Pre-trained AVA dataset actions (push, hit, kick, punch) available via SlowFast for confirmation + +### 4.2 Stealing (*Mencuri*) — Context-Aware Detection ⚠️ Hardest + +**Primary Model:** YOLOE-26 (zero-shot first pass) + RF-DETR (fine-tuned confirmation) + +Stealing is the most challenging behavior because it involves **subtle hand–object interactions** that are visually similar to normal activities. + +**Detection Strategy (Multi-Stage):** + +| Stage | Model | Role | +|-------|-------|------| +| 1. Object proximity | YOLOv26 | Detect persons near high-value objects (shelves, bags, displays) | +| 2. Zero-shot screening | YOLOE-26 | Text-prompted detection: `"person concealing object"`, `"hand reaching into bag"` | +| 3. Temporal analysis | SlowFast / X3D | Classify 2–4 second video clips for theft-specific motion patterns | +| 4. 
Fine-tuned confirmation | RF-DETR (custom) | Fine-tuned on labeled theft dataset for high-precision detection | + +**Why Multi-Stage:** +- No single model reliably detects stealing out-of-the-box +- YOLOE-26 provides zero-shot capability for rapid prototyping without labeled data +- RF-DETR excels at fine-tuning on small custom datasets (proven on RF100-VL) +- SlowFast adds temporal context that frame-level detectors cannot capture + +**Custom Dataset Requirements:** +- Minimum 500–1,000 labeled video clips showing theft behaviors +- Include diverse scenarios: shoplifting, pickpocketing, bag theft +- Negative samples: normal shopping, browsing, reaching for own items + +### 4.3 Mass Brawls (*Tawuran*) — Crowd Density Analysis ✅ Moderate + +**Primary Model:** YOLOv26 (detection + pose) + +Tawuran detection combines **crowd density analysis** with **collective motion patterns**. + +**Detection Signals:** +- Person count exceeding threshold in defined region (e.g., >15 persons in 50m²) +- Collective rapid movement in opposing directions (convergence pattern) +- Multiple aggressive pose signatures detected simultaneously +- Optional: weapon detection via YOLOE-26 text prompts (`"person with stick"`, `"person with weapon"`) + +**Why NMS-Free Architecture is Critical:** +Traditional NMS-based models (YOLO11 and earlier) suppress overlapping bounding boxes — in a crowd of 30+ people, this causes: +- **Missed detections** (valid persons suppressed as duplicates) +- **Variable latency** (NMS processing time scales with object count) + +Both YOLOv26 and RF-DETR are NMS-free, providing: +- **Constant inference time** regardless of crowd size +- **No suppression errors** — every person is detected independently + +### 4.4 Comparison: Which Model for Which Behavior? 
+ +| Behavior | Best Model | Why | Accuracy | Speed | +|----------|-----------|-----|----------|-------| +| **Fighting** | YOLOv26-Pose | Built-in keypoint estimation; real-time pose analysis | High | Very Fast | +| **Stealing (prototype)** | YOLOE-26 | Zero-shot via text prompts; no labeled data needed | Moderate | Fast | +| **Stealing (production)** | RF-DETR (fine-tuned) | Best fine-tuning performance; highest detection accuracy | Very High | Fast | +| **Tawuran (detection)** | YOLOv26 | Lightweight; handles dense crowds; NMS-free | High | Very Fast | +| **Tawuran (confirmation)** | RF-DETR | Superior accuracy for counting persons in dense scenes | Very High | Fast | + +--- + +## 5. System Architecture + +### 5.1 Hybrid Pipeline Design + +The system uses a **two-tier architecture** where a lightweight model runs continuously (Tier 1) and a heavier model is triggered on-demand (Tier 2). + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ CCTV Camera Feed │ +└──────────────────────────────┬──────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ TIER 1: Always-On (YOLOv26-Pose, ~2 ms/frame) │ +│ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────────┐ │ +│ │ Person │ │ Pose │ │ Behavior Trigger │ │ +│ │ Detection │──│ Estimation │──│ • Fighting → ALERT │ │ +│ │ (bbox) │ │ (17 keypts) │ │ • Crowd density → Tier 2 │ │ +│ └──────────────┘ └──────────────┘ │ • Suspicious → Tier 2 │ │ +│ └──────────────────────────┘ │ +└──────────────────────────────┬──────────────────────────────────────┘ + │ (triggered only when needed) + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ TIER 2: On-Demand (RF-DETR / SlowFast / YOLOE-26) │ +│ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────────┐ │ +│ │ RF-DETR-M │ │ SlowFast / │ │ YOLOE-26 │ │ +│ │ (fine-tuned) │ │ X3D-S │ │ (open-vocabulary) │ │ +│ │ Stealing │ │ Temporal │ │ 
Text-prompted │ │ +│ │ confirmation │ │ analysis │ │ zero-shot detection │ │ +│ └──────────────┘ └──────────────┘ └──────────────────────────┘ │ +└──────────────────────────────┬──────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ ALERT SYSTEM │ +│ • Classification: fighting / stealing / tawuran │ +│ • Confidence score │ +│ • Bounding box / region of interest │ +│ • Video clip extraction for review │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +### 5.2 Processing Pipeline — Step by Step + +| Step | Component | Model | Runs | Latency | +|------|-----------|-------|------|---------| +| 1 | Frame capture | — | Every frame | <1 ms | +| 2 | Person detection + pose | YOLOv26-Pose-S | Every 2nd–3rd frame | ~2.6 ms | +| 3 | Person tracking | ByteTrack | Every frame | <1 ms | +| 4 | Pose analysis (fighting) | Rule-based / ML classifier | Every frame with persons | <1 ms | +| 5 | Crowd density check (tawuran) | Person count per ROI | Every frame | <1 ms | +| 6 | Suspicious activity flag | Trigger logic | On threshold breach | <1 ms | +| 7a | Stealing confirmation | RF-DETR-M (fine-tuned) | On-demand | ~4.4 ms | +| 7b | Temporal behavior analysis | SlowFast / X3D-S | On-demand (2–4s clip) | ~50–100 ms | +| 7c | Zero-shot screening | YOLOE-26 | On-demand | ~2.6 ms | +| 8 | Alert dispatch | Alert system | On confirmed detection | <1 ms | + +### 5.3 Implementation Example + +```python +from ultralytics import YOLO +import numpy as np + +# ── Tier 1: Always-on detection ────────────────────────────────────── +pose_model = YOLO("yolo26s-pose.pt") +tracker = ByteTrack() # or DeepSORT + +def process_frame(frame): + # Step 1: Detect persons + keypoints + results = pose_model(frame, conf=0.5, classes=[0]) # class 0 = person + + # Step 2: Track persons across frames + detections = results[0].boxes + keypoints = results[0].keypoints + tracked = tracker.update(detections) + + # Step 
3: Analyze behaviors + alerts = [] + + # Fighting detection: check for aggressive pose patterns + for person_a, person_b in get_close_pairs(tracked, threshold_meters=1.5): + if detect_fighting_pose(person_a.keypoints, person_b.keypoints): + alerts.append({ + "type": "fighting", + "confidence": calculate_confidence(person_a, person_b), + "persons": [person_a.id, person_b.id] + }) + + # Tawuran detection: crowd density check + person_count = len(tracked) + if person_count > 15: # threshold for mass gathering + alerts.append({ + "type": "tawuran_warning", + "person_count": person_count, + "trigger": "crowd_density" + }) + + # Trigger Tier 2 for suspicious activities + for alert in alerts: + if alert["type"] in ["suspicious_proximity", "tawuran_warning"]: + trigger_tier2(frame, alert) + + return alerts + +# ── Tier 2: On-demand confirmation ────────────────────────────────── +from rfdetr import RFDETRMedium + +rfdetr_model = RFDETRMedium() # fine-tuned on stealing dataset +yoloe_model = YOLO("yoloe-26s.pt") + +def trigger_tier2(frame, alert): + if alert["type"] == "suspicious_proximity": + # Zero-shot stealing detection + yoloe_model.set_classes([ + "person concealing object", + "hand reaching into bag", + "person hiding item under clothing" + ]) + results = yoloe_model(frame) + + # High-accuracy confirmation + detections = rfdetr_model.predict(frame, threshold=0.6) + + elif alert["type"] == "tawuran_warning": + # Accurate person count with RF-DETR + detections = rfdetr_model.predict(frame, threshold=0.4) + accurate_count = len(detections) +``` + +--- + +## 6. 
Hardware & Deployment Recommendations + +### 6.1 VRAM Budget (RTX 3060 — 12 GB) + +| Component | VRAM Usage (FP16) | Notes | +|-----------|-------------------|-------| +| YOLOv26-S-Pose | ~0.8 GB | Always loaded | +| ByteTrack | ~0.1 GB | CPU-based, minimal GPU | +| RF-DETR-M (on-demand) | ~2.5 GB | Loaded/unloaded as needed | +| YOLOE-26-S (on-demand) | ~1.0 GB | Shares backbone with YOLOv26 | +| SlowFast / X3D-S (on-demand) | ~1.5 GB | Loaded only for temporal analysis | +| **Total (worst case)** | **~5.9 GB** | Well within 12 GB budget | + +### 6.2 Throughput Estimates (RTX 3060) + +| Configuration | FPS | Cameras | Use Case | +|--------------|-----|---------|----------| +| YOLOv26-S-Pose only | ~120 FPS | 4–6 streams at 20 FPS | Fighting + tawuran only | +| YOLOv26-S-Pose + ByteTrack | ~80 FPS | 2–4 streams at 20 FPS | Full tracking pipeline | +| Hybrid (Tier 1 + Tier 2 on-demand) | ~30–60 FPS | 1–2 streams at 15 FPS | All 3 behaviors | +| RF-DETR-M continuous | ~45 FPS | 1–2 streams at 15 FPS | Maximum accuracy mode | + +### 6.3 Optimization Techniques + +| Technique | Impact | How | +|-----------|--------|-----| +| **TensorRT export** | 2–3x speedup | `model.export(format="engine", half=True)` | +| **FP16 inference** | ~40% less VRAM | Minimal accuracy loss (<0.1 mAP) | +| **Frame skipping** | 2–3x throughput | Process every 2nd or 3rd frame; sufficient for surveillance | +| **ROI cropping** | Reduced compute | Only process regions of interest, not full frame | +| **Dynamic model loading** | Lower peak VRAM | Load RF-DETR/SlowFast only when triggered | +| **INT8 quantization** | Further speedup | YOLOv26 DFL-free design makes INT8 cleaner | + +### 6.4 Deployment Recommendations by Budget + +| Budget | Hardware | Models | Cameras | Behaviors | +|--------|----------|--------|---------|-----------| +| **Low (~$300)** | 1× RTX 3060 | YOLOv26-Pose only | 1–2 | Fighting, tawuran | +| **Medium (~$500)** | 1× RTX 3080 | YOLOv26-Pose + RF-DETR-M | 1–2 | All 3 
behaviors | +| **High (~$1,600)** | 1× RTX 4090 | Full hybrid pipeline | 3–4 | All 3, high accuracy | +| **Production** | 2× RTX 3060 or Cloud T4 | Dedicated per-tier GPUs | 4+ | All 3, redundancy | + +### 6.5 Edge Deployment (Jetson / CPU) + +| Aspect | YOLOv26 | RF-DETR | +|--------|---------|---------| +| **Jetson Orin Nano** | ✅ Excellent (2.6M params, DFL-free) | ⚠️ Heavy (30M+ params, ViT) | +| **CPU-only** | ✅ 43% faster than YOLO11 on CPU | ❌ Not recommended (ViT too slow) | +| **INT8 quantization** | ✅ Clean (no DFL Softmax) | ⚠️ ViT quantization more complex | +| **ONNX/TensorRT export** | ✅ Full support | ✅ Full support | +| **CoreML (Apple)** | ✅ Supported | ⚠️ In development | + +**Edge recommendation:** Use YOLOv26-Pose exclusively on edge devices. Reserve RF-DETR for server/cloud deployment where GPU resources are available. + +--- + +## 7. Conclusions & Recommendations + +### 7.1 Model Selection Summary + +| | YOLOv26 | RF-DETR | +|--|---------|---------| +| **Architecture** | CNN (lightweight, efficient) | Transformer (accurate, heavy) | +| **Strength** | Speed, model size, pose estimation, open-vocab | Accuracy, fine-tuning, domain generalization | +| **Weakness** | Lower accuracy than RF-DETR | No pose estimation, heavier model | +| **License** | AGPL-3.0 (restrictive for commercial) | Apache 2.0 (permissive) | +| **Best role** | Always-on primary detector (Tier 1) | On-demand high-accuracy confirmer (Tier 2) | + +### 7.2 Final Recommendation + +**Use a hybrid approach:** + +1. **YOLOv26-Pose** as the always-on Tier 1 detector for fighting and tawuran — it provides built-in pose estimation, NMS-free deterministic latency, and the lightest resource footprint. + +2. **YOLOE-26** for zero-shot stealing detection during prototyping — enables rapid iteration without labeled data. + +3. 
**RF-DETR** (fine-tuned) as the Tier 2 high-accuracy confirmer for stealing and ambiguous cases — its superior accuracy (+2–8 mAP) and proven fine-tuning capability on custom datasets make it ideal for production-quality behavior classification. + +4. **SlowFast / X3D** for temporal behavior analysis when frame-level detection is insufficient — particularly for stealing where the action unfolds over multiple seconds. + +This hybrid approach maximizes both **speed** (YOLOv26 strengths) and **accuracy** (RF-DETR strengths) while staying within a single RTX 3060's 12 GB VRAM budget. + +### 7.3 Recommended Development Roadmap + +| Phase | Duration | Goal | Models | +|-------|----------|------|--------| +| **Phase 1: Prototype** | 2–4 weeks | Detect fighting with pre-trained models | YOLOv26-Pose | +| **Phase 2: Expand** | 2–4 weeks | Add tawuran detection; test YOLOE-26 for stealing | YOLOv26-Pose + YOLOE-26 | +| **Phase 3: Custom Data** | 4–8 weeks | Collect and label stealing dataset (500+ clips) | Data collection | +| **Phase 4: Fine-tune** | 2–4 weeks | Fine-tune RF-DETR on custom stealing dataset | RF-DETR-M | +| **Phase 5: Integration** | 2–4 weeks | Build full hybrid pipeline with alert system | All models | +| **Phase 6: Production** | 2–4 weeks | TensorRT optimization, monitoring, deployment | Optimized pipeline | + +--- + +## References + +1. **YOLOv26** — Ultralytics (September 2025). NMS-free, DFL-free real-time object detection. https://docs.ultralytics.com/models/yolo26/ +2. **RF-DETR** — Roboflow (March 2025, ICLR 2026). Real-time detection transformer with DINOv2 backbone. https://github.com/roboflow/rf-detr +3. **SlowFast Networks** — Feichtenhofer et al. (ICCV 2019). Dual-pathway temporal action recognition. https://github.com/facebookresearch/SlowFast +4. **X3D** — Feichtenhofer (CVPR 2020). Efficient video recognition networks. Part of SlowFast repository. +5. **ByteTrack** — Zhang et al. (ECCV 2022). 
Multi-object tracking by associating every detection box. https://github.com/ifzhang/ByteTrack +6. **DINOv2** — Oquab et al. (2023). Self-supervised Vision Transformer features. https://github.com/facebookresearch/dinov2 +7. **COCO Dataset** — Lin et al. (2014). Microsoft Common Objects in Context. https://cocodataset.org +8. **RF100-VL** — Roboflow. 100 diverse real-world detection datasets. https://github.com/roboflow/rf100-vl +9. **AVA Dataset** — Gu et al. (CVPR 2018). Atomic Visual Actions for action detection. + +--- +--- + +# Part II — Pipeline Architecture Analysis & Integration Guide + +**Date:** March 2026 +**Hardware:** 2× NVIDIA V100 32 GB +**Target:** Multi-camera CCTV surveillance with behavior detection + +--- + +## 8. Current Pipeline Review + +### 8.1 Current Architecture + +``` +Thread per CCTV stream (capture frame at 1 FPS) + → Save frame in shared memory buffer + → Pool/batch frames for YOLO detection pipeline + → Get bounding boxes (object detection) + → Crop bounding box regions and save as images + → Pool/batch cropped images for async save to MongoDB +``` + +### 8.2 Identified Bottlenecks & Issues + +| # | Issue | Severity | Impact | +|---|-------|----------|--------| +| 1 | **Thread-per-stream scaling** | 🔴 Critical | Python's GIL prevents true parallelism. At 200+ streams, thread overhead dominates. Context switching between hundreds of threads wastes CPU cycles. | +| 2 | **Synchronous RTSP decode** | 🔴 Critical | FFmpeg/OpenCV decode is CPU-bound. Each 1080p decode uses ~0.3–0.5 CPU core. 500 streams = 150–250 cores just for decode. | +| 3 | **Shared memory buffer — no backpressure** | 🟡 High | If YOLO batching falls behind capture rate, frames accumulate unbounded in memory. No drop policy = OOM risk. | +| 4 | **Single YOLO model bottleneck** | 🟡 High | All streams funnel into one YOLO instance. If batch queue stalls, all streams back up. No priority or fairness. 
| +| 5 | **Crop → save as image → MongoDB** | 🟡 High | JPEG encoding is CPU-bound (~2–5 ms per crop). Saving to disk then re-reading for MongoDB is redundant I/O. | +| 6 | **No tracking across frames** | 🟠 Medium | Without person tracking (ByteTrack), same person is re-detected every frame. No temporal identity = no behavior analysis possible. | +| 7 | **No behavior analysis layer** | 🟠 Medium | Pipeline ends at "save crops." No pose analysis, no temporal analysis, no alert system. | +| 8 | **No GPU decode (NVDEC)** | 🟠 Medium | CPU decode wastes cores that could process more streams. V100 NVDEC engines sit idle. | + +### 8.3 Recommended Revised Architecture + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ CAPTURE LAYER (async) │ +│ │ +│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ +│ │ Camera 1 │ │ Camera 2 │ │ Camera 3 │ ... │ Camera N │ │ +│ │ RTSP pull │ │ RTSP pull │ │ RTSP pull │ │ RTSP pull │ │ +│ └─────┬─────┘ └─────┬─────┘ └─────┬─────┘ └─────┬─────┘ │ +│ │ │ │ │ │ +│ └──────────────┴──────┬───────┴────────────────────┘ │ +│ │ │ +│ ┌─────────▼──────────┐ │ +│ │ Ring Buffer Pool │ ← Fixed-size, per-camera │ +│ │ (drop-oldest) │ ← Backpressure: drop frames │ +│ └─────────┬──────────┘ │ +└──────────────────────────────┼──────────────────────────────────────────────┘ + │ +┌──────────────────────────────▼──────────────────────────────────────────────┐ +│ TIER 1: ALWAYS-ON (GPU 0) │ +│ │ +│ ┌──────────────────────────────────────────────────────┐ │ +│ │ Batch Assembler │ │ +│ │ • Collect frames from ring buffers │ │ +│ │ • Dynamic batch size (8–32 based on queue depth) │ │ +│ │ • Priority: cameras with recent alerts first │ │ +│ └──────────────────────┬───────────────────────────────┘ │ +│ │ │ +│ ┌──────────────────────▼───────────────────────────────┐ │ +│ │ YOLOv26-S-Pose (TensorRT FP16) │ │ +│ │ → Person bboxes + 17 keypoints per person │ │ +│ │ → ~1.5 ms/image on V100 │ │ +│ 
└──────────────────────┬───────────────────────────────┘ │ +│ │ │ +│ ┌──────────────────────▼───────────────────────────────┐ │ +│ │ ByteTrack (CPU) │ │ +│ │ → Track person IDs across frames │ │ +│ │ → Maintain per-person keypoint history (last 10 fr) │ │ +│ └──────────────────────┬───────────────────────────────┘ │ +│ │ │ +│ ┌──────────────────────▼───────────────────────────────┐ │ +│ │ Behavior Analyzer (CPU) │ │ +│ │ ├─ Fighting: keypoint velocity + proximity │ │ +│ │ ├─ Tawuran: crowd density + collective motion │ │ +│ │ └─ Suspicious: person-object proximity zones │ │ +│ └──────────┬────────────────────────┬──────────────────┘ │ +│ │ direct alert │ trigger Tier 2 │ +│ ▼ ▼ │ +│ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │ Alert Queue │ │ Tier 2 Request Queue │ │ +│ └──────┬──────┘ └────────────┬────────────┘ │ +└─────────────┼─────────────────────────┼─────────────────────────────────────┘ + │ │ + │ ┌────────────────────▼──────────────────────────────────┐ + │ │ TIER 2: ON-DEMAND (GPU 1) │ + │ │ │ + │ │ ┌─────────────┐ ┌─────────────┐ ┌───────────────┐ │ + │ │ │ RF-DETR-M │ │ SlowFast │ │ YOLOE-26 │ │ + │ │ │ (fine-tuned)│ │ X3D-S │ │ (open-vocab) │ │ + │ │ │ Stealing │ │ Temporal │ │ Zero-shot │ │ + │ │ │ ~2.7ms V100 │ │ ~50–80ms │ │ ~1.5ms V100 │ │ + │ │ └──────┬──────┘ └──────┬──────┘ └───────┬───────┘ │ + │ │ └───────────┬────┴───────────────┘ │ + │ │ ▼ │ + │ │ ┌──────────────────┐ │ + │ │ │ Confirmation │ │ + │ │ │ Aggregator │ │ + │ │ └────────┬─────────┘ │ + │ └───────────────────┼───────────────────────────────────┘ + │ │ + ▼ ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ OUTPUT LAYER │ +│ │ +│ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ +│ │ MongoDB │ │ Alert │ │ Video Clip │ │ +│ │ (crops+meta) │ │ Dispatcher │ │ Extractor │ │ +│ │ Async bulk │ │ WebSocket/API │ │ (evidence) │ │ +│ └───────────────┘ └───────────────┘ └───────────────┘ │ 
import asyncio
import multiprocessing as mp
import queue
import time
from collections import deque
from concurrent.futures import ProcessPoolExecutor

import numpy as np


# ── Ring buffer per camera (fixed-size, drop-oldest) ──────────────────
class CameraRingBuffer:
    """Fixed-size frame buffer for one camera; the oldest frame is dropped when full.

    The buffer is not synchronised: fill and read it from a single thread.
    """

    def __init__(self, camera_id: str, max_size: int = 3):
        self.camera_id = camera_id
        self.buffer = deque(maxlen=max_size)  # deque evicts the oldest entry itself
        self.frame_count = 0  # frames ever offered to this buffer
        self.dropped = 0      # frames evicted because the buffer was full

    def put(self, frame: np.ndarray, timestamp: float) -> None:
        """Append ``(frame, timestamp, sequence_number)``, counting an eviction if full."""
        if len(self.buffer) == self.buffer.maxlen:
            self.dropped += 1
        self.buffer.append((frame, timestamp, self.frame_count))
        self.frame_count += 1

    def get_latest(self):
        """Return the newest ``(frame, timestamp, sequence_number)`` or ``None`` if empty."""
        return self.buffer[-1] if self.buffer else None


# ── RTSP decode in separate process (bypass GIL) ─────────────────────
def decode_worker(rtsp_url: str, shared_queue, camera_id: str, interval: float = 1.0):
    """Decode one RTSP stream in a subprocess and publish roughly one frame per ``interval``.

    Args:
        rtsp_url: Stream URL handed to ``cv2.VideoCapture``.
        shared_queue: Queue (or manager proxy) receiving ``(camera_id, frame, wall_time)``.
        camera_id: Identifier attached to every published frame.
        interval: Seconds to sleep between reads (1.0 ≈ 1 FPS).
    """
    import cv2  # imported here so only the worker processes load OpenCV

    cap = cv2.VideoCapture(rtsp_url)
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if ret:
                try:
                    shared_queue.put_nowait((camera_id, frame, time.time()))
                except queue.Full:
                    pass  # backpressure: drop this frame rather than block the decoder
            # NOTE(review): sleeping between reads may let the RTSP client buffer
            # stale frames; confirm latency on real cameras.
            time.sleep(interval)
    finally:
        cap.release()  # always free the decoder / socket, even on error


# ── Main async coordinator ───────────────────────────────────────────
class PipelineCoordinator:
    """Starts the decode workers and feeds their frames into per-camera ring buffers."""

    def __init__(self, camera_urls: dict[str, str]):
        # Keep the mapping: run() needs it to start the workers.
        self.camera_urls = dict(camera_urls)
        self.buffers: dict[str, CameraRingBuffer] = {
            cam_id: CameraRingBuffer(cam_id, max_size=3) for cam_id in self.camera_urls
        }
        # A raw mp.Queue cannot be pickled into ProcessPoolExecutor.submit();
        # a manager queue proxy can, and still raises queue.Full / queue.Empty.
        self._manager = mp.Manager()
        self.frame_queue = self._manager.Queue(maxsize=1000)
        # Each worker loops forever, so the pool needs one slot per camera;
        # a smaller pool would leave the surplus cameras queued and never started.
        self.decode_pool = ProcessPoolExecutor(max_workers=max(1, len(self.camera_urls)))

    def _drain_queue(self, max_items: int = 1000) -> int:
        """Move up to ``max_items`` decoded frames into their ring buffers.

        Returns the number of frames stored. Frames from unknown cameras are discarded.
        """
        moved = 0
        for _ in range(max_items):  # bounded so a busy queue cannot starve the loop
            try:
                cam_id, frame, ts = self.frame_queue.get_nowait()
            except queue.Empty:
                break
            buf = self.buffers.get(cam_id)
            if buf is not None:
                buf.put(frame, ts)
                moved += 1
        return moved

    async def run(self):
        """Start one decode worker per camera, then loop: drain → batch → infer."""
        for cam_id, url in self.camera_urls.items():
            self.decode_pool.submit(decode_worker, url, self.frame_queue, cam_id)

        # Main loop: drain queue → fill ring buffers → assemble batches
        while True:
            self._drain_queue()
            batch = self._assemble_batch(max_batch_size=32)
            if batch:
                results = await self._run_tier1(batch)
                await self._process_results(results)
            await asyncio.sleep(0.001)  # yield control
import numpy as np

_MONGO_CLIENT = None  # one shared client: Motor pools connections internally


def _get_surveillance_db(uri: str = "mongodb://localhost:27017"):
    """Return the ``surveillance`` database, creating the shared client on first use."""
    global _MONGO_CLIENT
    if _MONGO_CLIENT is None:
        import motor.motor_asyncio  # async MongoDB driver

        _MONGO_CLIENT = motor.motor_asyncio.AsyncIOMotorClient(uri)
    return _MONGO_CLIENT.surveillance


async def save_crops_batch(crops: list[tuple[str, np.ndarray, dict]], db=None) -> int:
    """Bulk-insert JPEG-encoded crops into ``db.detections`` with no disk round-trip.

    Images are stored inline as binary fields of ordinary documents (this is not
    GridFS).

    Args:
        crops: ``(camera_id, crop_image, metadata)`` tuples; ``metadata`` must hold
            ``"timestamp"`` and ``"bbox"`` and may hold ``"track_id"`` / ``"flags"``.
        db: Database handle exposing ``detections.insert_many``; defaults to the
            shared local ``surveillance`` database.

    Returns:
        Number of documents written (crops that fail to encode are skipped).
    """
    if not crops:
        return 0

    import cv2

    if db is None:
        db = _get_surveillance_db()

    documents = []
    for camera_id, crop_img, metadata in crops:
        ok, jpeg_bytes = cv2.imencode(".jpg", crop_img, [cv2.IMWRITE_JPEG_QUALITY, 85])
        if not ok:
            continue  # skip an unencodable crop instead of storing garbage bytes
        documents.append({
            "camera_id": camera_id,
            "image": jpeg_bytes.tobytes(),  # binary, no disk round-trip
            "timestamp": metadata["timestamp"],
            "bbox": metadata["bbox"],
            "person_id": metadata.get("track_id"),
            "behavior_flags": metadata.get("flags", []),
        })

    if documents:
        # ordered=False lets the server continue past a single bad document.
        await db.detections.insert_many(documents, ordered=False)
    return len(documents)
+ +### 9.2 Where Pose Analysis Fits in the Data Flow + +``` +Frame from ring buffer + │ + ▼ +YOLOv26-S-Pose inference (GPU, ~1.5 ms on V100) + │ + ├── outputs: bboxes + confidence + class + 17 keypoints per person + │ + ▼ +ByteTrack (CPU, <1 ms) + │ + ├── outputs: tracked person IDs + keypoint history per track + │ + ▼ +┌─────────────────────────────────────────────────────────┐ +│ Behavior Analyzer (CPU, <1 ms total) │ +│ │ +│ ┌─────────────────────────────────────────────────┐ │ +│ │ A. Fighting Detector │ │ +│ │ • Input: keypoints of all close pairs │ │ +│ │ • Method: velocity + acceleration of wrists/ │ │ +│ │ ankles over 5-frame sliding window │ │ +│ │ • Threshold: >X px/frame limb movement │ │ +│ │ • Output: ALERT if confirmed │ │ +│ └─────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────┐ │ +│ │ B. Tawuran Detector │ │ +│ │ • Input: all person bboxes + keypoints │ │ +│ │ • Method 1: person_count > 15 in ROI │ │ +│ │ • Method 2: centroid convergence velocity │ │ +│ │ • Method 3: % of persons in fighting pose │ │ +│ │ • Output: ALERT if 2+ methods trigger │ │ +│ └─────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────┐ │ +│ │ C. 
Suspicious Activity Flagger │ │ +│ │ • Input: person tracks near defined zones │ │ +│ │ • Method: dwell time + hand movement pattern │ │ +│ │ • Output: TRIGGER TIER 2 for confirmation │ │ +│ └─────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────┘ +``` + +### 9.3 Fighting Detection — Keypoint Velocity Algorithm + +```python +import numpy as np +from collections import defaultdict + +class FightingDetector: + """Detects fighting based on keypoint velocity analysis.""" + + WRIST_IDS = [9, 10] # left/right wrist + ANKLE_IDS = [15, 16] # left/right ankle + SHOULDER_IDS = [5, 6] # proximity reference + HIP_IDS = [11, 12] # proximity reference + + def __init__( + self, + velocity_threshold: float = 40.0, # pixels/frame + proximity_threshold: float = 150.0, # pixels (approx 1.5m at typical CCTV) + window_size: int = 5, # frames to analyze + min_aggressive_frames: int = 3, # frames exceeding threshold + ): + self.velocity_threshold = velocity_threshold + self.proximity_threshold = proximity_threshold + self.window_size = window_size + self.min_aggressive_frames = min_aggressive_frames + self.keypoint_history: dict[int, list[np.ndarray]] = defaultdict(list) + + def update(self, track_id: int, keypoints: np.ndarray): + """Store keypoints for a tracked person. 
keypoints shape: (17, 2).""" + history = self.keypoint_history[track_id] + history.append(keypoints.copy()) + if len(history) > self.window_size: + history.pop(0) + + def _limb_velocity(self, track_id: int) -> float: + """Compute max limb velocity over recent frames.""" + history = self.keypoint_history.get(track_id, []) + if len(history) < 2: + return 0.0 + + max_vel = 0.0 + limb_ids = self.WRIST_IDS + self.ANKLE_IDS + for i in range(1, len(history)): + for lid in limb_ids: + delta = np.linalg.norm(history[i][lid] - history[i-1][lid]) + max_vel = max(max_vel, delta) + return max_vel + + def _torso_distance(self, kp_a: np.ndarray, kp_b: np.ndarray) -> float: + """Distance between torso centers of two persons.""" + center_a = np.mean(kp_a[self.SHOULDER_IDS + self.HIP_IDS], axis=0) + center_b = np.mean(kp_b[self.SHOULDER_IDS + self.HIP_IDS], axis=0) + return np.linalg.norm(center_a - center_b) + + def check_pair(self, id_a: int, id_b: int) -> dict | None: + """Check if two tracked persons are fighting.""" + hist_a = self.keypoint_history.get(id_a, []) + hist_b = self.keypoint_history.get(id_b, []) + if not hist_a or not hist_b: + return None + + # Check proximity (must be close) + dist = self._torso_distance(hist_a[-1], hist_b[-1]) + if dist > self.proximity_threshold: + return None + + # Check velocity (both must have aggressive movement) + vel_a = self._limb_velocity(id_a) + vel_b = self._limb_velocity(id_b) + + if vel_a > self.velocity_threshold and vel_b > self.velocity_threshold: + confidence = min(1.0, (vel_a + vel_b) / (4 * self.velocity_threshold)) + return { + "type": "fighting", + "person_ids": [id_a, id_b], + "distance_px": dist, + "velocity_a": vel_a, + "velocity_b": vel_b, + "confidence": round(confidence, 3), + } + return None +``` + +### 9.4 Tawuran Detection — Crowd Density + Collective Motion + +```python +class TawuranDetector: + """Detects mass brawls via crowd density and collective aggression.""" + + def __init__( + self, + crowd_threshold: int = 
15, # persons in ROI + convergence_threshold: float = 0.6, # 60% moving toward center + fighting_ratio_threshold: float = 0.3, # 30% in aggressive poses + ): + self.crowd_threshold = crowd_threshold + self.convergence_threshold = convergence_threshold + self.fighting_ratio_threshold = fighting_ratio_threshold + self.prev_centroids: dict[int, np.ndarray] = {} + + def analyze( + self, + tracked_persons: list[dict], # [{id, bbox, keypoints}, ...] + roi: tuple[int, int, int, int] | None = None, # (x1, y1, x2, y2) + fighting_detector: FightingDetector = None, + ) -> dict | None: + + # Filter to ROI + if roi: + in_roi = [p for p in tracked_persons if self._in_roi(p["bbox"], roi)] + else: + in_roi = tracked_persons + + person_count = len(in_roi) + if person_count < self.crowd_threshold: + return None + + # Check 1: Crowd density exceeded + signals = ["crowd_density"] + + # Check 2: Convergence — are people moving toward each other? + centroids = {p["id"]: self._centroid(p["bbox"]) for p in in_roi} + group_center = np.mean(list(centroids.values()), axis=0) + + converging = 0 + for pid, pos in centroids.items(): + prev = self.prev_centroids.get(pid) + if prev is not None: + prev_dist = np.linalg.norm(prev - group_center) + curr_dist = np.linalg.norm(pos - group_center) + if curr_dist < prev_dist: + converging += 1 + + self.prev_centroids = centroids + + if person_count > 0 and converging / person_count > self.convergence_threshold: + signals.append("convergence") + + # Check 3: Multiple fighting poses + if fighting_detector: + aggressive = sum( + 1 for p in in_roi + if fighting_detector._limb_velocity(p["id"]) > fighting_detector.velocity_threshold + ) + if person_count > 0 and aggressive / person_count > self.fighting_ratio_threshold: + signals.append("collective_aggression") + + if len(signals) >= 2: # at least 2 signals to confirm + return { + "type": "tawuran", + "person_count": person_count, + "signals": signals, + "confidence": min(1.0, len(signals) / 3), + } + return 
None + + def _in_roi(self, bbox, roi) -> bool: + cx = (bbox[0] + bbox[2]) / 2 + cy = (bbox[1] + bbox[3]) / 2 + return roi[0] <= cx <= roi[2] and roi[1] <= cy <= roi[3] + + def _centroid(self, bbox) -> np.ndarray: + return np.array([(bbox[0]+bbox[2])/2, (bbox[1]+bbox[3])/2]) +``` + +--- + +## 10. SlowFast / X3D Integration (Temporal Behavior Analysis) + +### 10.1 The Challenge: 1 FPS → Video Clips + +SlowFast and X3D expect **multi-frame video clips** (typically 8–32 frames at ≥8 FPS). Your pipeline captures at **1 FPS**. This mismatch requires a buffering strategy. + +| Approach | Clip Requirement | At 1 FPS | Clip Duration | Feasibility | +|----------|-----------------|----------|---------------|-------------| +| **SlowFast 8×8** | 8 frames, stride 8 | 8 frames = 8 seconds buffer | 8 seconds | ✅ Good — matches stealing/fighting duration | +| **SlowFast 4×16** | 4 frames, stride 16 | 4 frames = 4 seconds buffer | 4 seconds | ✅ Good — quick events | +| **X3D-S** | 13 frames, stride 6 | 13 frames = 13 seconds buffer | 13 seconds | ✅ Covers full tawuran build-up | +| **SlowFast 16×8** | 16 frames, stride 8 | 16 frames = 16 seconds | 16 seconds | ⚠️ Long buffer but better accuracy | + +> **Key insight:** At 1 FPS, the temporal resolution is low, but the clip duration is inherently long. SlowFast can still extract useful **coarse temporal patterns** (person appearing → approaching → grabbing → leaving) even at 1 FPS. For fine-grained motion (punch trajectories), the keypoint velocity from YOLOv26-Pose (Tier 1) is already covering this at per-frame level. + +### 10.2 When to Trigger SlowFast (On-Demand Only) + +SlowFast should **never** run continuously. 
It runs only when Tier 1 flags suspicious activity: + +| Trigger Source | Trigger Condition | SlowFast Task | Priority | +|---------------|-------------------|---------------|----------| +| Fighting detector | Fighting confidence > 0.5 but < 0.8 | Confirm/deny fight action | High | +| Suspicious activity | Person lingering near valuables > 10s | Classify stealing behavior | High | +| Tawuran detector | Crowd threshold met | Classify crowd action (running, fighting) | Medium | +| Periodic audit | Random 1% of cameras, every 60s | Background anomaly detection | Low | + +### 10.3 Per-Person Clip Buffer + +```python +from collections import deque +from dataclasses import dataclass, field +import numpy as np +import time + +@dataclass +class PersonClipBuffer: + """Maintains a rolling frame buffer per tracked person for SlowFast.""" + track_id: int + max_frames: int = 16 # 16 seconds at 1 FPS + frames: deque = field(default_factory=lambda: deque(maxlen=16)) + crops: deque = field(default_factory=lambda: deque(maxlen=16)) + timestamps: deque = field(default_factory=lambda: deque(maxlen=16)) + + # Memory estimate: 16 frames × 224×224×3 (crop) ≈ 2.4 MB per person + # 100 tracked persons ≈ 240 MB — manageable + + def add_frame(self, full_frame: np.ndarray, bbox: tuple, timestamp: float): + """Add a frame crop for this tracked person.""" + x1, y1, x2, y2 = [int(c) for c in bbox] + # Pad bbox by 20% for context + h, w = full_frame.shape[:2] + pad_x = int((x2 - x1) * 0.2) + pad_y = int((y2 - y1) * 0.2) + x1, y1 = max(0, x1 - pad_x), max(0, y1 - pad_y) + x2, y2 = min(w, x2 + pad_x), min(h, y2 + pad_y) + + crop = full_frame[y1:y2, x1:x2] + # Resize to SlowFast input size + crop_resized = cv2.resize(crop, (224, 224)) + + self.frames.append(full_frame) # keep full frame too (for RF-DETR) + self.crops.append(crop_resized) + self.timestamps.append(timestamp) + + def get_clip(self, num_frames: int = 8) -> np.ndarray | None: + """Get a clip tensor for SlowFast. 
Returns (T, H, W, C) array.""" + if len(self.crops) < num_frames: + return None + recent = list(self.crops)[-num_frames:] + return np.stack(recent, axis=0) # (T, 224, 224, 3) + + @property + def duration_seconds(self) -> float: + if len(self.timestamps) < 2: + return 0.0 + return self.timestamps[-1] - self.timestamps[0] + + @property + def memory_mb(self) -> float: + return len(self.crops) * 224 * 224 * 3 / (1024 * 1024) + + +class ClipBufferManager: + """Manages clip buffers for all tracked persons. Evicts stale tracks.""" + + def __init__(self, max_persons: int = 200, stale_timeout: float = 30.0): + self.buffers: dict[int, PersonClipBuffer] = {} + self.max_persons = max_persons + self.stale_timeout = stale_timeout + + def update(self, track_id: int, frame: np.ndarray, bbox: tuple, ts: float): + if track_id not in self.buffers: + if len(self.buffers) >= self.max_persons: + self._evict_oldest() + self.buffers[track_id] = PersonClipBuffer(track_id=track_id) + self.buffers[track_id].add_frame(frame, bbox, ts) + + def get_clip(self, track_id: int, num_frames: int = 8) -> np.ndarray | None: + buf = self.buffers.get(track_id) + return buf.get_clip(num_frames) if buf else None + + def _evict_oldest(self): + """Remove the track with the oldest last-seen timestamp.""" + if not self.buffers: + return + oldest_id = min(self.buffers, key=lambda k: self.buffers[k].timestamps[-1]) + del self.buffers[oldest_id] + + def cleanup_stale(self, current_time: float): + """Remove tracks not seen for stale_timeout seconds.""" + stale = [ + tid for tid, buf in self.buffers.items() + if current_time - buf.timestamps[-1] > self.stale_timeout + ] + for tid in stale: + del self.buffers[tid] + + @property + def total_memory_mb(self) -> float: + return sum(buf.memory_mb for buf in self.buffers.values()) +``` + +### 10.4 SlowFast Inference Integration + +```python +import torch +from pytorchvideo.models.hub import slowfast_r50_detection + +class SlowFastAnalyzer: + """Tier 2 temporal behavior 
analysis using SlowFast.""" + + # AVA action labels relevant to our use case + FIGHTING_ACTIONS = {"hit", "kick", "punch", "push", "grab", "fight"} + STEALING_ACTIONS = {"take", "pick_up", "carry", "put_down", "grab"} + + def __init__(self, device: str = "cuda:1"): + self.device = device + self.model = slowfast_r50_detection() + self.model.to(device).eval() + + @torch.no_grad() + def analyze_clip( + self, + clip: np.ndarray, # (T, 224, 224, 3) + behavior_type: str, # "fighting" or "stealing" + ) -> dict: + """Run SlowFast on a clip buffer.""" + # Preprocess: (T,H,W,C) → (C,T,H,W) normalized + clip_tensor = torch.from_numpy(clip).float().permute(3, 0, 1, 2) / 255.0 + clip_tensor = clip_tensor.unsqueeze(0).to(self.device) # (1,C,T,H,W) + + # SlowFast expects [slow_pathway, fast_pathway] + # Slow: every 8th frame, Fast: all frames + slow = clip_tensor[:, :, ::8, :, :] # temporal stride 8 + fast = clip_tensor # full temporal resolution + + preds = self.model([slow, fast]) + + # Map predictions to relevant actions + target_actions = ( + self.FIGHTING_ACTIONS if behavior_type == "fighting" + else self.STEALING_ACTIONS + ) + + action_scores = {} # action_name → confidence + for action, idx in ACTION_LABEL_MAP.items(): + if action in target_actions: + action_scores[action] = float(preds[0, idx].cpu()) + + top_action = max(action_scores, key=action_scores.get) + return { + "behavior": behavior_type, + "top_action": top_action, + "confidence": action_scores[top_action], + "all_scores": action_scores, + } +``` + +### 10.5 Where SlowFast Fits in the Hybrid Architecture + +``` +Tier 1 (GPU 0) flags suspicious activity + │ + │ trigger_type: "fighting_uncertain" or "stealing_suspicious" + │ includes: track_id, camera_id, confidence + ▼ +┌──────────────────────────────────────────────┐ +│ ClipBufferManager.get_clip(track_id, 8) │ +│ → Returns (8, 224, 224, 3) ndarray │ +│ → If clip not ready (< 8 frames), WAIT │ +│ and re-trigger when buffer fills │ 
+└──────────────────────┬───────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────┐ +│ SlowFastAnalyzer.analyze_clip(clip, type) │ +│ → Runs on GPU 1 (shared with RF-DETR) │ +│ → ~50–80 ms per clip │ +│ → Returns action classification + conf │ +└──────────────────────┬───────────────────────┘ + │ + ┌──────────┴──────────┐ + │ │ + confidence > 0.7 confidence ≤ 0.7 + │ │ + CONFIRM ALERT DISMISS / LOG +``` + +--- + +## 11. RF-DETR Integration (Tier 2 High-Accuracy Confirmation) + +### 11.1 GPU Assignment Strategy + +**Recommended: Dedicated GPU per tier.** + +| Component | GPU 0 (V100 #1) | GPU 1 (V100 #2) | +|-----------|-----------------|-----------------| +| YOLOv26-S-Pose | ✅ Always loaded | ❌ | +| ByteTrack | ✅ CPU-side | ✅ CPU-side | +| RF-DETR-M | ❌ | ✅ On-demand | +| SlowFast / X3D-S | ❌ | ✅ On-demand | +| YOLOE-26 | ❌ | ✅ On-demand | + +**Why separate GPUs:** +- Tier 1 must have **guaranteed latency** — no competition for CUDA streams +- Tier 2 models share GPU 1 sequentially (they're triggered sporadically, not continuously) +- If Tier 2 queue backs up, Tier 1 is unaffected + +### 11.2 Non-Blocking Routing to RF-DETR + +The key requirement: triggering RF-DETR must **never block** the main YOLO pipeline. + +```python +import asyncio +import torch +from rfdetr import RFDETRMedium +from queue import PriorityQueue +from dataclasses import dataclass, field +from typing import Any + +@dataclass(order=True) +class Tier2Request: + priority: int # lower = higher priority + timestamp: float = field(compare=False) + camera_id: str = field(compare=False) + frame: Any = field(compare=False) # np.ndarray + trigger_type: str = field(compare=False) # "stealing", "tawuran", "uncertain" + track_ids: list = field(compare=False, default_factory=list) + tier1_confidence: float = field(compare=False, default=0.0) + +class Tier2Processor: + """Runs on GPU 1. 
Processes RF-DETR / SlowFast requests from a priority queue.""" + + PRIORITY_MAP = {"fighting_uncertain": 1, "stealing": 2, "tawuran": 3, "audit": 10} + + def __init__(self, device: str = "cuda:1", max_queue: int = 100): + self.device = device + self.queue = PriorityQueue(maxsize=max_queue) + + # Load models on GPU 1 + self.rfdetr = RFDETRMedium() # fine-tuned weights + self.slowfast = SlowFastAnalyzer(device=device) + + # Lazy-load YOLOE only when needed + self._yoloe = None + + @property + def yoloe(self): + if self._yoloe is None: + from ultralytics import YOLO + self._yoloe = YOLO("yoloe-26s.pt").to(self.device) + return self._yoloe + + def submit(self, request: Tier2Request) -> bool: + """Non-blocking submit. Returns False if queue full (drops request).""" + try: + self.queue.put_nowait(request) + return True + except: + return False # backpressure: drop lowest-priority requests + + async def process_loop(self): + """Main Tier 2 processing loop. Runs in background.""" + while True: + if self.queue.empty(): + await asyncio.sleep(0.01) + continue + + request = self.queue.get() + result = await self._process_request(request) + + if result and result["confirmed"]: + await self._dispatch_alert(request, result) + + async def _process_request(self, req: Tier2Request) -> dict: + if req.trigger_type == "stealing": + return await self._confirm_stealing(req) + elif req.trigger_type == "fighting_uncertain": + return await self._confirm_fighting(req) + elif req.trigger_type == "tawuran": + return await self._confirm_tawuran(req) + + async def _confirm_stealing(self, req: Tier2Request) -> dict: + """Multi-model stealing confirmation.""" + # Step 1: RF-DETR high-accuracy detection + rfdetr_detections = self.rfdetr.predict(req.frame, threshold=0.5) + + # Step 2: YOLOE zero-shot screening + self.yoloe.set_classes([ + "person concealing object", + "hand reaching into bag", + "person hiding item under clothing", + "shoplifting", + ]) + yoloe_results = self.yoloe(req.frame) + + # 
Step 3: SlowFast temporal analysis (if clip available) + clip = clip_buffer_manager.get_clip(req.track_ids[0], num_frames=8) + slowfast_result = None + if clip is not None: + slowfast_result = self.slowfast.analyze_clip(clip, "stealing") + + # Aggregate: require at least 2/3 models to agree + votes = 0 + if len(rfdetr_detections) > 0: + votes += 1 + if len(yoloe_results[0].boxes) > 0: + votes += 1 + if slowfast_result and slowfast_result["confidence"] > 0.6: + votes += 1 + + return { + "confirmed": votes >= 2, + "votes": votes, + "rfdetr_count": len(rfdetr_detections), + "yoloe_count": len(yoloe_results[0].boxes), + "slowfast": slowfast_result, + } + + async def _confirm_tawuran(self, req: Tier2Request) -> dict: + """RF-DETR for accurate person count in dense crowd.""" + detections = self.rfdetr.predict(req.frame, threshold=0.3) + person_dets = [d for d in detections if d.class_id == 0] + + return { + "confirmed": len(person_dets) >= 15, + "accurate_count": len(person_dets), + "tier1_estimate": req.tier1_confidence, + } + + async def _dispatch_alert(self, req: Tier2Request, result: dict): + """Send confirmed alert to alert system.""" + alert = { + "type": req.trigger_type, + "camera_id": req.camera_id, + "timestamp": req.timestamp, + "result": result, + "frame_evidence": req.frame, # or save to GridFS + } + # → WebSocket push, API call, database insert, etc. 
+ await alert_dispatcher.send(alert) +``` + +### 11.3 RF-DETR Fine-Tuning for Stealing Detection + +```python +from rfdetr import RFDETRMedium +from rfdetr.config import TrainConfig + +def fine_tune_stealing_detector(): + """Fine-tune RF-DETR-M on custom stealing dataset.""" + + model = RFDETRMedium() + + config = TrainConfig( + dataset_dir="./data/stealing_dataset", # COCO format + num_classes=4, # normal_interaction, concealing, grabbing, fleeing + epochs=50, + batch_size=8, + lr=1e-4, # lower LR for fine-tuning + grad_accum_steps=4, # effective batch 32 + resolution=576, # RF-DETR-M native resolution + augmentation="heavy", + ) + + # RF-DETR proven to generalize well on small datasets (RF100-VL benchmark) + # Minimum recommended: 500 images per class + # Optimal: 2,000+ images per class + + model.train(config) + model.save("rfdetr_m_stealing_v1.pt") +``` + +### 11.4 Dataset Requirements for Stealing Fine-Tuning + +| Class | Description | Min Images | Sources | +|-------|-------------|-----------|---------| +| `normal_interaction` | Person normally handling items | 1,000 | Existing CCTV footage | +| `concealing` | Person hiding item in clothing/bag | 500 | Staged + real footage | +| `grabbing` | Hand reaching for unattended item | 500 | Staged + retail CCTV | +| `fleeing` | Person rapidly leaving after taking | 300 | Staged scenarios | + +--- + +## 12. Hardware Specifications + +### 12.0 Design Principles + +All specifications account for: +- **NVDEC zero-copy decode** — V100's built-in NVDEC engine decodes H.264/H.265 on dedicated fixed-function hardware, consuming 0% CUDA/Tensor cores and ~0.3 GB VRAM per GPU. Frames stay in GPU memory (no CPU copy). +- **NVDEC session limit** — Each V100 has 1 NVDEC engine supporting ~24–32 concurrent decode sessions. Cameras beyond this limit fall back to CPU software decode at ~0.05 core/stream (lightweight at 1 FPS). +- **1 FPS capture rate** — At 1 frame/second, CPU decode overhead is minimal even in software fallback. 
+- **Async bulk MongoDB writes** — Using `motor` (async driver) with `insert_many` batches of 100–500 docs, `ordered=False`. + +### 12.1 GPU Internal Resource Allocation + +NVDEC and CUDA/Tensor cores are **physically separate silicon** inside the V100. They run in parallel with zero interference: + +``` +┌─────────────────────────────────────────────────────────────┐ +│ NVIDIA V100 32 GB │ +│ │ +│ ┌──────────────────────────────┐ ┌─────────────────────┐ │ +│ │ CUDA + Tensor Cores │ │ NVDEC Engine │ │ +│ │ (5,120 CUDA + 640 Tensor) │ │ (fixed-function) │ │ +│ │ │ │ │ │ +│ │ • YOLOv26-Pose inference │ │ • H.264 decode │ │ +│ │ • RF-DETR inference │ │ • H.265 decode │ │ +│ │ • SlowFast inference │ │ • 24–32 sessions │ │ +│ │ • Preprocessing (resize) │ │ • ~0.3 GB VRAM │ │ +│ │ │ │ │ │ +│ │ 100% available for AI │ │ Runs in PARALLEL │ │ +│ │ workloads regardless of │ │ with CUDA cores │ │ +│ │ NVDEC activity │ │ at zero cost │ │ +│ └──────────────────────────────┘ └─────────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────────────────────┐ │ +│ │ HBM2 VRAM (32 GB) — shared, only contention point │ │ +│ │ NVDEC uses ~0.3 GB → 31.7 GB free for inference │ │ +│ └──────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────┘ +``` + +**Impact:** NVDEC decode is effectively "free" — no separate decode GPU needed, no CPU core savings to calculate. The V100 handles both decode and inference simultaneously. 
+ +### 12.2 CPU Load Breakdown (per camera at 1 FPS) + +| Task | Without NVDEC (CPU decode) | With NVDEC (GPU decode) | Notes | +|------|---------------------------|------------------------|-------| +| RTSP network I/O | 0.01 core | 0.01 core | Same — network stack is always CPU | +| H.264 frame decode | 0.30–0.50 core | **0.00 core** | Offloaded to NVDEC | +| Frame copy CPU→GPU | 0.02 core | **0.00 core** | Zero-copy: frame stays on GPU | +| ByteTrack tracking | 0.02 core | 0.02 core | CPU-side, lightweight | +| Behavior analysis | 0.01 core | 0.01 core | NumPy math on keypoint floats | +| Ring buffer management | 0.01 core | 0.01 core | deque operations | +| JPEG encode (crops) | 0.03 core | 0.03 core | CPU (or NVJPEG on GPU) | +| MongoDB async write | 0.01 core | 0.01 core | Amortized via bulk writes | +| **Total per camera** | **0.41–0.61 core** | **0.09 core** | **~78–85% reduction** | + +| Camera Count | CPU Cores (no NVDEC) | CPU Cores (with NVDEC) | Savings | +|-------------|---------------------|----------------------|---------| +| 100 | 41–61 cores | **9 cores** | 78–85% | +| 300 | 123–183 cores | **27 cores** | 78–85% | +| 500 | 205–305 cores | **45 cores** | 78–85% | +| 800 | 328–488 cores | **72 cores** | 78–85% | +| 1,000 | 410–610 cores | **90 cores** | 78–85% | + +> **Note:** Cameras 1–24 use GPU 0 NVDEC, cameras 25–48 use GPU 1 NVDEC (48 total hardware-decoded). Cameras 49+ fall back to CPU software decode at 1 FPS, which is still very light (~0.05 core/stream vs 0.3–0.5 for persistent decode). The 0.09 core/camera figure above is the blended average. + +--- + +### 12.3 Spec A — With Behavior Detection (Fighting + Stealing + Tawuran) + +#### Processing Pipeline + +``` +Camera RTSP → NVDEC (GPU decode) → GPU Memory → YOLOv26-Pose (Tensor Cores) + │ + bbox + keypoints (tiny, few KB) + │ + copy to CPU + │ + ┌───────────────┼───────────────┐ + ▼ ▼ ▼ + ByteTrack Fighting Det. Tawuran Det. + (tracking) (keypoint vel.) 
(crowd density) + │ │ │ + └───────┬───────┘ │ + ▼ ▼ + Suspicious? ──────────→ Tier 2 (GPU 1) + RF-DETR / SlowFast + │ + Confirmed? + │ + ▼ ▼ + Alert MongoDB +``` + +#### GPU Allocation + +| | GPU 0 (V100 #1) — Tier 1 | GPU 1 (V100 #2) — Tier 2 | +|--|--------------------------|--------------------------| +| **NVDEC** | Cameras 1–24 (H.264 decode) | Cameras 25–48 (H.264 decode) | +| **Inference** | YOLOv26-S-Pose (always-on) | RF-DETR-M + X3D-S + YOLOE-26 (on-demand) | +| **Role** | Primary detection + pose | Confirmation + temporal analysis | + +#### VRAM Budget + +| Component | GPU 0 | GPU 1 | +|-----------|-------|-------| +| NVDEC decode buffers (24 streams) | 0.3 GB | 0.3 GB | +| YOLOv26-S-Pose weights (TensorRT FP16) | 0.8 GB | — | +| RF-DETR-M weights | — | 2.5 GB | +| X3D-S weights | — | 0.8 GB | +| YOLOE-26-S weights | — | 1.0 GB | +| CUDA context + allocator | 0.8 GB | 0.8 GB | +| Inference batch buffers (batch=16) | 0.5 GB | 0.6 GB | +| Preprocessing buffers | 0.2 GB | 0.2 GB | +| **Total used** | **2.6 GB** | **6.2 GB** | +| **Free** | **29.4 GB** | **25.8 GB** | + +#### Full Bill of Materials + +| Component | Specification | Purpose | Est. Cost (USD) | +|-----------|--------------|---------|-----------------| +| **CPU** | AMD EPYC 9374F (32C/64T, 3.85 GHz) | RTSP I/O, ByteTrack, behavior analysis, overflow decode. Section 12.2 estimates ~72 cores for 800 cams, which exceeds this part's 32 cores / 64 threads (SMT threads are not full cores): at 0.09 core/camera one 9374F covers roughly 350 cameras. For 800 cameras, choose a 64-core+ SKU or a dual-socket board, or validate the per-camera estimate under load first. | $2,500–3,000 | +| **Motherboard** | Supermicro H13SSL-N or equiv.
(SP5 socket) | Single-socket EPYC, 2× PCIe Gen5 x16 for dual V100 | $500–700 | +| **RAM** | 128 GB DDR5-4800 ECC RDIMM (4×32 GB) | Ring buffers (~3 frames × 1080p × 800 cams ≈ 15 GB), clip buffers (~240 MB per 100 tracked persons), batch queues, OS overhead | $400–500 | +| **GPU 0** | NVIDIA V100 32 GB (already owned) | Tier 1: NVDEC decode (24 streams) + YOLOv26-S-Pose inference | — | +| **GPU 1** | NVIDIA V100 32 GB (already owned) | Tier 2: NVDEC decode (24 streams) + RF-DETR + X3D-S + YOLOE-26 | — | +| **NIC** | Mellanox ConnectX-5 25 GbE (dual-port) | 800 cameras × ~1 Mbps = 800 Mbps sustained. 25 GbE provides 3× headroom. Dual-port for redundancy. | $150–200 | +| **Network Switch** | 10/25 GbE managed switch (if cameras are IP-based) | Aggregate camera traffic. May need multiple switches for 800 cameras. | $500–2,000 (varies) | +| **Boot SSD** | 512 GB NVMe (PCIe Gen4) | OS (Ubuntu 22.04/24.04), CUDA toolkit, Python env, model weights (~5 GB total) | $50–70 | +| **Data SSD** | 2 TB NVMe (Samsung 990 Pro or Micron 7450) | MongoDB data directory (WiredTiger engine), alert clip buffer (10-sec clips per alert) | $150–200 | +| **Archive HDD** | 2× 8 TB HDD (RAID 1 mirror) | Long-term evidence storage. At 100 alerts/day × 10-sec clip × 5 MB = 500 MB/day → 8 TB lasts ~16,000 days. | $240–300 | +| **PSU** | 1,200W 80+ Platinum (redundant if rackmount) | 2× V100 (300W each) + EPYC 9374F (320W) + drives + fans = ~1,000W peak | $200–300 | +| **Chassis** | 4U rackmount server (e.g., Supermicro 4124GS-TNR) | Dual GPU cooling, sufficient airflow for V100 TDP | $300–500 | +| **UPS** | 1,500VA online UPS | Protect against power loss during inference + MongoDB writes | $300–400 | +| | | | | +| **Total (excl. V100s)** | | | **$5,300–8,200** | +| **Total (incl. 
V100s at ~$3,000 each used)** | | | **$11,300–14,200** | + +#### MongoDB Sizing + +| Cameras | Detections/sec (avg 5 persons/frame) | Crops/sec | Write Throughput | Daily Storage | MongoDB Config | +|---------|--------------------------------------|-----------|-----------------|---------------|---------------| +| 100 | 500 | 500 | ~5 MB/s | ~430 GB | Single node, WiredTiger, NVMe | +| 300 | 1,500 | 1,500 | ~15 MB/s | ~1.3 TB | Single node, NVMe, compression | +| 500 | 2,500 | 2,500 | ~25 MB/s | ~2.2 TB | Replica set (3 nodes) | +| 800 | 4,000 | 4,000 | ~40 MB/s | ~3.5 TB | Sharded cluster (2 shards) | + +> **Storage note:** At 800 cameras, you generate ~3.5 TB/day of crop images. Implement a **TTL index** on MongoDB to auto-delete crops older than 7–30 days, or archive to cold storage (HDD/S3). + +#### Power & Cooling + +| Component | TDP / Draw | Notes | +|-----------|-----------|-------| +| 2× V100 32 GB | 600W (300W each) | Requires active cooling, 250W sustained typical | +| EPYC 9374F | 320W | High-clock SKU, sustained ~250W under load | +| NVMe SSDs (×2) | 15W | Negligible | +| HDDs (×2) | 20W | Negligible | +| NIC + fans + misc | 45W | — | +| **Total system** | **~1,000W sustained** | Peak ~1,100W | +| **Cooling** | ~3,400 BTU/hr | Standard 4U rackmount cooling sufficient | + +--- + +### 12.4 Spec B — Without Behavior Detection (Detection + Crop + MongoDB Only) + +This configuration runs YOLOv26 for object detection and bounding box extraction only. No pose estimation, no tracking, no fighting/stealing/tawuran analysis. No Tier 2 models. 
+ +#### Processing Pipeline (Simplified) + +``` +Camera RTSP → NVDEC (GPU decode) → GPU Memory → YOLOv26-S (Tensor Cores) + │ + bbox + class + confidence + │ + crop regions from frame + │ + JPEG encode → MongoDB +``` + +#### GPU Allocation + +| | GPU 0 (V100 #1) | GPU 1 (V100 #2) | +|--|-----------------|-----------------| +| **NVDEC** | Cameras 1–24 | Cameras 25–48 | +| **Inference** | YOLOv26-S (detection only, no pose) | YOLOv26-S (overflow / load balance) | +| **Role** | Primary detection | Overflow detection (or idle) | + +> **Option:** Run both V100s for Tier 1 detection to double throughput, or leave GPU 1 idle to save power (~300W). + +#### VRAM Budget (Single GPU Mode) + +| Component | GPU 0 | GPU 1 (idle or overflow) | +|-----------|-------|--------------------------| +| NVDEC decode buffers | 0.3 GB | 0.3 GB (if used) | +| YOLOv26-S weights (TensorRT FP16) | 0.6 GB | 0.6 GB (if used) | +| CUDA context | 0.8 GB | 0.8 GB (if used) | +| Batch buffers (batch=16) | 0.5 GB | 0.5 GB (if used) | +| **Total used** | **2.2 GB** | **2.2 GB** | +| **Free** | **29.8 GB** | **29.8 GB** | + +#### Full Bill of Materials + +| Component | Specification | Purpose | Est. Cost (USD) | +|-----------|--------------|---------|-----------------| +| **CPU** | AMD EPYC 9274F (24C/48T, 4.05 GHz) or Intel Xeon w5-2465X (16C/32T) | RTSP I/O, overflow decode, JPEG encode, MongoDB writes. No tracking or behavior analysis. | $1,500–2,000 | +| **Motherboard** | Supermicro H13SSL-N or equiv. | Single-socket, 1–2× PCIe x16 | $500–700 | +| **RAM** | 64 GB DDR5-4800 ECC RDIMM (2×32 GB) | Ring buffers only (no clip buffers, no tracking history). 800 cams × 3 frames × 6 MB ≈ 14 GB. | $200–250 | +| **GPU** | 1× NVIDIA V100 32 GB (already owned) | NVDEC decode + YOLOv26-S inference. Second V100 optional for 2× throughput. | — | +| **NIC** | Intel X710-DA2 10 GbE (dual-port) | 500 cameras × ~1 Mbps = 500 Mbps. 10 GbE sufficient. 
| $80–120 | +| **Boot SSD** | 512 GB NVMe | OS + model weights | $50–70 | +| **Data SSD** | 2 TB NVMe | MongoDB data directory | $150–200 | +| **Archive HDD** | 1× 8 TB HDD | Long-term crop storage (optional) | $120–150 | +| **PSU** | 850W 80+ Gold | 1× V100 (300W) + EPYC 9274F (250W) + overhead | $120–150 | +| **Chassis** | 4U rackmount or tower workstation | Single GPU cooling | $200–400 | +| **UPS** | 1,000VA online UPS | Power protection | $200–250 | +| | | | | +| **Total (excl. V100)** | | | **$3,100–5,100** | +| **Total (incl. 1× V100 at ~$3,000 used)** | | | **$6,100–8,100** | + +--- + +### 12.5 Comprehensive Comparison + +| Specification | **With Behavior Detection** | **Without Behavior Detection** | +|--------------|---------------------------|-------------------------------| +| | *Fighting + Stealing + Tawuran* | *Detection + Crop + Save only* | +| | | | +| **CPU** | EPYC 9374F (32C/64T) | EPYC 9274F (24C/48T) | +| **CPU Cores needed** | ~72 (800 cams) | ~45 (800 cams) | +| **RAM** | 128 GB DDR5 ECC | 64 GB DDR5 ECC | +| **GPUs active** | 2× V100 32 GB | 1× V100 32 GB | +| **GPU 0 role** | Tier 1: NVDEC + YOLOv26-Pose | NVDEC + YOLOv26-S | +| **GPU 1 role** | Tier 2: NVDEC + RF-DETR + X3D-S + YOLOE | Idle (or overflow) | +| **GPU VRAM used** | 2.6 GB + 6.2 GB = 8.8 GB | 2.2 GB (single GPU) | +| **GPU VRAM free** | 29.4 + 25.8 = 55.2 GB | 29.8 GB | +| **NIC** | 25 GbE | 10 GbE | +| **Data SSD** | 2 TB NVMe | 2 TB NVMe | +| **PSU** | 1,200W Platinum | 850W Gold | +| **System power** | ~1,000W sustained | ~600W sustained | +| **Cooling** | ~3,400 BTU/hr | ~2,050 BTU/hr | +| | | | +| **Max cameras (1 FPS, batch=16)** | **800–1,120** | **1,120–2,240** | +| **Bottleneck** | GPU 1 (Tier 2 at high trigger rates) | GPU 0 (Tier 1 compute) | +| | | | +| **Models loaded** | YOLOv26-Pose + RF-DETR + X3D-S + YOLOE | YOLOv26-S only | +| **Tracking** | ✅ ByteTrack (person identity across frames) | ❌ No tracking | +| **Fighting detection** | ✅ Keypoint velocity + 
proximity | ❌ | +| **Stealing detection** | ✅ Multi-model confirmation | ❌ | +| **Tawuran detection** | ✅ Crowd density + convergence | ❌ | +| **Temporal analysis** | ✅ X3D-S / SlowFast on video clips | ❌ | +| **Zero-shot detection** | ✅ YOLOE-26 text prompts | ❌ | +| | | | +| **Cost (excl. V100s)** | **$5,300–8,200** | **$3,100–5,100** | +| **Cost (incl. V100s)** | **$11,300–14,200** | **$6,100–8,100** | +| **Cost difference** | — | **Saves $5,200–6,100** | + +### 12.6 Scaling Guide — When to Upgrade + +| Camera Count | Config Needed | CPU | RAM | GPUs | Notes | +|-------------|--------------|-----|-----|------|-------| +| **1–100** | Minimal | 16C (Xeon w5-2445) | 32 GB | 1× V100 | Single GPU handles everything | +| **100–300** | Standard | 24C (EPYC 9274F) | 64 GB | 1× V100 | Add behavior detection with same GPU | +| **300–600** | Recommended | 32C (EPYC 9374F) | 128 GB | 2× V100 | Full hybrid pipeline | +| **600–1,000** | Full | 32C (EPYC 9374F) | 128 GB | 2× V100 | Optimize batch size, reduce trigger rate | +| **1,000–2,000** | Scale-out | 64C (EPYC 9554) | 256 GB | 4× V100 or 2× A100 | Second server or larger GPU | +| **2,000+** | Distributed | Multi-server | 256+ GB each | GPU per server | Kubernetes / distributed inference | + +### 12.7 Network Infrastructure + +| Cameras | Bandwidth (1 FPS, H.264) | NIC Required | Switch Requirement | +|---------|-------------------------|-------------|-------------------| +| 100 | ~100 Mbps | 1 GbE (sufficient) | Standard managed switch | +| 300 | ~300 Mbps | 10 GbE | 10 GbE aggregation switch | +| 500 | ~500 Mbps | 10 GbE | 10 GbE with LACP bonding | +| 800 | ~800 Mbps | 25 GbE | 25 GbE aggregation | +| 1,000+ | ~1 Gbps+ | 25 GbE (dual-port) | 25 GbE with redundancy | + +> **RTSP bandwidth note:** Each 1080p H.264 stream at 1 FPS uses ~0.5–1.5 Mbps depending on scene complexity and I-frame interval. The estimates above use ~1 Mbps average. 
+ +### 12.8 MongoDB Storage Planning + +| Cameras | Crops/day (5 persons/frame avg) | Storage/day (50 KB avg crop) | 30-day retention | Recommended | +|---------|-------------------------------|-----------------------------|--------------------|-------------| +| 100 | 432,000 | 21 GB | 630 GB | 1 TB NVMe | +| 300 | 1,296,000 | 63 GB | 1.9 TB | 2 TB NVMe | +| 500 | 2,160,000 | 105 GB | 3.2 TB | 4 TB NVMe | +| 800 | 3,456,000 | 168 GB | 5.0 TB | 2× 4 TB NVMe (RAID 0) | + +> **Recommendation:** Set a MongoDB **TTL index** to auto-expire documents after 7–30 days. Archive flagged alerts (with behavior detections) to cold storage before expiry. + +## 13. V100 32 GB Deployment Estimate (2× GPUs, 1 FPS per Camera) + +### 13.1 Baseline Latency on V100 + +V100 delivers ~1.5–2× the FP16 throughput of T4 (125 vs 65 TFLOPS). All estimates use **TensorRT FP16**. + +| Model | T4 Benchmark | V100 Estimated | Per-inference VRAM | +|-------|-------------|----------------|-------------------| +| YOLOv26-S-Pose | 2.6 ms | **~1.5 ms** | ~0.8 GB (weights) | +| YOLOv26-M-Pose | 4.7 ms | **~2.8 ms** | ~1.5 GB | +| RF-DETR-M | 4.4 ms | **~2.7 ms** | ~2.5 GB | +| RF-DETR-L | 6.8 ms | **~4.2 ms** | ~3.2 GB | +| SlowFast R50 | 80–120 ms | **~50–80 ms** | ~1.5 GB | +| X3D-S | 40–60 ms | **~25–40 ms** | ~0.8 GB | +| YOLOE-26-S | 2.6 ms | **~1.5 ms** | ~1.0 GB | + +### 13.2 Scenario (a): YOLOv26-Pose Only (Tier 1 Always-On) + +**GPU 0 only.** GPU 1 idle. Detects: fighting, tawuran (no stealing confirmation). 
+ +| Model | Latency/frame | GPU 0 Budget: 1000 ms | GPU utilization | +|-------|--------------|----------------------|-----------------| +| YOLOv26-S-Pose | 1.5 ms | 1000 / 1.5 = **666 frames/sec** | — | + +**VRAM Allocation (GPU 0):** + +| Component | VRAM | +|-----------|------| +| YOLOv26-S-Pose weights | 0.8 GB | +| CUDA context | 0.8 GB | +| Batch buffer (batch=32, 640×640×3×FP16) | 0.8 GB | +| **Total** | **2.4 GB** | +| **Free** | **29.6 GB** | + +| Batch Size | Throughput (frames/sec) | Practical (70% util) | **Max Cameras (1 FPS)** | +|-----------|------------------------|---------------------|------------------------| +| 1 | 666 | 466 | **466** | +| 8 | ~1,200 | 840 | **840** | +| 16 | ~1,600 | 1,120 | **1,120** | +| 32 | ~1,900 | 1,330 | **1,330** | + +> **With 2× V100 both running YOLOv26-Pose:** double all numbers → **2,240–2,660 cameras** at batch=16–32. + +### 13.3 Scenario (b): YOLOv26-Pose + RF-DETR Hybrid (10–20% Trigger) + +**GPU 0:** YOLOv26-S-Pose (always-on) +**GPU 1:** RF-DETR-M (on-demand, triggered by 10–20% of frames) + +**VRAM Allocation:** + +| | GPU 0 | GPU 1 | +|--|-------|-------| +| YOLOv26-S-Pose | 0.8 GB | — | +| RF-DETR-M | — | 2.5 GB | +| YOLOE-26-S (lazy) | — | 1.0 GB | +| CUDA context | 0.8 GB | 0.8 GB | +| Batch buffers | 0.8 GB | 0.6 GB | +| **Total** | **2.4 GB** | **4.9 GB** | +| **Free** | **29.6 GB** | **27.1 GB** | + +**Throughput Calculation:** + +GPU 0 bottleneck (Tier 1 — determines max cameras): + +| Batch | YOLOv26-S-Pose throughput | 70% util | Max cameras | +|-------|--------------------------|----------|-------------| +| 8 | 1,200 fps | 840 | **840** | +| 16 | 1,600 fps | 1,120 | **1,120** | +| 32 | 1,900 fps | 1,330 | **1,330** | + +GPU 1 bottleneck (Tier 2 — must keep up with trigger rate): + +| Cameras | Trigger Rate | Tier 2 frames/sec | RF-DETR-M capacity (370 fps) | Headroom | +|---------|-------------|-------------------|------------------------------|----------| +| 500 | 10% | 50 fps | 370 fps | ✅ 7.4× | 
+| 500 | 20% | 100 fps | 370 fps | ✅ 3.7× | +| 800 | 10% | 80 fps | 370 fps | ✅ 4.6× | +| 800 | 20% | 160 fps | 370 fps | ✅ 2.3× | +| 1,000 | 20% | 200 fps | 370 fps | ✅ 1.8× | +| 1,330 | 20% | 266 fps | 370 fps | ✅ 1.4× | + +> **Tier 2 is never the bottleneck** at these camera counts and trigger rates. GPU 1 has massive headroom. + +**Final capacity for Scenario (b):** + +| Batch | **Max Cameras (1 FPS)** | Bottleneck | +|-------|------------------------|------------| +| 8 | **840** | GPU 0 (Tier 1) | +| 16 | **1,120** | GPU 0 (Tier 1) | +| 32 | **1,330** | GPU 0 (Tier 1) | + +### 13.4 Scenario (c): Full Pipeline (YOLOv26-Pose + RF-DETR + SlowFast) + +**GPU 0:** YOLOv26-S-Pose (always-on) +**GPU 1:** RF-DETR-M + SlowFast R50 + YOLOE-26-S (on-demand, time-shared) + +**VRAM Allocation:** + +| | GPU 0 | GPU 1 | +|--|-------|-------| +| YOLOv26-S-Pose | 0.8 GB | — | +| RF-DETR-M | — | 2.5 GB | +| SlowFast R50 | — | 1.5 GB | +| YOLOE-26-S | — | 1.0 GB | +| CUDA context | 0.8 GB | 0.8 GB | +| Batch buffers | 0.8 GB | 1.0 GB | +| **Total** | **2.4 GB** | **6.8 GB** | +| **Free** | **29.6 GB** | **25.2 GB** | + +> VRAM is comfortable even with all 3 Tier 2 models loaded simultaneously. + +**Throughput — GPU 1 becomes the constraint when SlowFast is active:** + +SlowFast is expensive (~50–80 ms per clip). The question is how often it triggers. 
+ +| Trigger Source | Rate | SlowFast calls/sec | Time consumed | +|---------------|------|-------------------|---------------| +| Fighting uncertain (need temporal confirm) | 2% of cameras | At 800 cams: 16/sec | 16 × 60ms = **960 ms** ⚠️ | +| Stealing suspicious | 5% of cameras | At 800 cams: 40/sec | 40 × 60ms = **2,400 ms** ❌ | + +> **SlowFast at 5% trigger rate on 800 cameras would exceed GPU 1's 1-second budget.** Solutions: + +| Solution | Effect | +|----------|--------| +| **Reduce SlowFast trigger rate** to <2% | 800 × 2% = 16 calls/sec × 60ms = 960 ms ✅ fits | +| **Use X3D-S instead of SlowFast** | 25–40 ms vs 50–80 ms → 2× more capacity | +| **Batch SlowFast clips** | Batch=4 clips → ~120 ms total vs 4×60ms = 240 ms | +| **Time-share with RF-DETR** | RF-DETR handles most triggers; SlowFast only for uncertain cases | + +**Practical capacity with mixed workload on GPU 1:** + +Assume: RF-DETR handles 15% of triggers, SlowFast handles 2%, YOLOE handles 3%. + +| Cameras | RF-DETR (15%) | SlowFast (2%) | YOLOE (3%) | Total GPU 1 time/sec | Feasible? | +|---------|-------------|--------------|------------|---------------------|-----------| +| 400 | 60 × 2.7ms = 162ms | 8 × 60ms = 480ms | 12 × 1.5ms = 18ms | **660 ms** | ✅ | +| 600 | 90 × 2.7ms = 243ms | 12 × 60ms = 720ms | 18 × 1.5ms = 27ms | **990 ms** | ✅ Tight | +| 800 | 120 × 2.7ms = 324ms | 16 × 60ms = 960ms | 24 × 1.5ms = 36ms | **1,320 ms** | ❌ Over budget | + +**With X3D-S instead of SlowFast (25ms instead of 60ms):** + +| Cameras | RF-DETR (15%) | X3D-S (2%) | YOLOE (3%) | Total GPU 1 time/sec | Feasible? 
| +|---------|-------------|-----------|------------|---------------------|-----------| +| 600 | 243ms | 12 × 25ms = 300ms | 27ms | **570 ms** | ✅ | +| 800 | 324ms | 16 × 25ms = 400ms | 36ms | **760 ms** | ✅ | +| 1,000 | 405ms | 20 × 25ms = 500ms | 45ms | **950 ms** | ✅ Tight | + +**Final capacity for Scenario (c):** + +| Config | Batch 16 | **Max Cameras** | Bottleneck | +|--------|---------|----------------|------------| +| SlowFast R50, mixed triggers | 16 | **~600** | GPU 1 (SlowFast) | +| X3D-S (recommended), mixed triggers | 16 | **~800–1,000** | GPU 1 (X3D-S) | +| X3D-S, conservative triggers (1%) | 16 | **~1,120** | GPU 0 (Tier 1) | + +### 13.5 Summary — Camera Capacity (2× V100 32 GB, 1 FPS) + +| Scenario | Batch=8 | Batch=16 | Batch=32 | Bottleneck | +|----------|---------|---------|---------|------------| +| **(a) YOLOv26-Pose only** | 840 | 1,120 | 1,330 | GPU 0 compute | +| **(b) + RF-DETR hybrid (20% trigger)** | 840 | 1,120 | 1,330 | GPU 0 compute | +| **(c) Full pipeline + SlowFast** | ~450 | ~600 | ~700 | GPU 1 (SlowFast) | +| **(c) Full pipeline + X3D-S** | ~650 | ~800–1,000 | ~1,100 | GPU 1 (X3D-S) | + +> **Key takeaway:** At 1 FPS, the V100 GPUs are not the bottleneck for scenarios (a) and (b). The real constraints are: +> 1. **CPU decode** (need 64+ cores for 800+ RTSP streams) +> 2. **Network bandwidth** (800 cameras × 1 Mbps = 800 Mbps → need 10–25 GbE) +> 3. **SlowFast latency** in scenario (c) — mitigated by using X3D-S instead +> 4. **MongoDB write throughput** at 2,000+ docs/sec — use async bulk writes + +--- + +## 14. Pipeline Scenario Analysis — YOLO vs RF-DETR with Behavior Detection + +### 14.1 Candidate Pipeline Scenarios + +Two base pipeline architectures are evaluated for integration with behavior detection. Both capture at **1 FPS per camera** and detect **multiple object classes** — not just persons. 
+ +#### Target Detection Classes + +| Category | Objects | Notes | +|----------|---------|-------| +| **Person** | People (pedestrians, staff, intruders) | Primary target for behavior detection | +| **Vehicle** | Car, truck, bus, motorcycle, bicycle | Parking, traffic monitoring, suspicious vehicles | +| **Bag/Luggage** | Backpack, handbag, suitcase | Unattended bag detection, theft evidence | +| **Weapon** | Knife, gun (requires fine-tuning for many weapon types) | Critical security objects | +| **Animal** | Dog, cat, bird, horse, etc. | Stray animal detection, restricted area intrusion | +| **Other** | Umbrella, cell phone, laptop, etc. | Context-dependent objects | + +> **COCO pretrained models** (both YOLO and RF-DETR) detect **80 object classes** out of the box. Custom classes (specific weapon types, uniform types) require fine-tuning. + +#### Scenario 1 — YOLOv26-Pose Pipeline + +``` +Thread per CCTV stream (capture frame at 1 FPS) + → Save frame in shared memory buffer + → Pool/batch frames for YOLO detection pipeline + → Get bounding boxes (object detection: person, vehicle, bag, weapon, animal, etc.) + → Crop bounding box regions and save as images + → Pool/batch cropped images for async save to MongoDB +``` + +**What YOLO Pose provides per frame:** +- Bounding boxes for **all 80 COCO classes** (person, car, truck, dog, backpack, knife, etc.) +- Class labels + confidence scores for every detected object +- **17 body keypoints** (nose, eyes, shoulders, elbows, wrists, hips, knees, ankles) **for person class only** +- Keypoint confidence scores per keypoint + +> **Key point:** YOLOv26-Pose is NOT person-only. It detects all object classes like standard YOLO, but **additionally** outputs body keypoints for every detected person. The pose head adds negligible latency (~0.1 ms). 
+ +#### Scenario 2 — RF-DETR Pipeline + +``` +Thread per CCTV stream (capture frame at 1 FPS) + → Save frame in shared memory buffer + → Pool/batch frames for RF-DETR detection pipeline + → Get bounding boxes (object detection: person, vehicle, bag, weapon, animal, etc.) + → Crop bounding box regions and save as images + → Pool/batch cropped images for async save to MongoDB +``` + +**What RF-DETR provides per frame:** +- Bounding boxes for **all 80 COCO classes** (same classes as YOLO) +- Class labels + confidence scores +- **Superior small object detection** (bags, weapons, distant animals are better detected) +- **No keypoints, no pose data** for any class + +### 14.2 Key Difference — Output Capabilities + +Both models detect the **same object classes**. The difference is what *extra* information they provide: + +| Output | YOLOv26-Pose | RF-DETR | +|--------|-------------|---------| +| **Multi-class detection** (person, vehicle, bag, weapon, animal) | ✅ 80 COCO classes | ✅ 80 COCO classes | +| Bounding boxes + confidence | ✅ | ✅ | +| **17 body keypoints** (person class only) | ✅ | ❌ | +| Keypoint confidence scores | ✅ | ❌ | +| **Small object detection** (< 32×32 px: distant bags, weapons, animals) | Good | **Superior** (+7 AP) | +| Person detection (medium-large) | >90% AP | >93% AP | +| Vehicle detection | >85% AP | >88% AP | +| Bag/weapon detection (typically small) | Moderate | **Better** (deformable attention) | +| Inference latency (V100 TensorRT FP16) | **~1.5 ms** | ~2.7 ms | + +#### Impact on Behavior Detection + +| Behavior | What's Needed | YOLOv26-Pose | RF-DETR | +|----------|--------------|-------------|---------| +| **Fighting** | Person keypoints (arm velocity, proximity) | ✅ Built-in | ❌ Needs second model | +| **Tawuran** | Person keypoints + crowd count | ✅ Built-in | ⚠️ Count only (no pose) | +| **Stealing** | Hand-to-object proximity (keypoints + bag/object bbox) | ✅ Hand keypoints + bag bbox in same pass | ⚠️ Bag bbox only, no hand position 
| +| **Unattended bag** | Bag bbox without nearby person bbox | ✅ | ✅ (both detect bags) | +| **Suspicious vehicle** | Vehicle bbox + loitering time | ✅ | ✅ (both detect vehicles) | +| **Animal intrusion** | Animal bbox in restricted zone | ✅ | ✅ (both detect animals) | + +**Critical implication:** For behaviors requiring **only bounding boxes** (unattended bag, vehicle loitering, animal intrusion), both models work equally well. For behaviors requiring **body keypoints** (fighting, tawuran, stealing), RF-DETR cannot do it alone — you must add YOLO Pose as a second model, doubling inference cost. + +> **This is why YOLO Pose is recommended as Tier 1:** it handles ALL detection classes AND provides keypoints for person-specific behavior analysis in a single inference pass. RF-DETR's small object advantage is best leveraged on-demand (Tier 2) to confirm small objects like weapons, bags, or distant persons that YOLO may have missed. + +--- + +### 14.3 SlowFast vs YOLO Pose for Behavior Detection at 1 FPS + +#### How SlowFast Works + +SlowFast is a **video-level temporal action recognition** model. It requires a **clip** (sequence of frames) as input: + +| Pathway | Purpose | Typical Sampling | What Happens at 1 FPS | +|---------|---------|-----------------|----------------------| +| **Slow pathway** | Spatial semantics (what is happening) | 8 frames, stride 8 → 2.1s at 30 FPS | 8 frames = **8 seconds** of context | +| **Fast pathway** | Rapid motion capture (how it moves) | 32 frames, stride 2 → 2.1s at 30 FPS | 32 frames = **32 seconds** of context | + +#### The Problem: Fast Pathway is Useless at 1 FPS + +The fast pathway is designed to capture **rapid temporal changes** — a fist swing (~0.3s), a grab (~0.5s), a sudden lunge (~0.2s). At 1 FPS, these events happen **between** frames and are never captured. The fast pathway receives what is essentially a slideshow with no useful motion signal. 
+ +#### Comparison: SlowFast vs YOLO Pose at 1 FPS + +| Factor | YOLO Pose (Recommended) | SlowFast | +|--------|------------------------|----------| +| **Works at 1 FPS?** | ✅ Yes — single-frame keypoints | ❌ Poorly — fast pathway is blind | +| **Latency per inference** | ~1.5 ms (single frame) | ~60 ms (8–32 frame clip) | +| **Memory overhead** | Zero buffering | Must buffer 8–32 frames per tracked person (~2.4 MB/person) | +| **Fighting detection** | Keypoint proximity + inter-frame velocity | Clip-level action class (degraded accuracy at 1 FPS) | +| **Tawuran detection** | Crowd density + centroid convergence | Could classify "crowd violence" but overkill | +| **Stealing detection** | Hand-object proximity heuristic | Needs additional model regardless | +| **Pipeline complexity** | Simple — inline with detection (one model) | Complex — clip buffer, separate inference, temporal alignment | +| **Accuracy at 1 FPS** | Good — poses are spatially informative | **Degraded** — temporal signal too sparse | +| **GPU cost (always-on)** | 1.5 ms/frame | 60 ms/clip + buffering VRAM | + +#### When SlowFast Does Make Sense + +SlowFast becomes valuable only with **temporarily increased frame rate** for suspicious cameras: + +``` +Normal operation: Camera at 1 FPS → YOLO Pose → "suspicious pose detected" +Triggered mode: Camera bumps to 15 FPS for 3 seconds → SlowFast confirms/denies + (45 frames captured → proper temporal signal for both pathways) +``` + +This is a valid **Tier 2 confirmation strategy** but not a primary detector at 1 FPS. + +#### Verdict + +> **At 1 FPS, YOLO Pose is strictly superior to SlowFast for behavior detection.** SlowFast's core architectural advantage (dual-pathway temporal modeling) is neutralized by the sparse frame rate. YOLO Pose provides spatially rich keypoint data on every single frame with no buffering overhead. 
+> +> **Recommendation:** Use SlowFast only as an optional Tier 2 confirmer when the camera can temporarily increase to 15+ FPS upon trigger. For the always-on pipeline at 1 FPS, rely on YOLO Pose keypoint analysis. + +--- + +### 14.4 RF-DETR as On-Demand (Tier 2) — Rationale + +#### RF-DETR's Small Object Advantage + +RF-DETR uses **deformable attention** that focuses on arbitrary-scale features. This gives it a significant edge on small objects: + +| Model | AP_small (< 32×32 px) | AP_medium | AP_large | +|-------|----------------------|----------|---------| +| RF-DETR-M | **~37** | ~58 | ~71 | +| YOLOv26-S | ~22 | ~52 | ~68 | +| YOLOv26-M | ~30 | ~56 | ~70 | + +#### Why Small Object Advantage Doesn't Help for Always-On Behavior Detection + +COCO "small" = **< 32×32 pixels**. In surveillance terms: + +| Camera Setup | Person Size | COCO Category | Can Analyze Behavior? | +|-------------|------------|---------------|----------------------| +| 1080p, person at 5m | ~300×600 px | Large | ✅ Yes — full pose visible | +| 1080p, person at 15m | ~100×200 px | Medium | ✅ Yes — keypoints detectable | +| 1080p, person at 30m | ~50×100 px | Medium | ⚠️ Limited — coarse pose only | +| 1080p, person at 50m+ | ~25×50 px | **Small** | ❌ No — blurry blob, no useful pose | + +> **Key insight:** If a person is small enough that YOLO misses it but RF-DETR detects it, that person is too small for behavior analysis anyway. There aren't enough pixels to determine fighting, stealing, or tawuran. 
+ +#### Where RF-DETR's Strength IS Valuable (On-Demand) + +RF-DETR excels when triggered to confirm specific events: + +| On-Demand Task | Why RF-DETR is Better | +|---------------|----------------------| +| **Stolen object detection** | Bags, phones, wallets are small objects — RF-DETR's sweet spot | +| **Weapon detection** | Knives, guns are small — RF-DETR detects more reliably | +| **Precise person-object interaction** | Higher AP means fewer false negatives on critical frames | +| **Distant crowd counting** | For tawuran, counting heads at distance where YOLO may miss some | + +#### Cost Analysis: Always-On vs On-Demand + +| RF-DETR Mode | Per-Frame Cost | At 800 Cameras | GPU Utilization | +|-------------|---------------|----------------|----------------| +| **Always-on (every frame)** | 2.7 ms/frame | 2,160 ms/sec (exceeds one GPU's 1,000 ms/sec budget) | ~216% — does not fit on GPU 1 alone | +| **On-demand (5% trigger rate)** | 2.7 ms × 5% = **0.14 ms/frame** | 108 ms/sec | **~11% of GPU 1** | +| **On-demand (20% trigger rate)** | 2.7 ms × 20% = **0.54 ms/frame** | 432 ms/sec | **~43% of GPU 1** | + +Running RF-DETR on-demand at a 5% trigger rate uses **~95% less GPU** than always-on, while still providing high-accuracy confirmation exactly when it matters. + +#### The On-Demand Trigger Flow + +``` +YOLO Pose (always-on, every frame, ~1.5 ms) + │ + ├── 95% of frames: normal activity + │ └── Crop → MongoDB (done, no Tier 2 needed) + │ + └── 5% of frames: suspicious activity detected + │ (e.g., aggressive pose, hand near another's bag, + │ abnormal crowd convergence) + │ + └── RF-DETR (on-demand, ~2.7 ms, this frame only) + │ + ├── Confirm: stolen object in hand? + ├── Confirm: weapon detected? + ├── Confirm: precise person count in crowd? + │ + └── If confirmed → Alert + Evidence saved + If denied → Suppress false alarm +``` + +> **Verdict:** RF-DETR should be **on-demand (Tier 2)**, not always-on. 
Its small object strength is most valuable for confirming specific detections (stolen objects, weapons), not for primary surveillance scanning. Running it always-on wastes GPU and provides no behavior detection capability without a second model. + +--- + +### 14.5 Recommended Architecture — Final Pipeline + +Based on the analysis in Sections 14.1–14.4, the recommended architecture is: + +- **Tier 1 (always-on):** YOLOv26-S-Pose — detection + pose in a single 1.5 ms pass +- **Tier 2 (on-demand):** RF-DETR-M — high-accuracy confirmation for suspicious events +- **Behavior analysis:** CPU-side keypoint math (fighting, tawuran) + RF-DETR object confirmation (stealing) +- **SlowFast:** Optional Tier 2 only if camera can temporarily increase to 15+ FPS + +#### Complete Pipeline Flowchart + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ CAPTURE LAYER │ +│ │ +│ Thread per CCTV stream (1 FPS capture) │ +│ → RTSP connection (TCP, persistent) │ +│ → NVDEC hardware decode (cameras 1–48 on GPU, rest on CPU) │ +│ → Frame saved to shared memory ring buffer (3 slots per camera) │ +│ → Backpressure: drop-oldest if buffer full │ +└──────────────────────────────┬──────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ TIER 1 — ALWAYS-ON (GPU 0, every frame, every camera) │ +│ │ +│ Pool/batch frames (batch_size=16) │ +│ → YOLOv26-S-Pose inference (~1.5 ms/frame, TensorRT FP16) │ +│ → Output per frame: │ +│ • Bounding boxes for ALL classes (person, vehicle, bag, animal...) 
│ +│ • Class labels + confidence scores for every detected object │ +│ • 17 body keypoints per PERSON (with confidence per keypoint) │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ OBJECT-LEVEL PROCESSING (all detected objects) │ │ +│ │ │ │ +│ │ For ALL classes: │ │ +│ │ • Crop bounding box → JPEG encode → batch for MongoDB │ │ +│ │ • Store metadata: class, confidence, bbox, camera_id, timestamp │ │ +│ │ │ │ +│ │ For VEHICLES: │ │ +│ │ • Track vehicle bbox across frames (loitering detection) │ │ +│ │ • Restricted zone violation (bbox inside forbidden region) │ │ +│ │ │ │ +│ │ For BAGS/LUGGAGE: │ │ +│ │ • Unattended bag: bag bbox with no person bbox within radius │ │ +│ │ │ │ +│ │ For ANIMALS: │ │ +│ │ • Restricted area intrusion: animal bbox in forbidden zone │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ PERSON-SPECIFIC BEHAVIOR ANALYSIS (CPU, ~0.01 ms/person) │ │ +│ │ (uses keypoints — only available for person class) │ │ +│ │ │ │ +│ │ FightingDetector: │ │ +│ │ • Keypoint velocity between consecutive frames │ │ +│ │ • Inter-person proximity (wrist-to-head distance < threshold) │ │ +│ │ • Aggressive pose classification (raised arms, lunging torso) │ │ +│ │ │ │ +│ │ TawuranDetector: │ │ +│ │ • Crowd density (persons per m² exceeds threshold) │ │ +│ │ • Centroid convergence (people moving toward each other) │ │ +│ │ • Collective aggression score (% of crowd with fighting pose) │ │ +│ │ │ │ +│ │ StealingHeuristic (pre-screen): │ │ +│ │ • Hand keypoint proximity to detected bag/object bbox │ │ +│ │ • Unusual hand-to-bag/pocket trajectory │ │ +│ │ • Person-to-person hand interaction near object │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +└──────────────┬──────────────────────────────────┬──────────────────────────┘ + │ │ + Normal (95%) Suspicious (5%) + │ │ + ▼ ▼ 
+┌──────────────────────────────┐ ┌──────────────────────────────────────────────┐ +│ STANDARD OUTPUT │ │ TIER 2 — ON-DEMAND (GPU 1, triggered only) │ +│ │ │ │ +│ Crop ALL detected objects │ │ RF-DETR-M inference (~2.7 ms/frame) │ +│ → JPEG encode │ │ → High-accuracy re-detection of scene │ +│ → Batch (100–500 docs) │ │ → Small object detection (bags, weapons, │ +│ → Async MongoDB write │ │ phones, knives — RF-DETR excels here) │ +│ (motor driver) │ │ → Detects objects YOLO may have missed │ +│ │ │ │ +│ Metadata saved per object: │ │ Confirmation Logic: │ +│ • camera_id │ │ • Stealing: object in hand + proximity │ +│ • timestamp │ │ • Weapon: object class = knife/gun/etc │ +│ • object_class (person, │ │ • Unattended bag: re-confirm no owner │ +│ vehicle, bag, animal...) │ │ • False alarm: RF-DETR sees no anomaly │ +│ • bbox coordinates │ │ → Suppress alert, save as normal frame │ +│ • confidence score │ │ │ +│ • person_count (per frame) │ │ If confirmed: │ +│ • vehicle_count │ │ → Alert (webhook / push notification) │ +│ • keypoints (person only) │ │ → Evidence clip saved (10-sec buffer) │ +│ │ │ → Behavior metadata → MongoDB │ +└──────────────────────────────┘ └──────────────────────────────────────────────┘ +``` + +#### Processing Timeline (single frame, single camera) + +``` +Time (ms): 0 1.5 1.51 1.52 4.22 + │ │ │ │ │ + ▼ ▼ ▼ ▼ ▼ + YOLOv26 Result Behavior Crop+Save RF-DETR + -Pose ready Analysis to MongoDB (only if + starts (CPU, (async, suspicious) + ~0.01ms) non-blocking) + +Normal frame total: ~1.52 ms (Tier 1 only) +Suspicious frame: ~4.22 ms (Tier 1 + Tier 2) +Blended average: ~1.52 + (0.05 × 2.7) = ~1.66 ms/frame +``` + +#### GPU Assignment + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ GPU 0 (V100 #1) — TIER 1: Always-On Detection + Pose │ +│ │ +│ NVDEC: cameras 1–24 (H.264 → GPU memory, zero-copy) │ +│ Model: YOLOv26-S-Pose (TensorRT FP16, 0.8 GB) │ +│ Batch: 16 frames → 16 × 1.5 ms = 24 ms → 667 cameras/sec │ +│ VRAM: 
2.6 GB used / 29.4 GB free │ +└─────────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────────┐ +│ GPU 1 (V100 #2) — TIER 2: On-Demand Confirmation │ +│ │ +│ NVDEC: cameras 25–48 (H.264 → GPU memory, zero-copy) │ +│ Model: RF-DETR-M (TensorRT FP16, 2.5 GB) │ +│ Load: 5% of frames → ~33–56 frames/sec (at 667–1120 cameras) │ +│ VRAM: 4.4 GB used / 27.6 GB free │ +│ │ +│ ⚡ ~95% idle — available for future models or scaling │ +└─────────────────────────────────────────────────────────────────┘ +``` + +--- + +### 14.6 Hardware Requirements — Scenario Comparison + +#### Scenario 1: YOLOv26-Pose (Tier 1) + RF-DETR On-Demand (Tier 2) — RECOMMENDED + +This is the full behavior detection pipeline with NVDEC zero-copy decode. + +| Component | Specification | Purpose | Est. Cost (USD) | +|-----------|--------------|---------|-----------------:| +| **CPU** | AMD EPYC 9374F (32C/64T, 3.85 GHz) | RTSP I/O, overflow decode (~0.09 core/cam with NVDEC), ByteTrack, behavior analysis | $2,500–3,000 | +| **Motherboard** | Supermicro H13SSL-N (SP5 socket) | Single-socket EPYC, 2× PCIe Gen5 x16 for dual V100 | $500–700 | +| **RAM** | 128 GB DDR5-4800 ECC RDIMM (4×32 GB) | Ring buffers, batch queues, tracking state, OS overhead | $400–500 | +| **GPU 0** | NVIDIA V100 32 GB *(already owned)* | Tier 1: NVDEC (24 streams) + YOLOv26-S-Pose (2.6 GB VRAM used) | — | +| **GPU 1** | NVIDIA V100 32 GB *(already owned)* | Tier 2: NVDEC (24 streams) + RF-DETR-M on-demand (4.4 GB VRAM used) | — | +| **NIC** | Mellanox ConnectX-5 25 GbE (dual-port) | 800 cameras × ~1 Mbps = 800 Mbps sustained | $150–200 | +| **Boot SSD** | 512 GB NVMe (PCIe Gen4) | OS, CUDA toolkit, Python env, model weights (~5 GB) | $50–70 | +| **Data SSD** | 2 TB NVMe (Samsung 990 Pro) | MongoDB data dir + alert clip buffer | $150–200 | +| **Archive HDD** | 2× 8 TB HDD (RAID 1) | Long-term evidence storage | $240–300 | +| **PSU** | 1,200W 80+ Platinum | 2× 
V100 (300W) + EPYC (320W) + overhead | $200–300 | +| **Chassis** | 4U rackmount server | Dual GPU cooling, sufficient airflow | $300–500 | +| **UPS** | 1,500VA online UPS | Power protection during writes | $300–400 | +| | | | | +| **Total (excl. V100s)** | | | **$4,800–6,200** | +| **Total (incl. V100s at ~$3K each)** | | | **$10,800–12,200** | + +**Capacity:** 800–1,120 cameras at 1 FPS with behavior detection (fighting, stealing, tawuran). + +#### Scenario 2: RF-DETR Always-On (No Behavior Detection) + +Detection + crop + save only. No pose, no behavior analysis, no Tier 2. + +| Component | Specification | Purpose | Est. Cost (USD) | +|-----------|--------------|---------|-----------------:| +| **CPU** | AMD EPYC 9274F (24C/48T, 4.05 GHz) | RTSP I/O, overflow decode, JPEG encode, MongoDB writes | $1,500–2,000 | +| **Motherboard** | Supermicro H13SSL-N (SP5 socket) | Single-socket EPYC | $500–700 | +| **RAM** | 64 GB DDR5-4800 ECC RDIMM (2×32 GB) | Ring buffers only (no clip buffers, no tracking state) | $200–250 | +| **GPU** | 1× NVIDIA V100 32 GB *(already owned)* | NVDEC (24 streams) + RF-DETR-M always-on (4.4 GB VRAM used) | — | +| **NIC** | Intel X710-DA2 10 GbE (dual-port) | 500 cameras × ~1 Mbps ≈ 500 Mbps sustained (ample headroom over the ~370-camera capacity) | $80–120 | +| **Boot SSD** | 512 GB NVMe | OS + model weights | $50–70 | +| **Data SSD** | 2 TB NVMe | MongoDB data directory | $150–200 | +| **Archive HDD** | 1× 8 TB HDD | Crop storage (optional) | $120–150 | +| **PSU** | 850W 80+ Gold | 1× V100 (300W) + EPYC (250W) + overhead | $120–150 | +| **Chassis** | 4U rackmount or tower workstation | Single GPU cooling | $200–400 | +| **UPS** | 1,000VA online UPS | Power protection | $200–250 | +| | | | | +| **Total (excl. V100)** | | | **$3,100–4,300** | +| **Total (incl. 1× V100 at ~$3K)** | | | **$6,100–7,300** | + +**Capacity:** ~370 cameras at 1 FPS (RF-DETR-M at 2.7 ms/frame, batch=16). Detection + crop only, no behavior detection.
+ +> **Note on Scenario 2 throughput:** RF-DETR is 1.8× slower than YOLOv26-S per frame (2.7 ms vs 1.5 ms). If you use a single V100 with RF-DETR always-on, maximum cameras drops from ~667 (YOLO) to **~370** per GPU. Using both V100s for RF-DETR would recover to ~740, but then you have no GPU for Tier 2 confirmation. + +--- + +### 14.7 Side-by-Side Comparison + +| Specification | **Scenario 1 (Recommended)** | **Scenario 2** | +|--------------|------------------------------|----------------| +| | *YOLO Pose + RF-DETR on-demand* | *RF-DETR always-on, no behavior* | +| | | | +| **Primary model** | YOLOv26-S-Pose (1.5 ms) | RF-DETR-M (2.7 ms) | +| **Secondary model** | RF-DETR-M (on-demand, 5%) | None | +| **Detection classes** | 80 COCO (person, vehicle, bag, animal, etc.) | 80 COCO (same classes) | +| **Keypoints (person)** | ✅ 17 body keypoints per person | ❌ None | +| **Behavior analysis** | CPU keypoint math | ❌ Not available | +| | | | +| **CPU** | EPYC 9374F (32C/64T) | EPYC 9274F (24C/48T) | +| **RAM** | 128 GB DDR5 ECC | 64 GB DDR5 ECC | +| **GPUs active** | 2× V100 32 GB | 1× V100 32 GB | +| **NIC** | 25 GbE | 10 GbE | +| **PSU** | 1,200W Platinum | 850W Gold | +| **System power** | ~1,000W sustained | ~600W sustained | +| | | | +| **Max cameras (1 FPS)** | **800–1,120** | **~370** (1 GPU) / ~740 (2 GPU) | +| **Small object detection** (bags, weapons, animals at distance) | On-demand via RF-DETR (Tier 2) | ✅ Always-on | +| **Person detection AP** | ~90%+ (YOLO, sufficient) | ~93%+ (RF-DETR, higher) | +| **Vehicle detection** | ✅ (large objects, both models excel) | ✅ | +| **Bag/weapon detection** | ✅ YOLO detects + RF-DETR confirms small ones | ✅ Better for small/distant objects | +| **Animal detection** | ✅ | ✅ | +| | | | +| **Fighting detection** | ✅ Keypoint velocity + proximity | ❌ No keypoints | +| **Tawuran detection** | ✅ Crowd density + convergence | ❌ No keypoints | +| **Stealing detection** | ✅ Hand-to-bag keypoint + RF-DETR confirm | ❌ No hand 
position data | +| **Unattended bag** | ✅ Bag bbox + no nearby person bbox | ✅ Same logic, slightly better bag AP | +| **Vehicle loitering** | ✅ Track vehicle bbox across frames | ✅ Same capability | +| **Animal intrusion** | ✅ Animal bbox in restricted zone | ✅ Same capability | +| **Weapon detection** | ⚠️ YOLO detects + RF-DETR Tier 2 confirms | ✅ Better small weapon AP always-on | +| **False alarm suppression** | ✅ RF-DETR Tier 2 confirmation | ❌ No confirmation layer | +| | | | +| **Cost (excl. V100s)** | **$4,800–6,200** | **$3,100–4,300** | +| **Cost (incl. V100s)** | **$10,800–12,200** | **$6,100–7,300** | +| **Cost difference** | — | Saves $1,700–1,900 (excl. V100s) / $4,700–4,900 (incl. V100s) | +| | | | +| **Best for** | Full surveillance: multi-class detection + behavior analysis | Simple multi-class detection + crop + save | + +### 14.8 Final Recommendation + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ │ +│ ✅ RECOMMENDED: Scenario 1 — YOLOv26-Pose (Tier 1) + RF-DETR (Tier 2) │ +│ │ +│ • Detects ALL object classes (person, vehicle, bag, weapon, animal) │ +│ • ADDITIONALLY provides 17 body keypoints for every detected person │ +│ • One model handles multi-class detection + behavior analysis in ~1.5 ms │ +│ • RF-DETR confirms suspicious events on-demand (~5% of frames) │ +│ → excels at confirming small objects: bags, weapons, distant animals │ +│ • 800–1,120 cameras on 2× V100 at 1 FPS │ +│ • Full fighting, stealing, and tawuran detection │ +│ • Bbox-only behaviors (unattended bag, vehicle loitering, animal │ +│ intrusion) work with both models — no keypoints needed │ +│ • GPU 1 is ~95% idle → room for future expansion │ +│ │ +│ ❌ NOT RECOMMENDED: SlowFast at 1 FPS │ +│ • Fast pathway receives no useful temporal signal at 1 FPS │ +│ • Use only as optional Tier 2 with temporary FPS increase (15+ FPS) │ +│ │ +│ ❌ NOT RECOMMENDED: RF-DETR as always-on primary │ +│ • Detects all classes well, but NO keypoints for any class │ +│ • Cannot detect
fighting/tawuran/stealing without adding YOLO Pose │ +│ • 1.8× slower → fewer cameras per GPU │ +│ • Small object advantage is valuable but better leveraged on-demand │ +│ • Best role: Tier 2 confirmer for stolen objects / weapons / small items │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` diff --git a/e2e_test_output/docling/BOOKRAG_VLDB_2026_full_merged_content.json b/e2e_test_output/docling/BOOKRAG_VLDB_2026_full_merged_content.json new file mode 100644 index 0000000..fede754 --- /dev/null +++ b/e2e_test_output/docling/BOOKRAG_VLDB_2026_full_merged_content.json @@ -0,0 +1,3141 @@ +[ + { + "type": "text", + "text": "BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents", + "text_level": 0, + "page_idx": 0, + "pdf_id": 0, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "Shu Wang The Chinese University of Hong Kong, Shenzhen shuwang3@link.cuhk.edu.cn", + "text_level": -1, + "page_idx": 0, + "pdf_id": 1, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Yingli Zhou The Chinese University of Hong Kong, Shenzhen yinglizhou@link.cuhk.edu.cn", + "text_level": -1, + "page_idx": 0, + "pdf_id": 2, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Yixiang Fang The Chinese University of Hong Kong, Shenzhen fangyixiang@cuhk.edu.cn", + "text_level": -1, + "page_idx": 0, + "pdf_id": 3, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "ABSTRACT", + "text_level": 0, + "page_idx": 0, + "pdf_id": 4, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "As an effective method to boost the performance of Large Language Models (LLMs) on the question answering (QA) task, RetrievalAugmented Generation (RAG), which queries highly relevant information from external complex 
documents, has attracted tremendous attention from both industry and academia. Existing RAG approaches often focus on general documents, and they overlook the fact that many real-world documents (such as books, booklets, handbooks, etc.) have a hierarchical structure, which organizes their content from different granularity levels, leading to poor performance for the QA task. To address these limitations, we introduce BookRAG, a novel RAG approach targeted for documents with a hierarchical structure, which exploits logical hierarchies and traces entity relations to query the highly relevant information. Specifically, we build a novel index structure, called BookIndex, by extracting a hierarchical tree from the document, which serves as the role of its table of contents, using a graph to capture the intricate relationships between entities, and mapping entities to tree nodes. Leveraging the BookIndex, we then propose an agent-based query method inspired by the Information Foraging Theory, which dynamically classifies queries and employs a tailored retrieval workflow. Extensive experiments on three widely adopted benchmarks demonstrate that BookRAG achieves state-of-the-art performance, significantly outperforming baselines in both retrieval recall and QA accuracy while maintaining competitive efficiency.", + "text_level": -1, + "page_idx": 0, + "pdf_id": 5, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "PVLDB Reference Format:", + "text_level": 0, + "page_idx": 0, + "pdf_id": 6, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "Shu Wang, Yingli Zhou, and Yixiang Fang. BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents. PVLDB, 19(1): XXX-XXX, 2025. 
doi:XX.XX/XXX.XX", + "text_level": -1, + "page_idx": 0, + "pdf_id": 7, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "PVLDB Artifact Availability:", + "text_level": 0, + "page_idx": 0, + "pdf_id": 8, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "The source code, data, and/or other artifacts have been made available at https://github.com/sam234990/BookRAG.", + "text_level": -1, + "page_idx": 0, + "pdf_id": 9, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "1 INTRODUCTION", + "text_level": 0, + "page_idx": 0, + "pdf_id": 10, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "Large Language Models (LLMs) such as Qwen 3 [60] and Gemini 2.5 [13] have revolutionized the Question Answering (QA) system [15, 61, 65]. The industry has increasingly adopted LLMs to build QA systems that assist users and reduce manual effort in", + "text_level": -1, + "page_idx": 0, + "pdf_id": 11, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "This work is licensed under the Creative Commons BY-NC-ND 4.0 International License. Visit https://creativecommons.org/licenses/by-nc-nd/4.0/ to view a copy of this license. For any use beyond those covered by this license, obtain permission by emailing info@vldb.org. Copyright is held by the owner/author(s). Publication rights licensed to the VLDB Endowment.", + "text_level": -1, + "page_idx": 0, + "pdf_id": 12, + "middle_json": { + "docling_label": "footnote" + } + }, + { + "type": "text", + "text": "Proceedings of the VLDB Endowment, Vol. 19, No. 1 ISSN 2150-8097. 
doi:XX.XX/XXX.XX", + "text_level": -1, + "page_idx": 0, + "pdf_id": 13, + "middle_json": { + "docling_label": "footnote" + } + }, + { + "type": "text", + "text": "Figure 1: Comparison of existing methods and BookRAG for complex document QA.", + "text_level": -1, + "page_idx": 0, + "pdf_id": 14, + "middle_json": { + "docling_label": "caption" + } + }, + { + "type": "image", + "text": "", + "text_level": -1, + "page_idx": 0, + "pdf_id": 15, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-1.png", + "image_caption": [ + "cref='#/texts/14'" + ], + "image_footnote": [], + "middle_json": { + "docling_label": "picture" + } + }, + { + "type": "text", + "text": "many applications [65, 67], such as financial auditing [29, 37], legal compliance [8], and scientific discovery [56]. However, directly relying on LLMs may lead to missing domain knowledge and generating outdated or unsupported information. To address these issues, Retrieval-Augmented Generation (RAG) has been widely adopted [17, 22] by retrieving relevant domain knowledge from external sources and using it to guide the LLM during response generation. On the other hand, in real-world enterprise scenarios, domain knowledge is often stored in long-form documents, such as technical handbooks, API reference manuals, and operational guidebooks [49]. A notable feature of such documents is that they follow the structure of books, characterized by intricate layouts and rigorous logical hierarchies (e.g., explicit tables of contents, nested chapters, and multi-level sections). In this paper, we aim to design an effective RAG system for QA over long and highly structured documents.", + "text_level": -1, + "page_idx": 0, + "pdf_id": 16, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "· Prior works. The existing RAG approaches for documentlevel QA generally fall into two paradigms, as illustrated in Figure 1. 
The first paradigm relies on OCR (Optical Character Recognition) to convert the document into plain text, after which any text-based RAG method can be directly applied. Among text-based RAG methods, state-of-the-art approaches increasingly adopt graph-based RAG [6, 62, 66], where graph data serves as an external knowledge source because it captures rich semantic information and the", + "text_level": -1, + "page_idx": 0, + "pdf_id": 17, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "Table 1: Comparison of representative methods and our BookRAG.", + "text_level": -1, + "page_idx": 1, + "pdf_id": 18, + "middle_json": { + "docling_label": "caption" + } + }, + { + "type": "table", + "text": "", + "text_level": -1, + "page_idx": 1, + "pdf_id": 19, + "img_path": "", + "table_caption": [ + "cref='#/texts/17'" + ], + "table_footnote": [], + "table_body": "| Type | Representative Method | Core Feature | Multi-hop Reasoning | Document Parsing | Query Workflow |\n|------------------|---------------------------|--------------------------------------------------------------|-----------------------|--------------------|------------------|\n| Graph-based | RAPTOR [45] GraphRAG [16] | Recursive summarization | | | Static |\n| Layout segmented | MM-Vanilla | Global community detection | | | Static |\n| Layout segmented | DocETL [47] | Multi-modal retrieval LLM-based document processing pipeline | | | Static Manual |\n| Doc-Native | BookRAG (Ours) | Structure-award Index & Agent-based retrieval | | | Dynamic |", + "middle_json": { + "docling_label": "table" + } + }, + { + "type": "text", + "text": "relational structure between entities. As shown in Table 1, two representative methods are GraphRAG [16] and RAPTOR [45]. Specifically, GraphRAG first constructs a knowledge graph (KG) from the textual corpus, and then applies the Leiden community detection algorithm [51] to obtain hierarchical clusters. 
Summaries are generated for each community, providing a comprehensive, global overview of the entire corpus. RAPTOR builds a recursive tree structure by iteratively clustering document chunks and summarizing them at each level, enabling the model to capture both fine-grained and high-level semantic information across the corpus.", + "text_level": -1, + "page_idx": 1, + "pdf_id": 20, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "In contrast, the second paradigm, layout-aware segmentation [5, 52], first parses the document into structured blocks that preserve the original layout and information of the document, such as paragraphs, tables, figures, or equations. By doing so, it not only avoids the fixed chunk size used in the first paradigm, which often leads to fragmented information, but also retains document-native structural information. These blocks often exhibit multimodal characteristics, and a typical approach is to apply multimodal retrieval to obtain relevant content for answering queries. Recently, a state-ofthe-art method in this category, DocETL [47], provides a declarative interface that allows users to manually define LLM-based processing pipelines to analyze the retrieved blocks. These pipelines consist of LLM-powered operations combined with task-specific optimizations.", + "text_level": -1, + "page_idx": 1, + "pdf_id": 21, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "· Limitations of existing works. However, these methods suffer from two fundamental limitations ( L for short): L1: Failure to capture the deep connection of document structure and semantics. Text-based approaches cannot capture the structural layout of the document, resulting in the loss of important relationships stored in the hierarchical blocks, such as tables nested within a specific section. 
While layout-segmented methods preserve document structure, they cannot capture the relationships between different blocks in the document, which limits their capability for multi-hop reasoning across these blocks and ultimately affects their overall performance. L2: Static of query workflows. In real-world QA scenarios, user queries are highly heterogeneous, ranging from simple keyword lookups to complex multi-hop questions that require synthesizing evidence scattered across different parts of the document. Applying a uniform strategy, such as static or manually predefined workflows, to diverse needs is inefficient; for example, complex queries often require question decomposition, whereas simple queries do not.", + "text_level": -1, + "page_idx": 1, + "pdf_id": 22, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "· Our technical contributions. To bridge this gap, we introduce BookRAG , the first retrieval-augmented generation method built upon a document-native BookIndex , designed to document", + "text_level": -1, + "page_idx": 1, + "pdf_id": 23, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "QA tasks. Specifically, to capture the deep connection of the relation in the document, BookIndex organizes information through two complementary structures. First, to preserve the document's native logical hierarchy, we organize the parsed content blocks into a hierarchical tree structure, which serves as the role of its table of contents. Second, to capture the intricate relations within these blocks, we construct a KG containing fine-grained entities. 
Finally, we unify these two structures by mapping the KG entities to their corresponding tree nodes.", + "text_level": -1, + "page_idx": 1, + "pdf_id": 24, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "However, effective multi-hop reasoning on the graph relies on a high-quality KG [62, 66], which is often compromised by entity ambiguity (e.g., distinct entities with names like 'LLM' and 'Large Language Model'). To address this, we propose a novel gradient-based entity resolution method that analyzes the similarity distribution of candidate entities. By identifying sharp drops in similarity scores, we can efficiently distinguish and merge coreferent entities, thereby ensuring graph connectivity and enhancing reasoning capabilities.", + "text_level": -1, + "page_idx": 1, + "pdf_id": 25, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Building upon the BookIndex, we address the static of query workflows ( L2 ) by implementing an agent-based retrieval . Specifically, our agent first classifies user queries based on their intent and complexity, and then dynamically generates tailored retrieval workflows. Grounded in Information Foraging Theory [42], our retrieval process mimics foraging by using Selector to narrow down the search space via information scents and Reasoner to locate highly relevant evidence.", + "text_level": -1, + "page_idx": 1, + "pdf_id": 26, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "We conduct extensive experiments on three widely adopted datasets to validate the effectiveness and efficiency of our BookRAG, comparing it against several state-of-the-art baselines. The experimental results demonstrate that BookRAG consistently achieves superior performance in both retrieval recall and QA accuracy across all datasets. 
Furthermore, our detailed analysis validates the critical contributions of our key features, such as the high-quality KG and the agent-based retrieval mechanism.", + "text_level": -1, + "page_idx": 1, + "pdf_id": 27, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "We summarize our contributions as:", + "text_level": -1, + "page_idx": 1, + "pdf_id": 28, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "· We introduce BookRAG , a novel method that constructs a document-native BookIndex by integrating a hierarchical tree of document layout blocks with a KG storing finegrained entity relations.", + "text_level": -1, + "page_idx": 1, + "pdf_id": 29, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "· We propose an Agent-based Retrieval approach inspired by Information Foraging Theory, which dynamically classifies queries and configures optimal retrieval workflows to locate highly relevant evidence within documents.", + "text_level": -1, + "page_idx": 1, + "pdf_id": 30, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "· Extensive experiments on multiple benchmarks show that BookRAG significantly outperforms existing baselines, attaining state-of-the-art performance in solving complex", + "text_level": -1, + "page_idx": 1, + "pdf_id": 31, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "2", + "text_level": -1, + "page_idx": 1, + "pdf_id": 32, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "text", + "text": "document QA tasks while maintaining competitive efficiency.", + "text_level": -1, + "page_idx": 2, + "pdf_id": 33, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Outline. We review related work in Section 2. Section 3 introduces the problem formulation, IFT, and RAG workflow. 
In Section 4, we present the structure of our BookIndex and its construction. Section 5 presents our agent-based retrieval, elaborating on the query classification and operators used in the structured execution of BookRAG. We present the experimental results and detailed analysis in Section 6, and conclude the paper in Section 7.", + "text_level": -1, + "page_idx": 2, + "pdf_id": 34, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "2 RELATED WORK", + "text_level": 0, + "page_idx": 2, + "pdf_id": 35, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "In this section, we review the related works, including LLM in document analysis and the modern representative RAG approaches.", + "text_level": -1, + "page_idx": 2, + "pdf_id": 36, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "· LLM in document analysis. Recent advances in LLMs have offered opportunities to leverage LLMs in document data analysis. Due to the robust semantic reasoning capabilities of LLMs, there is an increasing number of works focusing on transferring unstructured documents (e.g., HTML, PDFs, and raw text) into structured formats, such as relational tables [1, 7, 25, 38]. For example, Evaporate [1] utilizes LLMs to synthesize extraction code, enabling cost-effective conversion of semi-structured web documents into structured databases without heavy manual annotation. In addition, several LLM-based document analysis systems have been proposed to equip standard data pipelines with semantic understanding [28, 40, 47, 53]. For instance, LOTUS [40] extends the relational model with semantic operators, allowing users to execute SQL-like queries with LLM-powered predicates (e.g., filter, join) over unstructured text corpora. Similarly, DocETL [47] introduces an agentic framework to optimize complex information extraction tasks. 
Furthermore, another line of research proposes to directly analyze or parse documents by viewing the document pages as images, thereby preserving critical layout and visual information [26, 31, 54].", + "text_level": -1, + "page_idx": 2, + "pdf_id": 37, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "· RAG approaches. RAG has been proven to excel in many tasks, including open-ended question answering [24, 48], programming context [9, 10], SQL rewrite [30, 50], and data cleaning [35, 36, 43]. The naive RAG technique relies on retrieving query-relevant contexts from external knowledge bases to mitigate the 'hallucination' of LLMs. Recently, many RAG approaches [16, 18, 19, 21, 27, 32, 32, 45, 55, 58, 66] have adopted graph structures to organize the information and relationships within documents, achieving improved overall retrieval performance. For more details, please refer to the recent survey of graph-based RAG methods [41]. Besides, the Agentic RAG paradigm has been widely studied, employing autonomous agents to dynamically orchestrate and refine the RAG pipeline, thus significantly boosting the reasoning robustness and generation fidelity [2, 23, 59].", + "text_level": -1, + "page_idx": 2, + "pdf_id": 38, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "3 PRELIMINARIES", + "text_level": 0, + "page_idx": 2, + "pdf_id": 39, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "This section formalizes the research problem of complex document QA, introduces the foundational Information Foraging Theory (IFT), and briefly reviews the general workflow of RAG systems", + "text_level": -1, + "page_idx": 2, + "pdf_id": 40, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "3.1 Problem Formulation", + "text_level": 0, + "page_idx": 2, + "pdf_id": 41, + "middle_json": { + "docling_label": "section_header" + } 
+ }, + { + "type": "text", + "text": "We study the problem of Question Answering (QA) over complex documents, which aims to answer user queries based on long-form", + "text_level": -1, + "page_idx": 2, + "pdf_id": 42, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "3", + "text_level": -1, + "page_idx": 2, + "pdf_id": 43, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "text", + "text": "documents [5, 11, 33]. Formally, a document 𝐷 is represented as a sequence of 𝑁 pages, 𝐷 = { 𝑃 𝑖 } 𝑁 𝑖 = 1 . These pages collectively contain a sequence of content blocks B = { 𝑏 𝑗 } 𝑀 𝑗 = 1 , where each block 𝑏 𝑗 represents a distinct element (e.g., text segment, section header, table, or image) organized within a logical chapter hierarchy. Given a user query 𝑞 , the goal is to generate an accurate answer 𝐴 , ideally grounded in a specific set of evidence blocks 𝐸 ⊂ B . The task is formulated as developing a method S that maps the structured document and the query to the final answer:", + "text_level": -1, + "page_idx": 2, + "pdf_id": 44, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "equation", + "text": "𝐴 = S( 𝐷,𝑞 ) (1)", + "text_level": -1, + "page_idx": 2, + "pdf_id": 45, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "where S should navigate both the sequential page content and the logical hierarchy of 𝐷 to synthesize the response.", + "text_level": -1, + "page_idx": 2, + "pdf_id": 46, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "3.2 Information Foraging Theory", + "text_level": 0, + "page_idx": 2, + "pdf_id": 47, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "Information Foraging Theory (IFT) [42] provides a framework for understanding information access as a process analogous to animal foraging. 
It suggests that users follow cues, known as information scent (e.g., keywords or icons), to navigate between clusters of content, known as information patches (e.g., sections in handbooks). The goal is to maximize the rate of valuable information gain while minimizing effort, guiding the decision to either stay within a patch or seek a new one.", + "text_level": -1, + "page_idx": 2, + "pdf_id": 48, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Consider experts seeking a solution to a specific problem within a large technical handbook. They first extract key terms related to the problem, which act as information scent. This scent guides them to navigate towards one or more promising sections (the information patches). Within these patches, they analyze the diverse content to extract the precise knowledge required to formulate a final answer", + "text_level": -1, + "page_idx": 2, + "pdf_id": 49, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "3.3 RAG workflow", + "text_level": 0, + "page_idx": 2, + "pdf_id": 50, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "Retrieval-Augmented Generation (RAG) systems typically operate in a two-phase framework [6, 16, 41]. In the Offline Indexing phase, unstructured corpus data is organized into a structured index, which can take various forms such as vector databases or KG [66]. Subsequently, in the Online Retrieval phase, the system retrieves relevant components (e.g., text chunks or subgraphs) based on the user query 𝑞 to inform the LLM's generation. However, these general workflows often treat the index as a structure derived purely from content, potentially detaching it from the document's original logical hierarchy. 
In contrast, our approach seeks to deeply integrate these retrieval structures with the document's native tree topology.", + "text_level": -1, + "page_idx": 2, + "pdf_id": 51, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "4 BOOKINDEX", + "text_level": 0, + "page_idx": 2, + "pdf_id": 52, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "This section introduces our proposed BookIndex , a hierarchical structure-aware index designed to capture both the explicit logical hierarchy and the intricate entity relations within complex documents. We first formally define the structure of the BookIndex ( 𝐵 ). Subsequently, we elaborate on the sequential, two-stage construction process: (1) Tree Construction , which parses the document's layout to establish a hierarchical nodes, each categorized by type; and (2) Graph Construction , which extracts fine-grained entity knowledge from the tree nodes and refines it through a novel gradient-based entity resolution method.", + "text_level": -1, + "page_idx": 2, + "pdf_id": 53, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Figure 2: The BookIndex Construction process. 
This phase includes Tree Construction, derived from Layout Parsing and Section Filtering, and Graph Construction, which involves KG Construction and Gradient-based Entity Resolution.", + "text_level": -1, + "page_idx": 3, + "pdf_id": 54, + "middle_json": { + "docling_label": "caption" + } + }, + { + "type": "image", + "text": "", + "text_level": -1, + "page_idx": 3, + "pdf_id": 55, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-2.png", + "image_caption": [ + "cref='#/texts/52'" + ], + "image_footnote": [], + "middle_json": { + "docling_label": "picture" + } + }, + { + "type": "text", + "text": "4.1 Overview of BookIndex", + "text_level": 0, + "page_idx": 3, + "pdf_id": 56, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "We formally define our BookIndex as a triplet 𝐵 = ( 𝑇,𝐺, 𝑀 ) . Here, 𝑇 = ( 𝑁, 𝐸 𝑇 ) represents a Tree structure where 𝑁 is the set of nodes derived from the document's explicit logical hierarchy (e.g., titles, sections, tables), and 𝐸 𝑇 denotes their nesting relationships. 𝐺 = ( 𝑉, 𝐸 𝐺 ) is a Knowledge Graph that captures fine-grained entities ( 𝑉 ) and their relations ( 𝐸 𝐺 ) scattered throughout the document. Finally, 𝑀 𝑉 : →P( 𝑁 ) is the Graph-Tree Link (GT-Link) , which links each entity in 𝑉 to the set of specific tree nodes in 𝑁 from which it was extracted. These links are crucial for capturing the intricate, cross-sectional relations within the document. The hierarchical tree nodes in 𝑇 serve as the document's native information patches , providing structured contexts for information seeking. 
Meanwhile, the entities and relations in 𝐺 , connected via 𝑀 , act as the rich information scent that guides navigation between and within these patches.", + "text_level": -1, + "page_idx": 3, + "pdf_id": 57, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Figure 2 provides an example of our BookIndex. The Tree component, positioned at the top, organizes the document into a hierarchical structure, where content blocks such as text, tables, and images serve as leaf nodes nested within section nodes. The Graph component is composed of entities and relations extracted from these nodes. The GT-Link, illustrated by the blue dotted lines, explicitly connects these entities back to their corresponding tree nodes, thereby grounding the semantic entities within the document's logical hierarchy.", + "text_level": -1, + "page_idx": 3, + "pdf_id": 58, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "4.2 Tree Construction", + "text_level": 0, + "page_idx": 3, + "pdf_id": 59, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "The first stage transforms the raw document into a structured hierarchical tree 𝑇 . This involves two key steps: robust layout parsing and intelligent section filtering.", + "text_level": -1, + "page_idx": 3, + "pdf_id": 60, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "4.2.1 Layout Parsing. The Layout Parsing phase processes the input document 𝐷 (a collection of pages) using layout analysis and recognition models. 
This step identifies, extracts, and organizes diverse blocks (e.g., text, tables, images) from the document pages.", + "text_level": -1, + "page_idx": 3, + "pdf_id": 61, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "The output is a sequence of primitive blocks, B = { 𝑏 , 𝑏 1 2 , · · · , 𝑏 𝑘 } , where each block 𝑏 𝑖 = ( 𝑐 , 𝜏 , 𝑓 𝑖 𝑖 𝑖 ) is defined as a triplet. Here, 𝑐 𝑖 is the raw content (e.g., text, image data), 𝜏 𝑖 is the initial layout-based type (e.g., Title, Text, Table, Image ), and 𝑓 𝑖 is a vector of associated layout features (e.g., 'FontSize', bounding box).", + "text_level": -1, + "page_idx": 3, + "pdf_id": 62, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "4.2.2 Section Filtering. Next, the Section Filtering phase processes this initial sequence to identify the document's logically hierarchical structure. Layout Parsing identifies blocks as Title but does not assign their hierarchical level. Therefore, we select the candidate subset B title ⊂ B (where 𝜏 𝑖 = Title ) for an LLM-based analysis. To handle extremely long documents, this analysis is performed in batches, where each batch retains a contextual window of high-level section information (with 𝑙 = 1 as the root). The LLM analyzes the content 𝑐 𝑖 and layout features 𝑓 𝑖 of the candidates to determine two key properties: their actual hierarchical level 𝑙 𝑖 ∈ { 1 2 , , ... } and final node type 𝜏 ' 𝑖 (e.g., re-classifying an erroneous Title as Text if its level is 'None'). This step is crucial for preserving the document's logical hierarchy by correcting blocks erroneously parsed as Title , such as descriptive text within images or borderless table headers.", + "text_level": -1, + "page_idx": 3, + "pdf_id": 63, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Finally, the definitive tree 𝑇 = ( 𝑁, 𝐸 𝑇 ) is constructed. 
The node set 𝑁 is composed of all blocks from the filtering and re-classification process, where each node 𝑛 ∈ 𝑁 retains its content ( 𝑐 𝑖 ) and its final node type ( 𝜏 ' 𝑖 ) (e.g., Text , Section , Table , and Image ). The edge set 𝐸 𝑇 , representing the parent-child nesting relationships, is then established. Parent-child relationships are inferred by sequentially traversing the nodes, using both the determined hierarchical levels ( 𝑙 𝑖 ) of Section nodes and the overall document order to assemble the complete tree structure.", + "text_level": -1, + "page_idx": 3, + "pdf_id": 64, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "As an example shown in Figure 2, the Layout Parsing phase identifies diverse blocks, typing them as Title Text Table , , , and Image . During the Section Filtering phase, the Title candidates (e.g., \"Method\", \"Experiment\", and \"MOE Layer\") are analyzed by the LLM. The blocks 'Method' and 'Experiment' (both with 'FontSize: 14') are correctly identified as Section nodes at 'Level: 2'. Conversely,", + "text_level": -1, + "page_idx": 3, + "pdf_id": 65, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "4", + "text_level": -1, + "page_idx": 3, + "pdf_id": 66, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "text", + "text": "the 'MOE Layer' block ('FontSize: 20'), which was erroneously tagged as Title by the parser, is re-classified by the LLM as a Text node with 'Level: None'. This correction is crucial for preserving the document's logical hierarchy. 
Following this process, all filtered and classified nodes are assembled into the final tree structure based on their determined levels and document order.", + "text_level": -1, + "page_idx": 4, + "pdf_id": 67, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "4.3 Graph Construction", + "text_level": 0, + "page_idx": 4, + "pdf_id": 68, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "Once the tree 𝑇 is established, we proceed to populate the Knowledge Graph 𝐺 by extracting and refining entities from the tree nodes.", + "text_level": -1, + "page_idx": 4, + "pdf_id": 69, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "4.3.1 KG Construction. We iterate each node 𝑛 𝑖 ∈ 𝑁 from the previously constructed tree 𝑇 . For each node 𝑛 𝑖 , we extract a subgraph 𝑔 𝑖 = ( 𝑉 , 𝐸 𝑖 𝑅𝑖 ) based on its content 𝑐 𝑖 and final node type 𝜏 ' 𝑖 . This extraction is modality-dependent: if the node is text-only, an LLM is prompted to extract entities and relations, while for nodes containing visual elements (e.g., 𝜏 ' 𝑖 = Image ), a Vision Language Model (VLM) is employed to extract visual knowledge. Crucially, for every entity 𝑣 ∈ 𝑉 𝑖 extracted, its origin tree node 𝑛 𝑖 is recorded, which is vital for constructing the final mapping 𝑀 .", + "text_level": -1, + "page_idx": 4, + "pdf_id": 70, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Furthermore, to preserve structural semantics for specific logical types (e.g., Table , Formula ), our process first creates a distinct, typed entity (e.g., 𝑣 table representing the table itself). The other extracted entities from the specific node's content are linked to this primary vertex. 
For Table nodes specifically, row and column headers are also explicitly extracted as distinct entities and linked to 𝑣 table via a 'ContainedIn' relationship.", + "text_level": -1, + "page_idx": 4, + "pdf_id": 71, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "4.3.2 Gradient-based Entity Resolution. As shown in the literature [62, 66], a well-constructed KG is essential for document question answering. A common challenge in the extraction process is that the same conceptual entity is often fragmented into multiple distinct entities due to abbreviations, co-references, or its varied occurrences across different document sections. This necessitates a robust Entity Resolution (ER) process, which identifies and merges these fragmented entities to refine the raw KG.", + "text_level": -1, + "page_idx": 4, + "pdf_id": 72, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "However, conventional ER methods are computationally expensive. They are often designed for batch processing across multiple data sources (commonly referred to as dirty ER), aiming to ensure accurate entity resolution by finding all possible matching pairs [12]. This process typically requires finding the transitive closure of all detected matches. That is, to definitively merge multiple entities (e.g., A, B, and C) as the same concept, the system must ideally compare all possible pairs ('A-B', 'A-C', and 'B-C') to confirm their equivalence. 
This can lead to a quadratic ( 𝑂 𝑛 ( 2 ) ) number of pairwise comparisons, a process that becomes prohibitively slow and computationally expensive when relying on LLMs for high-accuracy judgments.", + "text_level": -1, + "page_idx": 4, + "pdf_id": 73, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "To address this, we employ a gradient-based ER method, operating on a single document (simplified as the clean ER), which performs ER incrementally as each new entity 𝑣 𝑛 is extracted. This transforms the quadratic batch problem into a simpler, repeated lookup task: determining where the single new entity 𝑣 𝑛 fits among the already-processed entities in the database. This incremental process yields two distinct, observable scoring patterns when 𝑣 𝑛 is reranked against its 𝑡𝑜𝑝 _ 𝑘 most relevant candidates:", + "text_level": -1, + "page_idx": 4, + "pdf_id": 74, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "5", + "text_level": -1, + "page_idx": 4, + "pdf_id": 75, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "text", + "text": "Algorithm 1: Gradient-based entity resolution", + "text_level": 0, + "page_idx": 4, + "pdf_id": 76, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "Input: KG 𝐺 , New entity 𝑣 𝑛 , Rerank model R , Entity vector database 𝐷𝐵 , Vector search number 𝑡𝑜𝑝 _ 𝑘 , threshold of gradient 𝑔", + "text_level": -1, + "page_idx": 4, + "pdf_id": 77, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "// Vector Search 𝑡𝑜𝑝 _ 𝑘 relevant entities in 𝐷𝐵 . 1 𝐸 𝑐 ← Search( 𝐷𝐵, 𝑣 𝑛 , 𝑡𝑜𝑝 _ 𝑘 ); 2 S ← R( 𝐸 , 𝑣 𝑐 𝑛 ) ; // Sort all candidate entities by rerank scores. 3 Sort( 𝐸 , 𝑐 S ); 4 𝑠𝑐𝑜𝑟𝑒 ← S[ ] 0 , 𝑆𝑒𝑙 ← 𝐸 𝑐 [ 0 ; ] // Gradient select similar entities. 
5 for each remain entity 𝑣 𝑐 ∈ 𝐸 𝑐 \\ { 𝐸 𝑐 [ 0 ] } do 6 if S[ 𝑣 𝑐 ] > 𝑠𝑐𝑜𝑟𝑒 / 𝑔 then 7 𝑆𝑒𝑙 ← 𝑆𝑒𝑙 ∪ { 𝑣 𝑐 } , 𝑠𝑐𝑜𝑟𝑒 ← S[ 𝑣 𝑐 ] ; 8 else break; // Merge entity or add new entity. 9 if length( 𝑆𝑒𝑙 ) = length( 𝐸 𝑐 ) then 10 𝐺 ← AddNewEntity( 𝐺, 𝑣 𝑛 ), 𝐷𝐵 ← AddNew( 𝐷𝐵, 𝑣 𝑛 ); 11 else 12 if length( 𝑆𝑒𝑙 ) = 1 then 𝑣 𝑠𝑒𝑙 ← 𝑆𝑒𝑙 [ 0 ; ] 13 else 𝑣 𝑠𝑒𝑙 ← LLMSelect( 𝑆𝑒𝑙 ); 14 𝐺 ← MergeEntity( 𝐺, 𝑣 𝑛 , 𝑣 𝑠𝑒𝑙 ), 𝐷𝐵 ← Update( 𝐷𝐵, 𝑣 𝑠𝑒𝑙 , 𝑣 𝑛 ); 15 return 𝐺,𝐷𝐵 ;", + "text_level": -1, + "page_idx": 4, + "pdf_id": 78, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "· Case A: New Entity. If 𝑣 𝑛 is a new conceptual entity, its relevance scores against all existing entities will be uniformly low, showing no significant gradient or discriminative pattern.", + "text_level": -1, + "page_idx": 4, + "pdf_id": 79, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "· Case B: Existing Entity. If 𝑣 𝑛 is an alias of an existing entity, its scores will show a high relevance to the true match (or a small set of equivalent aliases). Due to the reranker's inherent discriminative limitations, this initial high-relevance set might occasionally contain multiple similar entities. This high-relevance set is then typically followed by a sharp decline (a large 'gradient') before transitioning to a gradual slope of irrelevant entities.", + "text_level": -1, + "page_idx": 4, + "pdf_id": 80, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "Our Gradient-based ER algorithm is designed precisely to detect this sharp decline (characteristic of Case B), allowing us to efficiently isolate the high-relevance set. 
Subsequently, an LLM is utilized for finer-grained distinction when multiple similar entities are identified within this set, differentiating it from the 'no gradient' scenario (Case A) without quadratic comparisons.", + "text_level": -1, + "page_idx": 4, + "pdf_id": 81, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Algorithm 1 shows the above entity resolution process. For a new entity 𝑣 𝑛 , we first retrieve its 𝑡𝑜𝑝 _ 𝑘 candidates 𝐸 𝑐 from the vector database 𝐷𝐵 , which are then reranked by R against 𝑣 𝑛 and sorted based on their scores S (Lines 1-3). We initialize the selection set 𝑆𝑒𝑙 with the top-scoring candidate 𝐸 𝑐 [ 0 ] and set the initial score to S[ ] 0 (Line 4). We then iterate through the remaining sorted candidates (Lines 5-8). The core logic checks if the current score S[ 𝑣 𝑐 ] is still within the gradient threshold 𝑔 of the previous score (i.e., S[ 𝑣 𝑐 ] > score / 𝑔 ). If the score drop is gentle (passes the check), the candidate 𝑣 𝑐 is added to 𝑆𝑒𝑙 , and score is updated (Lines 7-8); otherwise, the loop breaks (Line 8) as soon as a sharp score drop is detected. Finally, the algorithm makes its decision (Lines 9-14). If the selection set 𝑆𝑒𝑙 is identical to 𝐸 𝑐 , this indicates that all candidates passed the gradient check. This corresponds to Case A , where the scores lacked discriminative power (i.e., 𝑣 𝑛 is equally dissimilar to", + "text_level": -1, + "page_idx": 4, + "pdf_id": 82, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "all candidates), so 𝑣 𝑛 is added as a new entity (Line 9-10). Conversely, if a gradient was found (i.e., 𝑙𝑒𝑛𝑔𝑡ℎ 𝑆𝑒𝑙 ( ) < 𝑙𝑒𝑛𝑔𝑡ℎ ( 𝐸 𝑐 ) ), this signals Case B . We then select the canonical entity 𝑣 𝑠𝑒𝑙 from 𝑆𝑒𝑙 , using an LLM (Line 13) if the reranker identifies multiple aliases, and merge 𝑣 𝑛 with it (Lines 12-14). 
The updated 𝐺 and 𝐷𝐵 are then returned (Line 15).", + "text_level": -1, + "page_idx": 5, + "pdf_id": 83, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "For instance, considering the example in Figure 2, when the new entity 𝑒 9 is processed, it is first compared with existing entities in the KG. As depicted in the similarity curve (orange line), 𝑒 9 shows high similarity with 𝑒 7, followed by a sharp decline in similarity with other entities like 𝑒 6, 𝑒 8, and 𝑒 5. Our gradient-based selection process identifies 𝑒 7 as the unique, high-confidence match for 𝑒 9. Consequently, 𝑒 9 is merged with 𝑒 7, enriching the KG with consolidated information as shown in the final merged entity 𝑒 ' 7 .", + "text_level": -1, + "page_idx": 5, + "pdf_id": 84, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Graph-Tree Link (GT-Link). The GT-Link 𝑀 is formalized to complete the BookIndex 𝐵 = ( 𝑇,𝐺, 𝑀 ) . As described in the KG Construction phase, the origin tree node 𝑛 𝑖 is recorded for every newly extracted entity 𝑣 𝑖 . GT-Link is then refined during entity resolution: when an entity 𝑣 𝑛 is merged into a canonical entity 𝑣 𝑠𝑒𝑙 , the origin node set of 𝑣 𝑠𝑒𝑙 is updated to include all origin nodes previously associated with 𝑣 𝑛 . This aggregation process creates the final mapping 𝑀 : 𝑉 → P( 𝑁 ) , which bi-directionally links the entities in 𝐺 to the set of their structural locations (nodes) in 𝑇 .", + "text_level": -1, + "page_idx": 5, + "pdf_id": 85, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "5 AGENT-BASED RETRIEVAL", + "text_level": 0, + "page_idx": 5, + "pdf_id": 86, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "Real-world document queries are often complex, necessitating operations like modal type filtering, semantic selection, and multi-hop reasoning. 
To address this, we propose an agent-based approach in BookRAG, which intelligently plans and executes operations on the BookIndex. We first introduce the overall workflow and present two core mechanisms: Agent-based Planning , which formulates the strategy, and the Structured Execution , which includes the retrieval process under the principles of IFT and generation.", + "text_level": -1, + "page_idx": 5, + "pdf_id": 87, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "5.1 Overall Workflow", + "text_level": 0, + "page_idx": 5, + "pdf_id": 88, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "The overall workflow of agent-based retrieval, illustrated in Figure 3, follows a three-stage pipeline designed to address users' queries systematically.", + "text_level": -1, + "page_idx": 5, + "pdf_id": 89, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "1. Agent-based Planning. BookRAG first performs Classification & Plan . This stage aims to distinguish simple keyword-based queries from reasoning questions that require decomposition and analysis. For instance, a query like 'How does Transformer differ from RNNs in handling long-range dependencies?' 
cannot be", + "text_level": -1, + "page_idx": 5, + "pdf_id": 90, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "Figure 3: The general workflow of agent-based retrieval in BookRAG, which contains agent-based planning, retrieval, and generation processes.", + "text_level": -1, + "page_idx": 5, + "pdf_id": 91, + "middle_json": { + "docling_label": "caption" + } + }, + { + "type": "image", + "text": "", + "text_level": -1, + "page_idx": 5, + "pdf_id": 92, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-3.png", + "image_caption": [ + "cref='#/texts/89'" + ], + "image_footnote": [], + "middle_json": { + "docling_label": "picture" + } + }, + { + "type": "text", + "text": "solved by retrieving from a single keyword. Therefore, the planning stage first performs query classification . Based on this classification and a predefined set of operators designed for the BookIndex, it generates a specific operators plan that effectively guides the retrieval and generation strategies.", + "text_level": -1, + "page_idx": 5, + "pdf_id": 93, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "2. Retrieval Process. Guided by the operator plan, the retrieval process executes Scent/Filter-based Retrieval . This stage navigates the BookIndex 𝐵 = ( 𝑇,𝐺, 𝑀 ) , either utilizing a scent-based retrieval principle (e.g., following relevant entities in 𝐺 ) to find information, or employing various filters (e.g., modal type) to refine the selection. After reasoning, BookRAG gets the retrieval set of highly relevant information blocks from the BookIndex.", + "text_level": -1, + "page_idx": 5, + "pdf_id": 94, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "3. Generation Process. Finally, all retrieved information enters the generation stage for Analysis & Merging . 
This stage synthesizes these (often fragmented) pieces of evidence, performs final analysis, and formulates a coherent response.", + "text_level": -1, + "page_idx": 5, + "pdf_id": 95, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "5.2 Agent-based Planning", + "text_level": 0, + "page_idx": 5, + "pdf_id": 96, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "The planning stage is the core of BookRAG, designed to intelligently navigate our BookIndex 𝐵 = ( 𝑇,𝐺, 𝑀 ) . To support flexible retrieval, we define four types of operators: Formulator, Selector, Reasoner, and Synthesizer. These operators can be arbitrarily combined to form tailored execution pipelines, each with adjustable parameters. BookRAG dynamically configures and assembles these operators to adapt to the specific requirements of different query categories. This process involves two sequential steps: first, the agent performs", + "text_level": -1, + "page_idx": 5, + "pdf_id": 97, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Table 2: Three common query categories addressed in BookRAG.", + "text_level": -1, + "page_idx": 5, + "pdf_id": 98, + "middle_json": { + "docling_label": "caption" + } + }, + { + "type": "table", + "text": "", + "text_level": -1, + "page_idx": 5, + "pdf_id": 99, + "img_path": "", + "table_caption": [ + "cref='#/texts/95'" + ], + "table_footnote": [], + "table_body": "| Query Category | Description | Core Task | Example Query |\n|--------------------|-------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------|\n| Single-hop | Queries with a single, distinct information target. 
| Scent-based Retrieval : Retrieve content related to a specific entity or section. | What is the definition of Information Scent? |\n| Multi-hop | Queries that require synthesizing information from multiple blocks, often by decomposing into sub-problems. | Decomposing & Merging : Decompose into sub-problems, retrieve for each, and synthesize the final answer. | How does Transformer differ from RNNs in handling long-range dependencies? |\n| Global Aggregation | Queries that require filtering across the entire document and performing calculations. | Filter & Aggregation : Apply filters across the document & perform aggregation operations (e.g., Count, Sum). | How many figures related to IFT are in Section 4? |", + "middle_json": { + "docling_label": "table" + } + }, + { + "type": "text", + "text": "6", + "text_level": -1, + "page_idx": 5, + "pdf_id": 100, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "text", + "text": "(a) Operator Set", + "text_level": 0, + "page_idx": 6, + "pdf_id": 101, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "Figure 4: The BookRAG Operator Library and an Execution Example from MMLongBench dataset: (a) a visual depiction of the four operator types (Formulator, Selector, Reasoner, and Synthesizer) and (b) an execution trace for a 'Single-hop' query, demonstrating the agent-based planning and step-by-step operator execution.", + "text_level": -1, + "page_idx": 6, + "pdf_id": 102, + "middle_json": { + "docling_label": "caption" + } + }, + { + "type": "image", + "text": "", + "text_level": -1, + "page_idx": 6, + "pdf_id": 103, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-4.png", + "image_caption": [ + "cref='#/texts/98'" + ], + "image_footnote": [], + "middle_json": { + "docling_label": "picture" + } + }, + { + "type": "text", + "text": "Query Classification to determine the appropriate solution 
strategy, then generates a specific Operator Plan .", + "text_level": -1, + "page_idx": 6, + "pdf_id": 104, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "· Query Classification . To enable agent strategy selection, we focus on three representative query categories defined by their intrinsic complexity and operational demands (Table 2): Single-hop , Multi-hop , and Global Aggregation . This classification is crucial because each category requires a different solution strategy. For instance, a Single-hop query typically requires a single piece of information retrieved via a Scent-based Retrieval operation. In contrast, a Global Aggregation query often necessitates analyzing content under multiple filtering conditions, usually involving a sequence of Filter & Aggregation operations across various parts of the document. Furthermore, BookRAG is designed to be extensible, allowing for the resolution of a broader range of query types by integrating additional operators.", + "text_level": -1, + "page_idx": 6, + "pdf_id": 105, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "· BookIndex Operators . To execute the strategies identified by classification, we designed a set of operators ( O ) tailored for the BookIndex 𝐵 = ( 𝑇,𝐺, 𝑀 ) . These operators, visually depicted in Figure 4(a) and detailed in Table 3, define the set of operations the agent can employ for diverse query categories. We group them into four types, which we describe in sequence:", + "text_level": -1, + "page_idx": 6, + "pdf_id": 106, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "❶ Formulator. These are LLM-based operators that prepare the query for execution. This category includes Decompose , which breaks a Complex query into a set of simpler, actionable sub-queries 𝑄 𝑠 . 
It also includes Extract , which employs an LLM to identify key entities 𝐸 𝑞 from the query text and link them to corresponding entities in the KG, 𝐺 :", + "text_level": -1, + "page_idx": 6, + "pdf_id": 107, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "equation", + "text": "𝑄 𝑠 = LLM ( 𝑃 𝐷𝑒𝑐 , 𝑞 ) = { 𝑞 , 𝑞 1 2 , . . . , 𝑞 𝑘 } (2)", + "text_level": -1, + "page_idx": 6, + "pdf_id": 108, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "equation", + "text": "𝐸 𝑞 = LLM ( 𝑃 𝐸𝑥𝑡 , 𝑞 ) = { 𝑒 1 , 𝑒 2 , . . . , 𝑒 𝑚 } (3)", + "text_level": -1, + "page_idx": 6, + "pdf_id": 109, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "Here, 𝑞 is the original user query, while 𝑃 𝐷𝑒𝑐 and 𝑃 𝐸𝑥𝑡 represent the prompts used to guide the LLM for the decomposition and extraction tasks, respectively.", + "text_level": -1, + "page_idx": 6, + "pdf_id": 110, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "❷ Selector. These operators filter or select specific content ranges from the BookIndex. Filter_Modal and Filter_Range directly apply the explicit constraints 𝐶 (e.g., modal types, page ranges) generated during the plan. Operating on the Tree 𝑇 = ( 𝑁, 𝐸 𝑇 ) , these operators produce a filtered subset 𝑁 𝑓 where the predicate 𝐶 𝑛 ( ) holds true for each node:", + "text_level": -1, + "page_idx": 6, + "pdf_id": 111, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "equation", + "text": "𝑁 𝑓 = { 𝑛 ∈ 𝑁 | 𝐶 𝑛 ( )} (4)", + "text_level": -1, + "page_idx": 6, + "pdf_id": 112, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "In contrast, Select_by_Entity and Select_by_Section target contiguous document segments by retrieving subtrees rooted at specific section nodes. 
This process first identifies a set of target section nodes 𝑆 target ⊂ 𝑁 at a specified depth, where 𝑆 target consists of sections either linked to entities 𝐸 𝑞 via the GT-Link 𝑀 or selected by the LLM. It then retrieves all descendants of these targets to form the selected node set 𝑁 𝑠 :", + "text_level": -1, + "page_idx": 6, + "pdf_id": 113, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "equation", + "text": "𝑁 𝑠 = GLYPH<216> 𝑠 ∈ 𝑆 target Subtree ( 𝑠 ) (5)", + "text_level": -1, + "page_idx": 6, + "pdf_id": 114, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "❸ Reasoner. These operators analyze and refine selected tree nodes. Graph_Reasoning performs multi-hop inference on a subgraph 𝐺 ' ( 𝑉 , 𝐸 ' ' ) (extracted from selected nodes 𝑁 𝑠 ) starting from entity 𝑒 . Starting from the retrieved entities, it computes an entity importance vector 𝐼 𝐺 ∈ R | 𝑉 ' | over the subgraph 𝐺 ' using the PageRank algorithm [20]. These entity scores are then mapped to the tree nodes via the GT-Link matrix 𝑀 to derive the final tree node importance scores vector 𝑆 𝐺 ∈ R | 𝑁 𝑠 | :", + "text_level": -1, + "page_idx": 6, + "pdf_id": 115, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "equation", + "text": "𝐼 𝐺 = PageRank ( 𝐺 , 𝑒 ' ) (6)", + "text_level": -1, + "page_idx": 6, + "pdf_id": 116, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "equation", + "text": "𝑆 𝐺 = 𝐼 𝐺 × 𝑀 (7)", + "text_level": -1, + "page_idx": 6, + "pdf_id": 117, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "Text_Ranker evaluates the semantic relevance of the tree node's content to the query 𝑞 , assigning a relevance score 𝑆 𝑇 to each node. 
Skyline_Ranker employs the Skyline operator to filter nodes based on these multiple criteria (e.g., 𝑆 𝐺 and 𝑆 𝑇 ), retaining only", + "text_level": -1, + "page_idx": 6, + "pdf_id": 118, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "7", + "text_level": -1, + "page_idx": 6, + "pdf_id": 119, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "text", + "text": "those nodes that are not dominated by any others in terms of the specified scoring dimensions.", + "text_level": -1, + "page_idx": 7, + "pdf_id": 120, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "❹ Synthesizer. These operators are responsible for content generation. Map performs analysis on specific retrieved information segments to generate partial responses. Reduce synthesizes a final coherent answer by aggregating information from multiple sources, such as partial answers or a collection of retrieved evidence.", + "text_level": -1, + "page_idx": 7, + "pdf_id": 121, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "· Operator Plan . After classifying the query ( 𝑞 ) into its category ( 𝑐 ), the agent's final task is to generate an executable plan 𝑃 . This plan is a specific sequence of operators ⟨ 𝑜 1 , . . . , 𝑜 𝑛 ⟩ selected from our library O with parameters dynamically instantiated based on 𝑞 . 
This process is formulated as:", + "text_level": -1, + "page_idx": 7, + "pdf_id": 122, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "equation", + "text": "𝑃 = Agent Plan ( 𝑞, 𝑐, O) (8)", + "text_level": -1, + "page_idx": 7, + "pdf_id": 123, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "The plan follows a structured workflow tailored to each category:", + "text_level": -1, + "page_idx": 7, + "pdf_id": 124, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "· Single-hop : The agent first attempts to Extract an entity. If successful, it executes a 'scent-based' selection; otherwise, it falls back to a section-based strategy. Both paths then proceed to standard reasoning and generation, denoted as 𝑃 std .", + "text_level": -1, + "page_idx": 7, + "pdf_id": 125, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "equation", + "text": "𝑃 s = ( Extract success - - - - -→ Select_by_Entity → 𝑃 std Extract fail - -→ Select_by_Section → 𝑃 std (9)", + "text_level": -1, + "page_idx": 7, + "pdf_id": 126, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "equation", + "text": "𝑃 std = ( Graph ∥ Text ) → Skyline → Reduce (10)", + "text_level": -1, + "page_idx": 7, + "pdf_id": 127, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "· Complex : The agent first decomposes the problem, applies the Single-hop workflow 𝑃 s to each sub-problem, and finally synthesizes the results.", + "text_level": -1, + "page_idx": 7, + "pdf_id": 128, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "equation", + "text": "𝑃 complex = Decompose → 𝑃 s → Map → Reduce (11)", + "text_level": -1, + "page_idx": 7, + "pdf_id": 129, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "· Global Aggregation : The workflow involves applying a sequence of filters 
followed by synthesis.", + "text_level": -1, + "page_idx": 7, + "pdf_id": 130, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "equation", + "text": "𝑃 global = GLYPH<214> ( Filter_Modal | Filter_Range ) → Map → Reduce (12)", + "text_level": -1, + "page_idx": 7, + "pdf_id": 131, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "Here, the symbol ˛ denotes the nested composition of filters, applying either a modal or range filter at each step.", + "text_level": -1, + "page_idx": 7, + "pdf_id": 132, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "5.3 Structured Execution", + "text_level": 0, + "page_idx": 7, + "pdf_id": 133, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "Following the planning stage, BookRAG executes the generated workflow 𝑃 . This execution phase embodies the cognitive principles of Information Foraging Theory (IFT), effectively translating abstract textual queries into concrete operations. Specifically, the Selector operators mirror the act of 'navigating to information patches,' narrowing the vast document space down to relevant scopes. Subsequently, the Reasoner operators perform 'sensemaking within patches,' where they analyze and refine the information within these focused scopes. Finally, the Synthesizer generates the answer based on the processed evidence. This design minimizes the cost of attention by ensuring computational resources are focused solely on high-value data patches.", + "text_level": -1, + "page_idx": 7, + "pdf_id": 134, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Scent/Filter-based Retrieval. The execution begins by narrowing the scope. Aligning with IFT, Selector operators identify relevant 'patches' by following 'information scents' (e.g., key entities in question) or applying explicit filter constraints. 
This process reduces the full node set 𝑁 to a focused node subset 𝑁 𝑠 :", + "text_level": -1, + "page_idx": 7, + "pdf_id": 135, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "equation", + "text": "𝑁 𝑠 = Selector ( 𝑁, params sel ) (13)", + "text_level": -1, + "page_idx": 7, + "pdf_id": 136, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "This pre-selection minimizes noise and ensures that subsequent reasoning is applied only to highly relevant contexts, optimizing the foraging cost. Subsequently, within this focused scope, Reasoner operators evaluate nodes using multiple dimensions, such as graph topology and semantic relevance. We then employ the Skyline_Ranker to get the final retrieval set. Unlike fixed top-𝑘 retrieval, the Skyline operator retains the Pareto frontier of nodes, retaining nodes that are valuable in at least one dimension while discarding dominated ones:", + "text_level": -1, + "page_idx": 7, + "pdf_id": 137, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "equation", + "text": "𝑁 𝑅 = Skyline_Ranker ({( 𝑆 𝐺 ( 𝑛 ) , 𝑆 𝑇 ( 𝑛 )) | 𝑛 ∈ 𝑁 𝑠 }) (14)", + "text_level": -1, + "page_idx": 7, + "pdf_id": 138, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "Analysis & Merging Generation. 
In the final stage, the Synthesizer operator generates the coherent answer by aggregating the refined evidence:", + "text_level": -1, + "page_idx": 7, + "pdf_id": 139, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "equation", + "text": "𝐴 = Synthesizer ( 𝑞, 𝑁 𝑅 ) (15)", + "text_level": -1, + "page_idx": 7, + "pdf_id": 140, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "Table 3: Operators utilized in our BookRAG, categorized by their function.", + "text_level": -1, + "page_idx": 7, + "pdf_id": 141, + "middle_json": { + "docling_label": "caption" + } + }, + { + "type": "table", + "text": "", + "text_level": -1, + "page_idx": 7, + "pdf_id": 142, + "img_path": "", + "table_caption": [ + "cref='#/texts/136'" + ], + "table_footnote": [], + "table_body": "| Operator | Type | Description | Parameters |\n|-------------------------------|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------|\n| Decompose | Formulator Formulator | Decompose a complex query into simpler, actionable sub-queries. Identify and extract key entities from the query (links to 𝐺 ). | (Self-contained) (Self-contained) |\n| Extract Filter_Modal | Selector | Filter retrieved nodes by their modal type (e.g., Table, Figure). | modal_type: str |\n| Filter_Range | Selector | Filter nodes based on a specified range (e.g., pages, section). | range: (start, end) |\n| Select_by_Entity | Selector | Selects all tree nodes ( 𝑁 ) in sections linked to a given entity ( 𝑉 ). | entity_name: str |\n| Select_by_Section | Selector | Uses an LLM to select relevant sections and selects all tree nodes ( 𝑁 ) within them. 
| query: str, sections: List[str] |\n| Graph_Reasoning | Reasoner | Performs multi-hop reasoning on subgraph ( 𝐺 ' ) and score tree nodes ( 𝑁 ) using graph importance and GT-links. Rerank retrieved tree nodes ( 𝑁 ) based on the relevance. | start_entity: str, subgraph: 𝐺 ' query: str |\n| Text_Reasoning Skyline_Ranker | Reasoner Reasoner | | criteria: List[str] |\n| | | Rerank nodes based on multiple criteria. | (Input: List[str]) |\n| Map | Synthesizer | Uses partially retrieved information to generate a partial answer. | |\n| Reduce | Synthesizer | Synthesizes the final answer from partial information or all sub-problem answers. | (Input: List[str]) |", + "middle_json": { + "docling_label": "table" + } + }, + { + "type": "text", + "text": "8", + "text_level": -1, + "page_idx": 7, + "pdf_id": 143, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "text", + "text": "The Map operator performs fine-grained analysis on individual evidence blocks or sub-problems (from Decompose ) to generate intermediate insights. The Reduce operator then aggregates these partial results, such as answers to decomposed sub-queries or statistical counts from a global filter, to construct the final response. This separation ensures that the system can handle both detailed content extraction and high-level reasoning synthesis effectively.", + "text_level": -1, + "page_idx": 8, + "pdf_id": 144, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "To illustrate this end-to-end process, Figure 4(b) presents an execution trace for a 'Single-hop' query: 'What is the type of car in the Ranking Prompt example?'. In the planning phase, the agent classifies the query and generates a specific workflow. 
Subsequently, it identifies key entities (e.g., 'car') via Extract , retrieves relevant nodes via Select_by_Entity , refines them through reasoning and Skyline filtering, and finally synthesizes the answer using Reduce .", + "text_level": -1, + "page_idx": 8, + "pdf_id": 145, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "6 EXPERIMENTS", + "text_level": 0, + "page_idx": 8, + "pdf_id": 146, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "In our experiments, we evaluate BookRAG against several strong baseline methods, with an in-depth comparison of their efficiency and accuracy on document QA tasks.", + "text_level": -1, + "page_idx": 8, + "pdf_id": 147, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "6.1 Setup", + "text_level": 0, + "page_idx": 8, + "pdf_id": 148, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "Table 4: Datasets used in our experiments. EM and F1 denote Exact Match and F1-score, respectively.", + "text_level": -1, + "page_idx": 8, + "pdf_id": 149, + "middle_json": { + "docling_label": "caption" + } + }, + { + "type": "table", + "text": "", + "text_level": -1, + "page_idx": 8, + "pdf_id": 150, + "img_path": "", + "table_caption": [ + "cref='#/texts/143'" + ], + "table_footnote": [], + "table_body": "| Dataset | MMLongBench | M3DocVQA | Qasper |\n|-------------|---------------|------------|--------------|\n| Questions | 669 | 633 | 640 |\n| Documents | 85 | 500 | 192 |\n| Avg. Pages | 42.16 | 8.52 | 10.95 |\n| Avg. Images | 25.92 | 3.51 | 3.43 |\n| Tokens | 2,816,155 | 3,553,774 | 2,265,349 |\n| Metrics | EM, F1 | EM, F1 | Accuracy, F1 |", + "middle_json": { + "docling_label": "table" + } + }, + { + "type": "text", + "text": "Datasets & Question Synthesis. 
We use three widely adopted benchmarking datasets for complex document QA tasks: MMLongBench [33], M3DocVQA [11], and Qasper [14]. MMLongBench is a comprehensive benchmark designed to evaluate QA capabilities on long-form documents, covering diverse categories such as guidebooks, financial reports, and industry files. M3DocVQA is an open-domain benchmark designed to test RAG systems on a diverse collection of HTML-type documents sourced from Wikipedia pages 1 . Qasper is a QA dataset focused on scientific papers, where questions require retrieving evidence from the entire document. We filtered the datasets to remove documents with low clarity or incoherent structures. To address the scarcity of global-level questions in the original benchmarks, we synthesize additional QA pairs by having an LLM generate global questions from selected document elements (e.g., tables or figures). These questions are then answered and meticulously refined by human annotators via an outsourcing process, with these additional QA pairs constituting less than 20% of our final QA pairs. The statistics of these datasets are presented in Table 4.", + "text_level": -1, + "page_idx": 8, + "pdf_id": 151, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "1 https://www.wikipedia.org/", + "text_level": -1, + "page_idx": 8, + "pdf_id": 152, + "middle_json": { + "docling_label": "footnote" + } + }, + { + "type": "text", + "text": "9", + "text_level": -1, + "page_idx": 8, + "pdf_id": 153, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "text", + "text": "Metrics. We adhere to the official metrics specified by each dataset for QA. Our primary evaluation relies on Exact Match (EM), accuracy, and token-based F1-score. To assess efficiency, we also measure time cost and token usage during the response phase. Additionally, for methods including PDF parsing, we also evaluate retrieval recall. 
To establish the ground truth for this, we manually label the specific PDF blocks (e.g., texts, titles, tables, images, and formulas) required to answer each question. This labeling process is guided by the metadata of ground-truth evidence provided in each dataset; we filter candidate blocks using the given modality (all datasets), page numbers (MMLongBench), and evidence statements (Qasper). Any blocks that remained non-unique after this filtering process are manually annotated. In cases where a PDF parsing error made the ground-truth item unavailable, the retrieval recall for that query is recorded as 0.", + "text_level": -1, + "page_idx": 8, + "pdf_id": 154, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Baselines. Our experiments consider three model configurations:", + "text_level": -1, + "page_idx": 8, + "pdf_id": 155, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "· Conventional RAG: These methods are the most common pipeline for document analysis, where the raw text is first extracted and then chunked into segments of a specified size. We select strong and widely used retrieval models: BM25 [44] and Vanilla RAG. We also implement Layout+Vanilla, a variant that uses document layout analysis for semantic chunking.", + "text_level": -1, + "page_idx": 8, + "pdf_id": 156, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "· Graph-based RAG: These methods first extract textual content from documents and then leverage graph data during retrieval. We select RAPTOR [45] and GraphRAG [16]. 
Specifically, GraphRAG has two versions: GraphRAG-Global and GraphRAG-Local, which employ global and local search methods, respectively.", + "text_level": -1, + "page_idx": 8, + "pdf_id": 157, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "· Layout segmented RAG: This category encompasses methods that utilize layout analysis to segment document content into discrete structural units. We include: MM-Vanilla, which utilizes multi-modal embeddings for visual and textual content; a tree-based method inspired by PageIndex [39], denoted as TreeTraverse, where an LLM navigates the document's tree structure; DocETL [47], a declarative system for complex document processing; and GraphRanker, a graph-based method extended from HippoRAG [19] that applies Personalized PageRank [20] to rank the relevant nodes.", + "text_level": -1, + "page_idx": 8, + "pdf_id": 158, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "Implementation details. For a fair comparison, both BookRAG and all baseline methods are powered by a unified set of state-of-the-art (SOTA) and widely adopted backbone models from the Qwen family [4, 60, 63, 64]. We employ MinerU [52] for robust document layout parsing. We set the threshold of gradient 𝑔 as 0.6, and more details are provided in the appendix of our technical report [57]. 
Our source code, prompts, and detailed configurations are available at github.com/sam234990/BookRAG.", + "text_level": -1, + "page_idx": 8, + "pdf_id": 159, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "6.2 Overall results", + "text_level": 0, + "page_idx": 8, + "pdf_id": 160, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "In this section, we present a comprehensive evaluation of BookRAG, analyzing its complex QA performance, retrieval effectiveness, and query efficiency compared to state-of-the-art baselines.", + "text_level": -1, + "page_idx": 8, + "pdf_id": 161, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "· QA Performance of BookRAG . We compare the QA performance of BookRAG against three categories of baselines, as shown in Table 5. The results indicate that BookRAG achieves", + "text_level": -1, + "page_idx": 8, + "pdf_id": 162, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "Table 5: Performance comparison of different methods across various datasets for solving complex document QA tasks. 
The best and second-best results are marked in bold and underlined, respectively.", + "text_level": -1, + "page_idx": 9, + "pdf_id": 163, + "middle_json": { + "docling_label": "caption" + } + }, + { + "type": "table", + "text": "", + "text_level": -1, + "page_idx": 9, + "pdf_id": 164, + "img_path": "", + "table_caption": [ + "cref='#/texts/156'" + ], + "table_footnote": [], + "table_body": "| Baseline Type | Method | MMLongBench | MMLongBench | M3DocVQA | M3DocVQA | Qasper | Qasper |\n|----------------------|------------------|---------------|---------------|---------------|------------|------------|------------|\n| | Method | (Exact Match) | (F1-score) | (Exact Match) | (F1-score) | (Accuracy) | (F1-score) |\n| Conventional RAG | BM25 | 18.3 | 20.2 | 34.6 | 37.8 | 38.1 | 42.5 |\n| Conventional RAG | Vanilla RAG | 16.5 | 18.0 | 36.5 | 40.2 | 40.6 | 44.4 |\n| | Layout + Vanilla | 18.1 | 19.8 | 36.9 | 40.2 | 40.7 | 44.6 |\n| Graph-based RAG | RAPTOR | 21.3 | 21.8 | 34.3 | 37.3 | 39.4 | 44.1 |\n| Graph-based RAG | GraphRAG-Local | 7.7 | 8.5 | 23.7 | 25.6 | 35.9 | 39.2 |\n| Graph-based RAG | GraphRAG-Global | 5.3 | 5.6 | 20.2 | 22.0 | 24.0 | 24.1 |\n| Layout segmented RAG | MM-Vanilla | 6.8 | 8.4 | 25.1 | 27.7 | 27.9 | 29.3 |\n| Layout segmented RAG | Tree-Traverse | 12.7 | 14.4 | 33.3 | 36.2 | 27.3 | 32.1 |\n| Layout segmented RAG | GraphRanker | 21.2 | 22.7 | 43.0 | 47.8 | 32.9 | 37.6 |\n| Layout segmented RAG | DocETL | 27.5 | 28.6 | 40.9 | 43.3 | 42.3 | 50.4 |\n| Our proposed | BookRAG | 43.8 | 44.9 | 61.0 | 66.2 | 55.2 | 61.1 |", + "middle_json": { + "docling_label": "table" + } + }, + { + "type": "text", + "text": "state-of-the-art performance across all datasets, substantially outperforming the top-performing baseline by 18.0% in Exact Match on M3DocVQA. Layout + Vanilla consistently outperforms Vanilla RAG, confirming that layout parsing preserves essential structural information for better retrieval. 
Besides, the suboptimal results of Tree-Traverse and GraphRanker highlight the limitations of relying solely on hierarchical navigation or graph-based reasoning, which often miss cross-sectional context or drift into irrelevant scopes. In contrast, BookRAG's superiority stems from the synergy of its unified Tree-Graph BookIndex and Agent-based Planning. By effectively classifying queries and configuring optimal workflows, our BookRAG overcomes limitations of context fragmentation and static query workflow within existing baselines, ensuring precise evidence retrieval and accurate generation.", + "text_level": -1, + "page_idx": 9, + "pdf_id": 165, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Table 6: Retrieval recall comparison among layout-based methods. The best and second-best results are marked in bold and underlined, respectively.", + "text_level": -1, + "page_idx": 9, + "pdf_id": 166, + "middle_json": { + "docling_label": "caption" + } + }, + { + "type": "table", + "text": "", + "text_level": -1, + "page_idx": 9, + "pdf_id": 167, + "img_path": "", + "table_caption": [ + "cref='#/texts/158'" + ], + "table_footnote": [], + "table_body": "| Method | MMLongBench | M3DocVQA | Qasper |\n|------------------|---------------|------------|----------|\n| Layout + Vanilla | 26.3 | 33.8 | 33.5 |\n| MM-Vanilla | 7.5 | 19.7 | 14.9 |\n| Tree-Traverse | 11.2 | 19.5 | 14.5 |\n| GraphRanker | 26.4 | 44.5 | 28.6 |\n| BookRAG | 57.6 | 71.2 | 63.5 |", + "middle_json": { + "docling_label": "table" + } + }, + { + "type": "text", + "text": "· Retrieval performance of BookRAG. To validate our retrieval design, we evaluate the retrieval recall of BookRAG against other layout-based baselines on the ground-truth layout blocks. The experimental results demonstrate that BookRAG achieves the highest recall across all datasets, notably reaching 71.2% on M3DocVQA and significantly outperforming the next best baseline (GraphRanker, max 44.5%). 
This performance advantage stems from our IFT-inspired Selector → Reasoner workflow: the Agent-based Planning first classifies the query, enabling the Selector to narrow the search to a precise information patch , followed by the Reasoner's analysis. Crucially, after the Skyline_Ranker process, the average number of retained nodes is 9.87, 6.86, and 8.6 across the three datasets,", + "text_level": -1, + "page_idx": 9, + "pdf_id": 168, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "which is comparable to the standard top-𝑘 ( 𝑘 = 10) setting, ensuring high-quality retrieval without inflating the candidate size.", + "text_level": -1, + "page_idx": 9, + "pdf_id": 169, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Figure 5: Comparison of query efficiency.", + "text_level": -1, + "page_idx": 9, + "pdf_id": 170, + "middle_json": { + "docling_label": "caption" + } + }, + { + "type": "image", + "text": "", + "text_level": -1, + "page_idx": 9, + "pdf_id": 171, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-5.png", + "image_caption": [ + "cref='#/texts/161'" + ], + "image_footnote": [], + "middle_json": { + "docling_label": "picture" + } + }, + { + "type": "text", + "text": "· Efficiency of BookRAG. We further evaluate the efficiency in terms of query time and token consumption, as illustrated in Figure 5. Overall, BookRAG maintains time and token costs comparable to existing Graph-based RAG methods. While purely text-based RAG approaches generally exhibit lower latency and token usage due to the absence of VLM processing for images, BookRAG maintains a balanced efficiency among multi-modal methods. In terms of token usage, BookRAG reduces consumption by an order of magnitude compared to the strongest baseline, DocETL. 
Notably,", + "text_level": -1, + "page_idx": 9, + "pdf_id": 172, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "10", + "text_level": -1, + "page_idx": 9, + "pdf_id": 173, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "text", + "text": "on the MMLongBench dataset, DocETL consumes over 53 million tokens, whereas BookRAG requires less than 5 million. Regarding the query latency, our method also achieves a speedup of up to 2 × compared to DocETL.", + "text_level": -1, + "page_idx": 10, + "pdf_id": 174, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "6.3 Detailed Analysis", + "text_level": 0, + "page_idx": 10, + "pdf_id": 175, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "In this section, we provide a more in-depth examination of our BookRAG. We first conduct an ablation study to validate the contribution of each component, followed by an experiment on the impact of gradient-based ER and QA performance across different query types. Furthermore, we perform a comprehensive error analysis, compare the effectiveness of our entity resolution method, and present a case study.", + "text_level": -1, + "page_idx": 10, + "pdf_id": 176, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "· Ablation study. 
To evaluate the contribution of each core component in BookRAG, we design several variants by removing specific components:", + "text_level": -1, + "page_idx": 10, + "pdf_id": 177, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "· w/o Gradient ER: Replaces the gradient-based entity resolution with a Basic ER by merging the same-name entities.", + "text_level": -1, + "page_idx": 10, + "pdf_id": 178, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "· w/o Planning: Removes the Agent-based Planning, defaulting to a static, standard workflow for all queries.", + "text_level": -1, + "page_idx": 10, + "pdf_id": 179, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "· w/o Selector : Removes the Selector operators, forcing Reasoners to score all candidate nodes.", + "text_level": -1, + "page_idx": 10, + "pdf_id": 180, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "· w/o Graph_Reasoning : Removes the Graph_Reasoning operator. Consequently, the Skyline_Ranker is also disabled as scoring becomes single-dimensional.", + "text_level": -1, + "page_idx": 10, + "pdf_id": 181, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "· w/o Text_Reasoning : Removes the Text_Reasoning operator. Similarly, the Skyline_Ranker is disabled, relying solely on graph-based scores.", + "text_level": -1, + "page_idx": 10, + "pdf_id": 182, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "Table 7: Comparing the QA performance of different variants of BookRAG. 
EM and F1 denote Exact Match and F1-score, respectively.", + "text_level": -1, + "page_idx": 10, + "pdf_id": 183, + "middle_json": { + "docling_label": "caption" + } + }, + { + "type": "table", + "text": "", + "text_level": -1, + "page_idx": 10, + "pdf_id": 184, + "img_path": "", + "table_caption": [ + "cref='#/texts/220'" + ], + "table_footnote": [], + "table_body": "| Method variants | MMLongBench | MMLongBench | Qasper | Qasper |\n|---------------------|---------------|---------------|----------|----------|\n| | EM | F1 | Accuracy | F1 |\n| BookRAG (Full) | 43.8 | 44.9 | 55.2 | 61.1 |\n| w/o gradient ER | 40.1 | 42.8 | 48.9 | 57.3 |\n| w/o Planning | 30.8 | 33.2 | 40.9 | 48.5 |\n| w/o Selector | 42.5 | 43.1 | 52.5 | 59.1 |\n| w/o Graph_Reasoning | 39.8 | 41.5 | 51.4 | 58.4 |\n| w/o Text_Reasoning | 39.0 | 40.3 | 47.2 | 52.5 |", + "middle_json": { + "docling_label": "table" + } + }, + { + "type": "text", + "text": "The first variant evaluates the impact of KG quality on retrieval performance. The second and third variants assess the necessity of our Agent-based Planning and IFT-inspired selection mechanism, respectively. Finally, the last two variants validate the effectiveness of our multi-dimensional reasoning and dynamic Skyline filtering strategy. As shown in Table 7, the performance degradation across all variants confirms the essential role of each module in BookRAG. Specifically, the performance drop in the w/o Gradient ER variant highlights the critical role of a high-quality, connectivity-rich KG in supporting effective reasoning. Removing the Planning mechanism results in the most significant performance loss, confirming that a static workflow is insufficient for handling diverse types of queries. 
The w/o Selector variant, while maintaining competitive accuracy, incurs a prohibitive computational cost ( > 2 × tokens on Qasper),", + "text_level": -1, + "page_idx": 10, + "pdf_id": 185, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "11", + "text_level": -1, + "page_idx": 10, + "pdf_id": 186, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "text", + "text": "validating the efficiency of our IFT-inspired \"narrow-then-reason\" strategy.", + "text_level": -1, + "page_idx": 10, + "pdf_id": 187, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Figure 6: Comparison of graph statistics. Values are normalized to the Basic setting (Baseline=1.0). Absolute values for Basic are annotated. Note that density values are abbreviated (e.g., 3.6E-3 denotes 3.6 × 10⁻³ ).", + "text_level": -1, + "page_idx": 10, + "pdf_id": 188, + "middle_json": { + "docling_label": "caption" + } + }, + { + "type": "image", + "text": "", + "text_level": -1, + "page_idx": 10, + "pdf_id": 189, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-6.png", + "image_caption": [ + "cref='#/texts/224'" + ], + "image_footnote": [], + "middle_json": { + "docling_label": "picture" + } + }, + { + "type": "text", + "text": "· Impact of Gradient-based Entity Resolution. To evaluate the quality of our constructed KG, we compare the graph statistics of our Gradient-based ER against a Basic KG construction. The Basic setting employs simple exact name matching for entity merging, which is standard practice in many graph-based methods. Figure 6 presents the comparative results, normalizing the metrics (Entity count, Density, Diameter of the Largest Connected Component, and Number of Connected Components) against the Basic baseline. The results demonstrate that our Gradient-based ER significantly optimizes the KG. 
Specifically, it reduces the number of entities (by 12%) while substantially boosting graph density (by over 20% across datasets). This structural shift indicates that our ER module effectively identifies the same conceptual entities that possess different names. Consequently, the resulting graphs are more compact and cohesive, as evidenced by the reduced diameter and fewer connected components, which mitigates graph fragmentation and facilitates better connectivity for graph reasoning.", + "text_level": -1, + "page_idx": 10, + "pdf_id": 190, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Figure 7: QA performance breakdown by different query types (Single-hop, Multi-hop, and Global). The blue bars represent Exact Match (EM) for MMLongBench and Accuracy for Qasper, while the red bars represent the F1-score.", + "text_level": -1, + "page_idx": 10, + "pdf_id": 191, + "middle_json": { + "docling_label": "caption" + } + }, + { + "type": "image", + "text": "", + "text_level": -1, + "page_idx": 10, + "pdf_id": 192, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-7.png", + "image_caption": [ + "cref='#/texts/259'" + ], + "image_footnote": [], + "middle_json": { + "docling_label": "picture" + } + }, + { + "type": "text", + "text": "· QA performance under different query types. Figure 7 breaks down the performance of BookRAG across Single-hop, Multihop, and Global aggregation query types. We observe that Multihop queries generally present a greater challenge compared to Single-hop ones, resulting in a slight performance decrease. This trend reflects the inherent difficulty of retrieving and reasoning over disjoint pieces of evidence. 
It further validates our agent-based planning strategy, which handles different query types separately.", + "text_level": -1, + "page_idx": 10, + "pdf_id": 193, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "· Error Response analysis. To diagnose the performance bottlenecks of BookRAG, we conduct a fine-grained error analysis on 200 sampled queries from each dataset, tracing the error propagation as shown in Figure 9. We categorize failures into four types:", + "text_level": -1, + "page_idx": 10, + "pdf_id": 194, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "Figure 8: Case study of responses across different query types from MMLongBench and Qasper. CYAN TEXT highlights correct content generated by BookRAG. GRAY TEXT describes the internal process, and marks omitted irrelevant parts.", + "text_level": -1, + "page_idx": 11, + "pdf_id": 195, + "middle_json": { + "docling_label": "caption" + } + }, + { + "type": "image", + "text": "", + "text_level": -1, + "page_idx": 11, + "pdf_id": 196, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-8.png", + "image_caption": [ + "cref='#/texts/282'" + ], + "image_footnote": [], + "middle_json": { + "docling_label": "picture" + } + }, + { + "type": "text", + "text": "Figure 9: Error analysis on 200 sampled queries from MMLongBench and Qasper datasets.", + "text_level": -1, + "page_idx": 11, + "pdf_id": 197, + "middle_json": { + "docling_label": "caption" + } + }, + { + "type": "image", + "text": "", + "text_level": -1, + "page_idx": 11, + "pdf_id": 198, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-9.png", + "image_caption": [ + "cref='#/texts/348'" + ], + "image_footnote": [], + "middle_json": { + "docling_label": "picture" + } + }, + { + "type": "text", + "text": "PDF Parsing, Plan, Retrieval, and 
Generation errors. The results identify Retrieval Error as the dominant failure mode, followed by Generation Error, reflecting the persistent challenge of locating and synthesizing multimodal evidence. Regarding Plan Error, our qualitative analysis reveals a specific failure pattern: the planner tends to over-decompose detailed single-hop queries into unnecessary multi-hop sub-tasks. This fragmentation leads to disjointed retrieval paths, effectively preventing the model from synthesizing a cohesive final answer from the scattered sub-responses.", + "text_level": -1, + "page_idx": 11, + "pdf_id": 199, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "· Case study. Figure 8 illustrates BookRAG's answering workflow across Single-hop, Multi-hop, and Global queries. The results demonstrate that by leveraging specific operators (Select, Decompose, and Filter), BookRAG effectively prunes search spaces. For example, in the Single-hop case, the reasoning space is significantly reduced from 134 to 24 nodes. This capability allows the system to efficiently isolate relevant evidence from noise, ensuring precise answer generation.", + "text_level": -1, + "page_idx": 11, + "pdf_id": 200, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "7 CONCLUSION", + "text_level": 0, + "page_idx": 11, + "pdf_id": 201, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "In this paper, we propose BookRAG, a novel method built upon Book Index, a document-native, structured Tree-Graph index specifically designed to capture the intricate relations of structural documents. By employing an agent-based method to dynamically configure retrieval and reasoning operators, our approach achieves state-of-the-art performance on multiple benchmarks, demonstrating significant superiority over existing baselines in both retrieval precision and answer accuracy.
In the future, we will explore an integrated document-native database system that supports data formatting, knowledge extraction, and intelligent querying.", + "text_level": -1, + "page_idx": 11, + "pdf_id": 202, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "12", + "text_level": -1, + "page_idx": 11, + "pdf_id": 203, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "text", + "text": "REFERENCES", + "text_level": 0, + "page_idx": 12, + "pdf_id": 204, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "[1] Simran Arora, Brandon Yang, Sabri Eyuboglu, Avanika Narayan, Andrew Hojel, Immanuel Trummer, and Christopher Ré. 2023. Language Models Enable Simple Systems for Generating Structured Views of Heterogeneous Data Lakes. Proceedings of the VLDB Endowment 17, 2 (2023), 92-105.", + "text_level": -1, + "page_idx": 12, + "pdf_id": 205, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[2] Akari Asai, Zeqiu Wu, Yizhong Wang, et al. 2024. Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection. In International Conference on Learning Representations (ICLR) .", + "text_level": -1, + "page_idx": 12, + "pdf_id": 206, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[3] Akari Asai, Zeqiu Wu, Yizhong Wang, Avirup Sil, and Hannaneh Hajishirzi. 2023. Self-rag: Learning to retrieve, generate, and critique through self-reflection. arXiv preprint arXiv:2310.11511 (2023).", + "text_level": -1, + "page_idx": 12, + "pdf_id": 207, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[4] Shuai Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Sibo Song, Kai Dang, Peng Wang, Shijie Wang, Jun Tang, et al. 2025. Qwen2.5-vl technical report. 
arXiv preprint arXiv:2502.13923 (2025).", + "text_level": -1, + "page_idx": 12, + "pdf_id": 208, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[5] Camille Barboule, Benjamin Piwowarski, and Yoan Chabot. 2025. Survey on Question Answering over Visually Rich Documents: Methods, Challenges, and Trends. arXiv preprint arXiv:2501.02235 (2025).", + "text_level": -1, + "page_idx": 12, + "pdf_id": 209, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[6] Yukun Cao, Zengyi Gao, Zhiyang Li, Xike Xie, S. Kevin Zhou, and Jianliang Xu. 2025. LEGO-GraphRAG: Modularizing Graph-Based Retrieval-Augmented Generation for Design Space Exploration. Proc. VLDB Endow. 18, 10 (June 2025), 3269-3283. https://doi.org/10.14778/3748191.3748194", + "text_level": -1, + "page_idx": 12, + "pdf_id": 210, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[7] Chengliang Chai, Jiajun Li, Yuhao Deng, Yuanhao Zhong, Ye Yuan, Guoren Wang, and Lei Cao. 2025. Doctopus: Budget-aware structural table extraction from unstructured documents. Proceedings of the VLDB Endowment 18, 11 (2025), 3695-3707.", + "text_level": -1, + "page_idx": 12, + "pdf_id": 211, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[8] Ilias Chalkidis, Manos Fergadiotis, Prodromos Malakasiotis, Nikolaos Aletras, and Ion Androutsopoulos. 2020. LEGAL-BERT: The muppets straight out of law school. arXiv preprint arXiv:2010.02559 (2020).", + "text_level": -1, + "page_idx": 12, + "pdf_id": 212, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[9] Sibei Chen, Yeye He, Weiwei Cui, Ju Fan, Song Ge, Haidong Zhang, Dongmei Zhang, and Surajit Chaudhuri. 2024. Auto-Formula: Recommend Formulas in Spreadsheets using Contrastive Learning for Table Representations. 
Proceedings of the ACM on Management of Data 2, 3 (2024), 1-27.", + "text_level": -1, + "page_idx": 12, + "pdf_id": 213, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[10] Sibei Chen, Nan Tang, Ju Fan, Xuemi Yan, Chengliang Chai, Guoliang Li, and Xiaoyong Du. 2023. Haipipe: Combining human-generated and machine-generated pipelines for data preparation. Proceedings of the ACM on Management of Data 1, 1 (2023), 1-26.", + "text_level": -1, + "page_idx": 12, + "pdf_id": 214, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[11] Jaemin Cho, Debanjan Mahata, Ozan Irsoy, Yujie He, and Mohit Bansal. 2024. M3docrag: Multi-modal retrieval is what you need for multi-page multidocument understanding. arXiv preprint arXiv:2411.04952 (2024).", + "text_level": -1, + "page_idx": 12, + "pdf_id": 215, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[12] Vassilis Christophides, Vasilis Efthymiou, Themis Palpanas, George Papadakis, and Kostas Stefanidis. 2020. An overview of end-to-end entity resolution for big data. ACM Computing Surveys (CSUR) 53, 6 (2020), 1-42.", + "text_level": -1, + "page_idx": 12, + "pdf_id": 216, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[13] Gheorghe Comanici, Eric Bieber, Mike Schaekermann, Ice Pasupat, Noveen Sachdeva, Inderjit Dhillon, Marcel Blistein, Ori Ram, Dan Zhang, Evan Rosen, et al. 2025. Gemini 2.5: Pushing the frontier with advanced reasoning, multimodality, long context, and next generation agentic capabilities. arXiv preprint arXiv:2507.06261 (2025).", + "text_level": -1, + "page_idx": 12, + "pdf_id": 217, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[14] Pradeep Dasigi, Kyle Lo, Iz Beltagy, Arman Cohan, Noah A Smith, and Matt Gardner. 2021. 
A dataset of information-seeking questions and answers anchored in research papers. arXiv preprint arXiv:2105.03011 (2021).", + "text_level": -1, + "page_idx": 12, + "pdf_id": 218, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[15] Xavier Daull, Patrice Bellot, Emmanuel Bruno, Vincent Martin, and Elisabeth Murisasco. 2023. Complex QA and language models hybrid architectures, Survey. arXiv preprint arXiv:2302.09051 (2023).", + "text_level": -1, + "page_idx": 12, + "pdf_id": 219, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[16] Darren Edge, Ha Trinh, Newman Cheng, Joshua Bradley, Alex Chao, Apurva Mody, Steven Truitt, and Jonathan Larson. 2024. From local to global: A graph rag approach to query-focused summarization. arXiv preprint arXiv:2404.16130 (2024).", + "text_level": -1, + "page_idx": 12, + "pdf_id": 220, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[17] Yunfan Gao, Yun Xiong, Xinyu Gao, Kangxiang Jia, Jinliu Pan, Yuxi Bi, Yi Dai, Jiawei Sun, and Haofen Wang. 2023. Retrieval-augmented generation for large language models: A survey. arXiv preprint arXiv:2312.10997 (2023).", + "text_level": -1, + "page_idx": 12, + "pdf_id": 221, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[18] Zirui Guo, Lianghao Xia, Yanhua Yu, Tu Ao, and Chao Huang. 2024. LightRAG: Simple and Fast Retrieval-Augmented Generation. arXiv e-prints (2024), arXiv2410.", + "text_level": -1, + "page_idx": 12, + "pdf_id": 222, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[19] Bernal Jiménez Gutiérrez, Yiheng Shu, Yu Gu, Michihiro Yasunaga, and Yu Su. 2024. HippoRAG: Neurobiologically Inspired Long-Term Memory for Large Language Models. 
arXiv preprint arXiv:2405.14831 (2024).", + "text_level": -1, + "page_idx": 12, + "pdf_id": 223, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[20] Taher H Haveliwala. 2002. Topic-sensitive pagerank. In Proceedings of the 11th international conference on World Wide Web . 517-526.", + "text_level": -1, + "page_idx": 12, + "pdf_id": 224, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[21] Xiaoxin He, Yijun Tian, Yifei Sun, Nitesh V Chawla, Thomas Laurent, Yann LeCun, Xavier Bresson, and Bryan Hooi. 2024. G-retriever: Retrieval-augmented generation for textual graph understanding and question answering. arXiv preprint arXiv:2402.07630 (2024).", + "text_level": -1, + "page_idx": 12, + "pdf_id": 225, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[22] Yucheng Hu and Yuxing Lu. 2024. Rag and rau: A survey on retrieval-augmented language model in natural language processing. arXiv preprint arXiv:2404.19543 (2024).", + "text_level": -1, + "page_idx": 12, + "pdf_id": 226, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "[23] Soyeong Jeong, Jinheon Baek, et al. 2024. Adaptive-RAG: Learning to Adapt Retrieval-Augmented Large Language Models through Question Complexity. arXiv preprint arXiv:2403.14403 (2024).", + "text_level": -1, + "page_idx": 12, + "pdf_id": 227, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "13", + "text_level": -1, + "page_idx": 12, + "pdf_id": 228, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "table", + "text": "", + "text_level": -1, + "page_idx": 12, + "pdf_id": 229, + "img_path": "", + "table_caption": [], + "table_footnote": [], + "table_body": "| [24] | Soyeong Jeong, Jinheon Baek, Sukmin Cho, Sung Ju Hwang, and Jong C Park. 2024. 
Adaptive-rag: Learning to adapt retrieval-augmented large language mod- |\n|--------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| [25] | els through question complexity. arXiv preprint arXiv:2403.14403 (2024). Tengjun Jin, Yuxuan Zhu, and Daniel Kang. 2025. ELT-Bench: An End-to- End Benchmark for Evaluating AI Agents on ELT Pipelines. arXiv preprint |\n| [26] | arXiv:2504.04808 (2025). Geewook Kim, Teakgyu Hong, Moonbin Yim, JeongYeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, and Seunghyun Park. 2022. Ocr-free document understanding transformer. In European Confer- |\n| [27] | ence on Computer Vision . Springer, 498-517. Dawei Li, Shu Yang, Zhen Tan, Jae Young Baik, Sukwon Yun, Joseph Lee, Aaron Chacko, Bojian Hou, Duy Duong-Tran, Ying Ding, et al. 2024. DALK: Dynamic Co-Augmentation of LLMs and KG to answer Alzheimer's Disease Questions with Scientific Literature. arXiv preprint arXiv:2405.04819 (2024). |\n| [28] | Guoliang Li, Jiayi Wang, Chenyang Zhang, and Jiannan Wang. 2025. Data+ AI: LLM4Data and Data4LLM. In Companion of the 2025 International Conference on |\n| [29] | Management of Data . 837-843. Yinheng Li, Shaofei Wang, Han Ding, and Hang Chen. 2023. Large language models in finance: A survey. In Proceedings of the fourth ACM international conference on AI in finance . 374-382. |\n| [30] | Zhaodonghui Li, Haitao Yuan, Huiming Wang, Gao Cong, and Lidong Bing. 2025. LLM-R2: A Large Language Model Enhanced Rule-based Rewrite System for Boosting Query Efficiency. Proceedings of the VLDB Endowment 1, 18 (2025), 53-65. |\n| [31] | Haoyu Lu, Wen Liu, Bo Zhang, et al. 2024. DeepSeek-VL: Towards Real-World Vision-Language Understanding. 
arXiv preprint arXiv:2403.05525 (2024). |\n| [32] | Shengjie Ma, Chengjin Xu, Xuhui Jiang, Muzhi Li, Huaren Qu, Cehao Yang, Jiaxin Mao, and Jian Guo. 2024. Think-on-Graph 2.0: Deep and Faithful Large Language Model Reasoning with Knowledge-guided Retrieval Augmented Generation. arXiv preprint arXiv:2407.10805 (2024). |\n| [33] | Yubo Ma, Yuhang Zang, Liangyu Chen, Meiqi Chen, Yizhu Jiao, Xinze Li, Xinyuan Lu, Ziyu Liu, Yan Ma, Xiaoyi Dong, et al. 2024. Mmlongbench-doc: Benchmarking long-context document understanding with visualizations. Advances in Neural Information Processing Systems 37 (2024), 95963-96010. |\n| [34] | Alex Mallen, Akari Asai, Victor Zhong, Rajarshi Das, Daniel Khashabi, and Hannaneh Hajishirzi. 2022. When not to trust language models: Investigat- ing effectiveness of parametric and non-parametric memories. arXiv preprint arXiv:2212.10511 (2022). |\n| [35] | Zan Ahmad Naeem, Mohammad Shahmeer Ahmad, Mohamed Eltabakh, Mourad Ouzzani, and Nan Tang. 2024. RetClean: Retrieval-Based Data Cleaning Using LLMs and Data Lakes. Proceedings of the VLDB Endowment 17, 12 (2024), 4421- 4424. |\n| [36] | Avanika Narayan, Ines Chami, Laurel Orr, and Christopher Ré. 2022. Can Foun- dation Models Wrangle Your Data? Proceedings of the VLDB Endowment 16, 4 (2022), 738-746. |\n| [37] | Yuqi Nie, Yaxuan Kong, Xiaowen Dong, John M Mulvey, H Vincent Poor, Qing- song Wen, and Stefan Zohren. 2024. A Survey of Large Language Models for Financial Applications: Progress, Prospects and Challenges. arXiv preprint |\n| [38] | arXiv:2406.11903 (2024). Arash Dargahi Nobari and Davood Rafiei. 2024. TabulaX: Leveraging Large Language Models for Multi-Class Table Transformations. arXiv preprint arXiv:2411.17110 (2024). |\n| [39] | PageIndex. 2025. PageIndex: Next-Generation Reasoning-based RAG. https: //pageindex.ai/. |\n| [40] | Liana Patel, Siddharth Jha, Melissa Pan, Harshit Gupta, Parth Asawa, Carlos Guestrin, and Matei Zaharia. 2025. 
Semantic Operators and Their Optimization: Enabling LLM-Based Data Processing with Accuracy Guarantees in LOTUS. |\n| [41] | Proceedings of the VLDB Endowment 18, 11 (2025), 4171-4184. Boci Peng, Yun Zhu, Yongchao Liu, Xiaohe Bo, Haizhou Shi, Chuntao Hong, Yan Zhang, and Siliang Tang. 2024. Graph retrieval-augmented generation: A survey. |\n| [42] | arXiv preprint arXiv:2408.08921 (2024). Peter Pirolli and Stuart Card. 1995. Information foraging in information access environments. In Proceedings of the SIGCHI conference on Human factors in computing systems . 51-58. |\n| [43] | Yichen Qian, Yongyi He, Rong Zhu, Jintao Huang, Zhijian Ma, Haibin Wang, Framework for Data Manipulation with Large Language Models. |\n| | Yaohua Wang, Xiuyu Sun, Defu Lian, Bolin Ding, et al. 2024. UniDM: A Unified Proceedings of Machine Learning and Systems 6 (2024), 465-482. |\n| [44] | Stephen E Robertson and Steve Walker. 1994. Some simple effective approxi- mations to the 2-poisson model for probabilistic weighted retrieval. In SIGIR'94: Proceedings of the Seventeenth Annual International ACM-SIGIR Conference on Research and Development in Information Retrieval, organised by Dublin City |\n| [45] | University . Springer, 232-241. Parth Sarthi, Salman Abdullah, Aditi Tuli, Shubh Khanna, Anna Goldie, and Christopher D Manning. 2024. Raptor: Recursive abstractive processing for |\n| | tree-organized retrieval. arXiv preprint arXiv:2401.18059 (2024). |", + "middle_json": { + "docling_label": "table" + } + }, + { + "type": "text", + "text": "[46] Timo Schick, Jane Dwivedi-Yu, Roberto Dessì, Roberta Raileanu, Maria Lomeli, Eric Hambro, Luke Zettlemoyer, Nicola Cancedda, and Thomas Scialom. 
2024.", + "text_level": -1, + "page_idx": 12, + "pdf_id": 230, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "table", + "text": "", + "text_level": -1, + "page_idx": 13, + "pdf_id": 231, + "img_path": "", + "table_caption": [], + "table_footnote": [], + "table_body": "| | Toolformer: Language models can teach themselves to use tools. Advances in Neural Information Processing Systems 36 (2024). |\n|------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| [47] | Shreya Shankar, Tristan Chambers, Tarak Shah, Aditya G Parameswaran, and Eugene Wu. 2024. Docetl: Agentic query rewriting and evaluation for complex document processing. arXiv preprint arXiv:2410.12189 (2024). |\n| [48] | Shamane Siriwardhana, Rivindu Weerasekera, Elliott Wen, Tharindu Kalu- arachchi, Rajib Rana, and Suranga Nanayakkara. 2023. Improving the domain adaptation of retrieval augmented generation (RAG) models for open domain question answering. Transactions of the Association for Computational Linguistics 11 (2023), 1-17. |\n| [49] | Solutions Review Editors. 2019. 80 Percent of Your Data Will Be Unstructured in Five Years. https://solutionsreview.com/data-management/80-percent-of-your- datawill-be-unstructured-in-five-years/. Accessed: 2023-10-27. |\n| [50] | Zhaoyan Sun, Xuanhe Zhou, and Guoliang Li. 2024. R-Bot: An LLM-based Query |\n| [51] | Rewrite System. arXiv preprint arXiv:2412.01661 (2024). Vincent A Traag, Ludo Waltman, and Nees Jan Van Eck. 2019. From Louvain to Leiden: guaranteeing well-connected communities. Scientific reports 9, 1 (2019), 1-12. |\n| [52] | Bin Wang, Chao Xu, Xiaomeng Zhao, Linke Ouyang, Fan Wu, Zhiyuan Zhao, Rui Xu, Kaiwen Liu, Yuan Qu, Fukai Shang, et al. 2024. 
Mineru: An open-source solution for precise document content extraction. arXiv preprint arXiv:2409.18839 (2024). |\n| [53] | Jiayi Wang and Guoliang Li. 2025. Aop: Automated and interactive llm pipeline orchestration for answering complex queries. CIDR. |\n| [54] | Peng Wang, Shuai Bai, Sinan Tan, Shijie Wang, Zhihao Fan, Jinze Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, et al. 2024. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024). |\n| [55] | Shu Wang, Yixiang Fang, Yingli Zhou, Xilin Liu, and Yuchi Ma. 2025. ArchRAG: Attributed Community-based Hierarchical Retrieval-Augmented Generation. arXiv preprint arXiv:2502.09891 (2025). |\n| [56] | Shen Wang, Tianlong Xu, Hang Li, Chaoli Zhang, Joleen Liang, Jiliang Tang, Philip S Yu, and Qingsong Wen. 2024. Large language models for education: A survey and outlook. arXiv preprint arXiv:2403.18105 (2024). |", + "middle_json": { + "docling_label": "table" + } + }, + { + "type": "table", + "text": "", + "text_level": -1, + "page_idx": 13, + "pdf_id": 232, + "img_path": "", + "table_caption": [], + "table_footnote": [], + "table_body": "| [57] | Shu Wang, Yingli Zhou, and Yixiang Fang. [n. d.]. BookRAG: A Hierarchical Structure-aware Index-based Approach for Complex Document Question An- swering. https://github.com/sam234990/BookRAG. |\n|--------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| [58] | Yu Wang, Nedim Lipka, Ryan A Rossi, Alexa Siu, Ruiyi Zhang, and Tyler Derr. 2024. Knowledge graph prompting for multi-document question answering. In Proceedings of the AAAI Conference on Artificial Intelligence , Vol. 38. 19206-19214. |\n| [59] | Shi-Qi Yan, Jia-Chen Gu, Yun Zhu, and Zhen-Hua Ling. 2024. 
Corrective Retrieval Augmented Generation. arXiv preprint arXiv:2401.15884 (2024). |\n| [60] | An Yang, Anfeng Li, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Gao, Chengen Huang, Chenxu Lv, et al. 2025. Qwen3 technical report. arXiv preprint arXiv:2505.09388 (2025). |\n| [61] | Murong Yue. 2025. A survey of large language model agents for question an- swering. arXiv preprint arXiv:2503.19213 (2025). |\n| [62] | Qinggang Zhang, Shengyuan Chen, Yuanchen Bei, Zheng Yuan, Huachi Zhou, Zijin Hong, Hao Chen, Yilin Xiao, Chuang Zhou, Junnan Dong, et al. 2025. A survey of graph retrieval-augmented generation for customized large language models. arXiv preprint arXiv:2501.13958 (2025). |\n| [63] | Xin Zhang, Yanzhao Zhang, Wen Xie, Mingxin Li, Ziqi Dai, Dingkun Long, Pengjun Xie, Meishan Zhang, Wenjie Li, and Min Zhang. 2024. GME: Im- proving Universal Multimodal Retrieval by Multimodal LLMs. arXiv preprint |\n| [64] | Yanzhao Zhang, Mingxin Li, Dingkun Long, Xin Zhang, Huan Lin, Baosong Yang, Pengjun Xie, An Yang, Dayiheng Liu, Junyang Lin, et al. 2025. Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models. arXiv preprint arXiv:2506.05176 (2025). |\n| [65] | Wayne Xin Zhao, Kun Zhou, Junyi Li, Tianyi Tang, Xiaolei Wang, Yupeng Hou, Yingqian Min, Beichen Zhang, Junjie Zhang, Zican Dong, et al. 2023. A survey of large language models. arXiv preprint arXiv:2303.18223 1, 2 (2023). |\n| [66] | Yingli Zhou, Yaodong Su, Youran Sun, Shu Wang, Taotao Wang, Runyuan He, Yongwei Zhang, Sicong Liang, Xilin Liu, Yuchi Ma, et al. 2025. In-depth Analysis of Graph-based RAG in a Unified Framework. arXiv preprint arXiv:2503.04338 (2025). |\n| [67] | Yutao Zhu, Huaying Yuan, Shuting Wang, Jiongnan Liu, Wenhan Liu, Chenlong Deng, Haonan Chen, Zheng Liu, Zhicheng Dou, and Ji-Rong Wen. 2023. Large language models for information retrieval: A survey. ACM Transactions on Information Systems (2023). 
|", + "middle_json": { + "docling_label": "table" + } + }, + { + "type": "text", + "text": "14", + "text_level": -1, + "page_idx": 13, + "pdf_id": 233, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "text", + "text": "A EXPERIMENTAL DETAILS", + "text_level": 0, + "page_idx": 14, + "pdf_id": 234, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "A.1 Evaluation Metrics", + "text_level": 0, + "page_idx": 14, + "pdf_id": 235, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "In this section, we provide the detailed definitions and calculation procedures for the metrics used in our main experiments.", + "text_level": -1, + "page_idx": 14, + "pdf_id": 236, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "A.1.1 Answer Extraction and Normalization. Standard RAG models typically generate free-form natural language responses, which may contain extraneous conversational text (e.g., 'The answer is...'). Directly comparing these raw outputs with concise ground truth labels (e.g., 'Option A' or '12.5') can lead to false negatives.", + "text_level": -1, + "page_idx": 14, + "pdf_id": 237, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Following official evaluation protocols, we employ an LLM-based extraction step to align the model output with the ground truth format before calculation. Let 𝑦 𝑟𝑎𝑤 denote the raw response generated by the RAG system and 𝑦 𝑔𝑜𝑙𝑑 denote the ground truth. 
We define the extracted answer $\\hat{y}$ as:", + "text_level": -1, + "page_idx": 14, + "pdf_id": 238, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "equation", + "text": "\\hat{y} = \\mathrm{LLM}_{\\mathrm{extract}}(y_{raw}, \\mathrm{Instruction}) \\tag{16}", + "text_level": -1, + "page_idx": 14, + "pdf_id": 239, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "where $\\mathrm{LLM}_{\\mathrm{extract}}$ extracts the key information (e.g., the key entity for span extraction) from $y_{raw}$. We further apply standard normalization $\\mathcal{N}(\\cdot)$ (e.g., lowercasing, removing punctuation) to both $\\hat{y}$ and $y_{gold}$.", + "text_level": -1, + "page_idx": 14, + "pdf_id": 240, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "A.1.2 QA Performance Metrics. Based on the ground truth $y_{gold}$ and the model's response (either raw $y_{raw}$ or extracted $\\hat{y}$), we compute the following metrics:", + "text_level": -1, + "page_idx": 14, + "pdf_id": 241, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Accuracy (Inclusion-based). Following prior works [3, 34, 46], we utilize accuracy as a soft-match metric. We consider a prediction correct if the normalized gold answer is included in the model's generated response, rather than requiring a strict exact match. This accounts for the uncontrollable nature of LLM generation.", + "text_level": -1, + "page_idx": 14, + "pdf_id": 242, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "equation", + "text": "\\mathrm{Accuracy} = \\frac{1}{N} \\sum_{i=1}^{N} \\mathbb{I}\\left(\\mathcal{N}(y_{gold,i}) \\subseteq \\mathcal{N}(y_{raw,i})\\right) \\tag{17}", + "text_level": -1, + "page_idx": 14, + "pdf_id": 243, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "where $\\subseteq$ denotes the substring inclusion relation.", + "text_level": -1, + "page_idx": 14, + "pdf_id": 244, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Exact Match (EM). Unlike accuracy, Exact Match is a strict metric.
It measures whether the normalized extracted answer $\\hat{y}$ is character-for-character identical to the ground truth.", + "text_level": -1, + "page_idx": 14, + "pdf_id": 245, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "equation", + "text": "\\mathrm{EM} = \\frac{1}{N} \\sum_{i=1}^{N} \\mathbb{I}\\left(\\mathcal{N}(\\hat{y}_i) = \\mathcal{N}(y_{gold,i})\\right) \\tag{18}", + "text_level": -1, + "page_idx": 14, + "pdf_id": 246, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "F1-score. For questions requiring text span answers, we utilize the token-level F1-score between the extracted answer $\\hat{y}$ and the ground truth $y_{gold}$. Treating them as bags of tokens $T_{\\hat{y}}$ and $T_{gold}$:", + "text_level": -1, + "page_idx": 14, + "pdf_id": 247, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "equation", + "text": "P = \\frac{|T_{\\hat{y}} \\cap T_{gold}|}{|T_{\\hat{y}}|}, \\quad R = \\frac{|T_{\\hat{y}} \\cap T_{gold}|}{|T_{gold}|}, \\quad \\mathrm{F1} = \\frac{2 \\cdot P \\cdot R}{P + R} \\tag{19}", + "text_level": -1, + "page_idx": 14, + "pdf_id": 248, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "15", + "text_level": -1, + "page_idx": 14, + "pdf_id": 249, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "text", + "text": "A.1.3 Retrieval Recall. As described in the main text, we evaluate retrieval quality based on the granularity of parsed PDF blocks (e.g., paragraphs, tables, images). For a given query $q$, let $\\mathcal{B}_{gold}$ be the set of manually labeled ground-truth blocks required to answer $q$, and $\\mathcal{B}_{ret}$ be the set of unique blocks retrieved by the system.
The Retrieval Recall is defined as:", + "text_level": -1, + "page_idx": 14, + "pdf_id": 250, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "equation", + "text": "\\mathrm{Recall}_{ret} = \\begin{cases} 0 & \\text{if parsing error occurs on } \\mathcal{B}_{gold} \\\\ \\frac{|\\mathcal{B}_{ret} \\cap \\mathcal{B}_{gold}|}{|\\mathcal{B}_{gold}|} & \\text{otherwise} \\end{cases} \\tag{20}", + "text_level": -1, + "page_idx": 14, + "pdf_id": 251, + "middle_json": { + "docling_label": "formula" + } + }, + { + "type": "text", + "text": "Specifically, if a ground-truth block is lost due to PDF parsing failures (i.e., it does not exist in the candidate pool), it is considered strictly unretrievable, resulting in a recall contribution of 0 for that specific block.", + "text_level": -1, + "page_idx": 14, + "pdf_id": 252, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "A.2 Implementation details", + "text_level": 0, + "page_idx": 14, + "pdf_id": 253, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "We implement BookRAG in Python, utilizing MinerU [52] for robust document layout parsing. For a fair comparison, both BookRAG and all baseline methods are powered by a unified set of state-of-the-art (SOTA) and widely adopted backbone models from the Qwen family [4, 60, 63, 64], including LLM, vision-language model (VLM), and embedding models. Specifically, we utilize Qwen3-8B [60] as the default LLM, Qwen2.5VL-30B [4] as the vision-language model (VLM), Qwen3-Embedding-0.6B [64] for text embedding, gme-Qwen2-VL-2B-Instruct [63] for multi-modal embedding, and Qwen3-Reranker-4B [64] for reranking. We primarily select models under the 10B parameter scale to balance efficiency and effectiveness. However, for the VLM, we adopt the 30B version, as the 8B counterpart exhibited significant performance deficits, frequently failing to answer correctly even when provided with ground-truth images.
All experiments were conducted on a Linux operating system running on a high-performance server equipped with an Intel Xeon 2.0GHz CPU, 1024GB of memory, and 8 NVIDIA GeForce RTX A5000 GPUs, each with 24 GB of VRAM. Specifically, to ensure a fair comparison of efficiency, all methods were executed serially, and the reported time costs reflect this sequential processing mode. For methods involving document chunking and retrieval ranking, we standardize the chunk size at 500 tokens and set the retrieval top𝑘 to 10 to ensure consistent candidate pool sizes across baselines. For further reproducibility, our source code and detailed implementation configurations are publicly available at our repository: https://github.com/sam234990/BookRAG.", + "text_level": -1, + "page_idx": 14, + "pdf_id": 254, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "A.3 Prompts", + "text_level": 0, + "page_idx": 14, + "pdf_id": 255, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "Specifically, we present the prompts designed for agent-based query classification (Figure 10), question decomposition (Figure 11), and filter operator generation (Figure 12). Additionally, we illustrate the prompt employed for entity resolution judgment (Figure 13) during the graph construction phase.", + "text_level": -1, + "page_idx": 14, + "pdf_id": 256, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "You are an expert query analyzer. Your only task is to classify the user's question into one of three categories: \"simple\", \"complex\", or \"global\". 
Respond only with the specified JSON object.", + "text_level": -1, + "page_idx": 15, + "pdf_id": 257, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Category Definitions:", + "text_level": 0, + "page_idx": 15, + "pdf_id": 258, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "1. single-hop: The question can be fully answered by retrieving information from a SINGLE, contiguous location in the document (e.g., one specific paragraph, one complete table, or one figure).", + "text_level": -1, + "page_idx": 15, + "pdf_id": 259, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "-This includes questions that require reasoning or comparison, as long as all the necessary data is present within that single retrieved location.", + "text_level": -1, + "page_idx": 15, + "pdf_id": 260, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "-Example: \"What is the title of Figure 2?\"", + "text_level": -1, + "page_idx": 15, + "pdf_id": 261, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "-Example: \"How do 5% of the Latinos see economic upward mobility for their children?\" -> This is SIMPLE because the answer can be found by looking at a single chart or paragraph.", + "text_level": -1, + "page_idx": 15, + "pdf_id": 262, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "2. 
multi-hop: The question requires decomposition into multiple simple sub-questions, where each sub-question must be answered by a separate retrieval action.", + "text_level": -1, + "page_idx": 15, + "pdf_id": 263, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "-It often contains a nested or indirect constraint that requires a preliminary step to resolve before the main question can be answered.", + "text_level": -1, + "page_idx": 15, + "pdf_id": 264, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "-Example: \"What is the color of the personality vector...?\" -> This is COMPLEX because it requires two separate retrieval actions.", + "text_level": -1, + "page_idx": 15, + "pdf_id": 265, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "3. global: The question requires an aggregation operation (e.g., counting, listing, summarizing) over a set of items that are identified by a clear structural filter.", + "text_level": -1, + "page_idx": 15, + "pdf_id": 266, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "-Example: \"How many tables are in the document?\" -> This is GLOBAL because the process is to filter for all items of type 'table'.", + "text_level": -1, + "page_idx": 15, + "pdf_id": 267, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "User Query: query", + "text_level": -1, + "page_idx": 15, + "pdf_id": 268, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Figure 10: The prompt for query classification.", + "text_level": -1, + "page_idx": 15, + "pdf_id": 269, + "middle_json": { + "docling_label": "paragraph" + } + }, + { + "type": "text", + "text": "16", + "text_level": -1, + "page_idx": 15, + "pdf_id": 270, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "text", + "text": "You are a query 
decomposition expert. You have been given a \"complex\" question. Your task is to break it down into a series of simple, atomic sub-questions and classify each one by type. **Crucial Instructions:** 1. Each ' retrieval ' sub-question MUST be a direct information retrieval task that can be answered independently by looking up a specific fact, number, or value in the document. 2. ** ' retrieval ' sub-questions MUST NOT depend on the answer of another sub-question.** They should be parallelizable. All logic for combining their results must be placed in a final ' synthesis ' question. 3. A ' synthesis ' question requires comparing, calculating, or combining the answers of the previous ' retrieval ' questions. It does **NOT** require a new lookup in the document. You MUST provide your response in a JSON object with a single key 'sub_questions', which contains a list of objects. Each object must have a 'question' (string) and a 'type' (string: \"retrieval\" or \"synthesis\"). ---EXAMPLE 1 (Correct Decomposition with Independent Lookups) ---Complex Query: \"What is the color of the personality vector in the soft-labled personality embedding matrix that with the highest Receptiviti score for User A2GBIFL43U1LKJ?\" Expected JSON Output: {{ \"sub_questions\": [ {{\"question\": \"What are all the Receptiviti scores for each personality vector for User A2GBIFL43U1LKJ?\", \"type\": \"retrieval\"}}, {{\"question\": \"What is the mapping of personality vectors to their colors in the soft-labled personality embedding matrix?\", \"type\": \"retrieval\"}}, {{\"question\": \"From the gathered scores, identify the personality vector with the highest score, and then find its corresponding color from the vector-to-color mapping.\", \"type\": \"synthesis\"}} ] }} ---END EXAMPLE 1 ------EXAMPLE 2 (Decomposition with retrieval and synthesis steps) ---Complex Query: \"According to the report, which one is greater in population in the survey? 
Foreign born Latinos, or the Latinos interviewed by cellphone?\" Expected JSON Output: {{ \"sub_questions\": [ {{\"question\": \"According to the report, what is the population of foreign born Latinos in the survey?\", \"type\": \"retrieval\"}}, {{\"question\": \"According to the report, what is the population of Latinos interviewed by cellphone in the survey?\", \"type\": \"retrieval\"}}, {{\"question\": \"Which of the two population counts is greater?\", \"type\": \"synthesis\"}} ] }} ---END EXAMPLE 2 --Now, perform the decomposition for the following query. User Query: query", + "text_level": -1, + "page_idx": 16, + "pdf_id": 271, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "Figure 11: The prompt for query decomposition.", + "text_level": -1, + "page_idx": 16, + "pdf_id": 272, + "middle_json": { + "docling_label": "paragraph" + } + }, + { + "type": "text", + "text": "17", + "text_level": -1, + "page_idx": 16, + "pdf_id": 273, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "text", + "text": "You are a highly specialized AI assistant. Your only function is to analyze a \"Global Query\" and return a single, valid JSON object that specifies both the filtering steps and the final aggregation operation. You MUST NOT output any other text or explanation. ### INSTRUCTIONS \\& DEFINITIONS ### 1. **Filters**: You MUST determine the list of ' filters ' to apply. Even if the filter is for the whole document (e.g., all tables), the ' filters ' list must be present. -' filter_type ' : One of [\"section\", \"image\", \"table\", \"page\"]. -' section ' : Use for structural parts like chapters, sections, appendices, or references. -' image ' : Use for visual elements like figures, images, pictures, or plots. -' table ' : Use for tabular data. -' page ' : Use for specific page numbers or ranges. 
-' filter_value ' : (Optional) Can be provided for \"section\" (e.g., a section title) or \"page\" (e.g., '3-10' or '5'). **For \"image\" or \"table\", this value MUST be null.** 2. **Operation**: Determine the final aggregation operation. -' operation ' : One of [\"COUNT\", \"LIST\", \"SUMMARIZE\", \"ANALYZE\"]. ### EXAMPLES OF YOUR TASK ### User: \"How many figures are in this paper from Page 3 to Page 10?\" Assistant: {{\"filters\": [{{\"filter_type\": \"page\", \"filter_value\": \"3-10\"}}, {{\"filter_type\": \"image\"}}], \"operation\": \"COUNT\"}} User: \"Summarize the discussion about 'data augmentation' in the 'Methodology' section.\" Assistant: {{\"filters\": [{{\"filter_type\": \"section\", \"filter_value\": \"Methodology\"}}], \"operation\": \"SUMMARIZE\"}} User: \"How many chapters are in this report?\" Assistant: {{\"filters\": [{{\"filter_type\": \"section\"}}], \"operation\": \"COUNT\"}} ### YOUR CURRENT TASK ### User: \"{query}\" User Query: query", + "text_level": -1, + "page_idx": 17, + "pdf_id": 274, + "middle_json": { + "docling_label": "code" + } + }, + { + "type": "text", + "text": "Figure 12: The prompt for Filter operator generation.", + "text_level": -1, + "page_idx": 17, + "pdf_id": 275, + "middle_json": { + "docling_label": "paragraph" + } + }, + { + "type": "text", + "text": "18", + "text_level": -1, + "page_idx": 17, + "pdf_id": 276, + "middle_json": { + "docling_label": "page_footer" + } + }, + { + "type": "text", + "text": "-Goal-", + "text_level": 0, + "page_idx": 18, + "pdf_id": 277, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "You are an expert Entity Resolution Adjudicator. Your task is to determine if a \"New Entity\" refers to the exact same real-world concept as one of the \"Candidate Entities\" provided from a knowledge graph. Your output must be a JSON object containing the ID of the matching candidate (or -1) and a brief explanation for your decision. 
-Context-", + "text_level": -1, + "page_idx": 18, + "pdf_id": 278, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "You will be given one \"New Entity\" recently extracted from a text. You will also be given a list of \"Candidate Entities\" that are semantically similar, retrieved from an existing knowledge base. Each candidate has a unique ' id ' for you to reference.", + "text_level": -1, + "page_idx": 18, + "pdf_id": 279, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "---", + "text_level": -1, + "page_idx": 18, + "pdf_id": 280, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "-Core Task & Rules-", + "text_level": 0, + "page_idx": 18, + "pdf_id": 281, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "1. **Analyze the \"New Entity\"**: Carefully read its name, type, and description to understand what it is.", + "text_level": -1, + "page_idx": 18, + "pdf_id": 282, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "2. **Field-by-Field Adjudication**: To determine a match, you must evaluate each field with a specific focus:", + "text_level": -1, + "page_idx": 18, + "pdf_id": 283, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "* ** ' entity_name ' (High Importance):** The names must be extremely similar, a direct abbreviation (e.g., \"LLM\" vs. \"Large Language Model\"), or a well-known alias. 
**If the names represent distinct, parallel concepts (like \"Event Detection\" and \"Named Entity Recognition\"), they are NOT a match, even if their descriptions are very similar.**", + "text_level": -1, + "page_idx": 18, + "pdf_id": 284, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "* ** ' entity_type ' (Medium Importance):** The types do not need to be identical, but they must be closely related and compatible (e.g., ' COMPANY ' and ' ORGANIZATION ' could describe the same entity).", + "text_level": -1, + "page_idx": 18, + "pdf_id": 285, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "* ** ' description ' (Contextual Importance):** The descriptions may differ as they are often extracted from different parts of a document. Your task is to look past surface-level text similarity and determine if they fundamentally describe the **same underlying object or concept**.", + "text_level": -1, + "page_idx": 18, + "pdf_id": 286, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "3. **Be Strict and Conservative**: Your standard for a match must be very high. An incorrect merge can corrupt the knowledge graph. A missed merge is less harmful.", + "text_level": -1, + "page_idx": 18, + "pdf_id": 287, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "* Surface-level similarities are not enough. 
The underlying concepts must be identical.", + "text_level": -1, + "page_idx": 18, + "pdf_id": 288, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "* For example, \"Apple\" (the fruit) and \"Apple Inc.\" (the company) are NOT a match.", + "text_level": -1, + "page_idx": 18, + "pdf_id": 289, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "* **When in doubt, you MUST output -1.**", + "text_level": -1, + "page_idx": 18, + "pdf_id": 290, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "* **Assume No Match by Default**: In a large knowledge graph, most new entities are genuinely new. You should start with the assumption that the \"New Entity\" is unique. You must find **strong, convincing evidence** across all fields, especially the ' entity_name ' , to overturn this assumption and declare a match.", + "text_level": -1, + "page_idx": 18, + "pdf_id": 291, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "4. **Format the Output**: **You must provide your answer in a valid JSON format. The JSON object should contain two keys:**", + "text_level": -1, + "page_idx": 18, + "pdf_id": 292, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "* ' select_id ' : An integer. The ' id ' of the candidate you've determined to be an exact match. If no exact match is", + "text_level": -1, + "page_idx": 18, + "pdf_id": 293, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "found, this value MUST be ' -1 ' .", + "text_level": -1, + "page_idx": 18, + "pdf_id": 294, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "* ' explanation ' : A brief, one-sentence string explaining your reasoning. For a match, explain why they are the same entity. 
For no match, explain the key difference.", + "text_level": -1, + "page_idx": 18, + "pdf_id": 295, + "middle_json": { + "docling_label": "list_item" + } + }, + { + "type": "text", + "text": "---", + "text_level": -1, + "page_idx": 18, + "pdf_id": 296, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "-Output Schema & Format-", + "text_level": 0, + "page_idx": 18, + "pdf_id": 297, + "middle_json": { + "docling_label": "section_header" + } + }, + { + "type": "text", + "text": "Your response MUST be a single, valid JSON object that adheres to the following schema. Do not include any other text, explanation, or markdown formatting like ''' json.", + "text_level": -1, + "page_idx": 18, + "pdf_id": 298, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "''' json {{ \"select_id\": \"integer\", \"explanation\": \"string\" }} ''' ----Example-### Example 1: Match Found ### Example 2: No Match Found -----Task Execution-", + "text_level": -1, + "page_idx": 18, + "pdf_id": 299, + "middle_json": { + "docling_label": "code" + } + }, + { + "type": "text", + "text": "Now, perform the selection task based on the following data. 
Remember to output only a single integer.", + "text_level": -1, + "page_idx": 18, + "pdf_id": 300, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "-Input Data -", + "text_level": -1, + "page_idx": 18, + "pdf_id": 301, + "middle_json": { + "docling_label": "text" + } + }, + { + "type": "text", + "text": "Figure 13: The prompt for entity resolution judgement, examples are omitted due to lack of space.", + "text_level": -1, + "page_idx": 18, + "pdf_id": 302, + "middle_json": { + "docling_label": "paragraph" + } + }, + { + "type": "text", + "text": "19", + "text_level": -1, + "page_idx": 18, + "pdf_id": 303, + "middle_json": { + "docling_label": "page_footer" + } + } +] \ No newline at end of file diff --git a/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-1.png b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-1.png new file mode 100644 index 0000000..5aa629a Binary files /dev/null and b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-1.png differ diff --git a/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-2.png b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-2.png new file mode 100644 index 0000000..53ae5c3 Binary files /dev/null and b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-2.png differ diff --git a/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-3.png b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-3.png new file mode 100644 index 0000000..e4e16c1 Binary files /dev/null and b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-3.png differ diff --git a/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-4.png b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-4.png new file mode 100644 index 0000000..d868fa2 Binary files /dev/null and b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-4.png differ diff --git 
a/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-5.png b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-5.png new file mode 100644 index 0000000..f9bfa76 Binary files /dev/null and b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-5.png differ diff --git a/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-6.png b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-6.png new file mode 100644 index 0000000..e98be48 Binary files /dev/null and b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-6.png differ diff --git a/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-7.png b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-7.png new file mode 100644 index 0000000..e6c1466 Binary files /dev/null and b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-7.png differ diff --git a/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-8.png b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-8.png new file mode 100644 index 0000000..28b2ecd Binary files /dev/null and b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-8.png differ diff --git a/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-9.png b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-9.png new file mode 100644 index 0000000..59fb5d6 Binary files /dev/null and b/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-9.png differ diff --git a/e2e_test_output/graph_data_basic.json b/e2e_test_output/graph_data_basic.json new file mode 100644 index 0000000..3d28299 --- /dev/null +++ b/e2e_test_output/graph_data_basic.json @@ -0,0 +1,45216 @@ +{ + "graph": { + "directed": false, + "multigraph": false, + "graph": {}, + "nodes": [ + { + "entity_name": "bookrag: a hierarchical structure-aware index-based approach for retrieval-augmented generation on complex documents", + "entity_type": "SECTION_TITLE", + "description": "As the primary title 
of the document, this section introduces BookRAG, a novel approach designed to handle complex documents by utilizing hierarchical structure awareness and index-based mechanisms within a Retrieval-Augmented Generation framework.", + "source_ids": [ + 1 + ], + "id": "Name: bookrag: a hierarchical structure-aware index-based approach for retrieval-augmented generation on complex documents\nType: SECTION_TITLE" + }, + { + "entity_name": "bookrag", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "The specific name of the proposed model or architecture introduced in the document.", + "source_ids": [ + 1 + ], + "id": "Name: bookrag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "hierarchical structure-aware index-based approach", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The core methodology employed by BookRAG, focusing on leveraging document hierarchy and indexing strategies.", + "source_ids": [ + 1 + ], + "id": "Name: hierarchical structure-aware index-based approach\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "retrieval-augmented generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "The broader AI task domain addressed by the proposed approach, involving combining retrieval systems with generative models.", + "source_ids": [ + 1 + ], + "id": "Name: retrieval-augmented generation\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "complex documents", + "entity_type": "DATASET_OR_CORPUS", + "description": "The target data type or corpus category that the system is specifically designed to process.", + "source_ids": [ + 1 + ], + "id": "Name: complex documents\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "shu wang", + "entity_type": "PERSON", + "description": "Shu Wang is an author affiliated with the Chinese University of Hong Kong Shenzhen and is one of the authors of the paper titled \"Bookrag\".", + "source_ids": [ + 2, + 5 + ], + "id": "Name: shu wang\nType: PERSON" + }, + { + "entity_name": "yingli 
zhou", + "entity_type": "PERSON", + "description": "Yingli Zhou is an author affiliated with the Chinese University of Hong Kong Shenzhen and is one of the authors of the paper titled \"Bookrag\".", + "source_ids": [ + 2, + 5 + ], + "id": "Name: yingli zhou\nType: PERSON" + }, + { + "entity_name": "yixiang fang", + "entity_type": "PERSON", + "description": "Yixiang Fang is an author affiliated with the Chinese University of Hong Kong Shenzhen and is one of the authors of the paper titled \"Bookrag\".", + "source_ids": [ + 2, + 5 + ], + "id": "Name: yixiang fang\nType: PERSON" + }, + { + "entity_name": "the chinese university of hong kong shenzhen", + "entity_type": "ORGANIZATION", + "description": "the chinese university of hong kong shenzhen is the institution where the authors are affiliated", + "source_ids": [ + 2 + ], + "id": "Name: the chinese university of hong kong shenzhen\nType: ORGANIZATION" + }, + { + "entity_name": "large language models", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "large language models are the models whose performance is being boosted by the proposed method", + "source_ids": [ + 2 + ], + "id": "Name: large language models\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "retrievalaugmented generation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "retrievalaugmented generation is a method that queries external documents to boost llm performance", + "source_ids": [ + 2 + ], + "id": "Name: retrievalaugmented generation\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "bookrag", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Bookrag is a novel method and rag approach specifically designed for documents with hierarchical structures, such as books. 
It constructs a document-native bookindex to optimize retrieval for book content, distinguishing it as a specialized RAG system tailored for this type of material.", + "source_ids": [ + 25, + 2, + 159 + ], + "id": "Name: bookrag\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "bookindex", + "entity_type": "SOFTWARE", + "description": "bookindex is a novel index structure built by extracting a hierarchical tree from documents", + "source_ids": [ + 2 + ], + "id": "Name: bookindex\nType: SOFTWARE" + }, + { + "entity_name": "information foraging theory", + "entity_type": "SCIENTIFIC_THEORY", + "description": "Information foraging theory (IFT) is a foundational theoretical framework that explains how individuals seek information efficiently by treating information access as a process analogous to animal foraging. Serving as the conceptual basis for the system's design discussed in section 3.2, it provides the theoretical inspiration behind the proposed agent-based retrieval approach and the agent-based query method described in section 5. Furthermore, IFT is the cognitive principle embodied by BookRag's execution phase and grounds the retrieval process detailed in the text.", + "source_ids": [ + 2, + 35, + 41, + 42, + 78, + 22, + 26, + 124 + ], + "id": "Name: information foraging theory\nType: SCIENTIFIC_THEORY" + }, + { + "entity_name": "question answering", + "entity_type": "TASK_OR_PROBLEM", + "description": "Question answering is a task aimed at answering user queries based on long form documents, which has been revolutionized by large language models. 
It is the specific task addressed by the G Retriever model and the survey over visually rich documents, where proposed methods aim to improve performance.", + "source_ids": [ + 2, + 195, + 37, + 9, + 211 + ], + "id": "Name: question answering\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "books", + "entity_type": "BOOK", + "description": "books are examples of real world documents with hierarchical structures", + "source_ids": [ + 2 + ], + "id": "Name: books\nType: BOOK" + }, + { + "entity_name": "booklets", + "entity_type": "BOOK", + "description": "booklets are examples of real world documents with hierarchical structures", + "source_ids": [ + 2 + ], + "id": "Name: booklets\nType: BOOK" + }, + { + "entity_name": "handbooks", + "entity_type": "BOOK", + "description": "handbooks are examples of real world documents with hierarchical structures", + "source_ids": [ + 2 + ], + "id": "Name: handbooks\nType: BOOK" + }, + { + "entity_name": "three widely adopted benchmarks", + "entity_type": "BENCHMARK", + "description": "three widely adopted benchmarks were used to demonstrate the performance of bookrag", + "source_ids": [ + 2 + ], + "id": "Name: three widely adopted benchmarks\nType: BENCHMARK" + }, + { + "entity_name": "industry", + "entity_type": "ORGANIZATION", + "description": "The industry is a sector that has attracted attention to retrieval-augmented generation, referring to the collective group of organizations increasingly adopting large language models for question-answering systems.", + "source_ids": [ + 9, + 2 + ], + "id": "Name: industry\nType: ORGANIZATION" + }, + { + "entity_name": "academia", + "entity_type": "ORGANIZATION", + "description": "academia is a sector that has attracted attention to retrievalaugmented generation", + "source_ids": [ + 2 + ], + "id": "Name: academia\nType: ORGANIZATION" + }, + { + "entity_name": "graph", + "entity_type": "SOFTWARE", + "description": "a graph is used to capture intricate relationships between entities in 
the bookindex", + "source_ids": [ + 2 + ], + "id": "Name: graph\nType: SOFTWARE" + }, + { + "entity_name": "tree", + "entity_type": "SOFTWARE", + "description": "a hierarchical tree is extracted from documents to serve as the role of a table of contents", + "source_ids": [ + 2 + ], + "id": "Name: tree\nType: SOFTWARE" + }, + { + "entity_name": "table of contents", + "entity_type": "SOFTWARE", + "description": "the table of contents is the role served by the hierarchical tree in the bookindex", + "source_ids": [ + 2 + ], + "id": "Name: table of contents\nType: SOFTWARE" + }, + { + "entity_name": "retrieval recall", + "entity_type": "EVALUATION_METRIC", + "description": "Retrieval recall is an evaluation metric used to measure the performance of systems, particularly for evaluating methods such as PDF parsing and comparing layout-based approaches. It serves as the specific performance metric employed to compare BookRag against other baselines, where BookRag has been shown to significantly outperform these alternatives.", + "source_ids": [ + 2, + 144, + 23, + 155, + 157 + ], + "id": "Name: retrieval recall\nType: EVALUATION_METRIC" + }, + { + "entity_name": "qa accuracy", + "entity_type": "EVALUATION_METRIC", + "description": "QA accuracy is a metric used to measure the performance of a system, where BookRAG has been shown to significantly outperform baselines.", + "source_ids": [ + 2, + 23 + ], + "id": "Name: qa accuracy\nType: EVALUATION_METRIC" + }, + { + "entity_name": "efficiency", + "entity_type": "EVALUATION_METRIC", + "description": "Efficiency is a metric used to evaluate the performance of BookRag and baseline methods, where BookRag maintains competitive performance.", + "source_ids": [ + 137, + 2 + ], + "id": "Name: efficiency\nType: EVALUATION_METRIC" + }, + { + "entity_name": "baselines", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "baselines are existing methods that bookrag outperforms in retrieval recall and qa accuracy", + "source_ids": 
[ + 2 + ], + "id": "Name: baselines\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "abstract", + "entity_type": "SECTION_TITLE", + "description": "As the opening section of the paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section provides a concise summary of the research problem, the proposed BookRAG solution involving hierarchical indexing and agent-based querying, and the reported state-of-the-art experimental results.", + "source_ids": [ + 3 + ], + "id": "Name: abstract\nType: SECTION_TITLE" + }, + { + "entity_name": "pvldb", + "entity_type": "PUBLICATION_VENUE", + "description": "PVldb is a publication venue referenced for its reference format, where a paper was published in 2025, and it is also mentioned in the context of artifact availability.", + "source_ids": [ + 4, + 5, + 6 + ], + "id": "Name: pvldb\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "reference format", + "entity_type": "SECTION_TITLE", + "description": "reference format is a section or concept mentioned in the context of pvldb", + "source_ids": [ + 4 + ], + "id": "Name: reference format\nType: SECTION_TITLE" + }, + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "BookRag is a novel, hierarchical structure-aware retrieval system designed for retrieval-augmented generation on complex documents. Implemented in Python for robust document layout parsing, it intelligently navigates a book index to adapt to query requirements and addresses three common query categories. The system utilizes various operators categorized by function to answer queries and prune search spaces, involving a structured execution process that includes query classification. BookRag has been extensively evaluated through experiments, ablation studies, and error analysis to assess its effectiveness, efficiency, and performance bottlenecks across different query types. 
It significantly outperforms existing baselines, achieving state-of-the-art performance in complex document question answering while demonstrating efficiency in terms of query time and token consumption.", + "source_ids": [ + 131, + 5, + 137, + 12, + 16, + 149, + 23, + 152, + 151, + 27, + 29, + 157, + 160, + 163, + 164, + 170, + 172, + 179, + 180, + 186, + 188, + 88, + 89, + 238 + ], + "id": "Name: bookrag\nType: PRODUCT" + }, + { + "entity_name": "2025", + "entity_type": "DATE", + "description": "2025 is the year associated with the publication of several works, including a paper in PVLDB, the Qwen2.5 VL technical report, and a survey paper. It marks the year these publications were released, as well as the year linked to the authors' work and the volume and issue numbers of the respective journals.", + "source_ids": [ + 194, + 195, + 196, + 197, + 5, + 203 + ], + "id": "Name: 2025\nType: DATE" + }, + { + "entity_name": "19", + "entity_type": "MEASUREMENT", + "description": "19 is the volume number of the pvldb publication where the paper appeared", + "source_ids": [ + 5 + ], + "id": "Name: 19\nType: MEASUREMENT" + }, + { + "entity_name": "1", + "entity_type": "MEASUREMENT", + "description": "1 is the issue number of the pvldb publication where the paper appeared, the value assigned to the root level in the hierarchical structure, and refers to the page count or a specific metric mentioned in the context of the publication details.", + "source_ids": [ + 200, + 57, + 5 + ], + "id": "Name: 1\nType: MEASUREMENT" + }, + { + "entity_name": "xxx xxx", + "entity_type": "MEASUREMENT", + "description": "xxx xxx represents the page range of the paper in the publication", + "source_ids": [ + 5 + ], + "id": "Name: xxx xxx\nType: MEASUREMENT" + }, + { + "entity_name": "xx xx xxx xx", + "entity_type": "MEASUREMENT", + "description": "xx xx xxx xx is the doi identifier for the paper", + "source_ids": [ + 5 + ], + "id": "Name: xx xx xxx xx\nType: MEASUREMENT" + }, + { + 
"entity_name": "retrieval augmented generation", + "entity_type": "TECHNOLOGY", + "description": "Retrieval augmented generation is a technology domain addressed by the BookRAG approach, the specific technology discussed in the survey, and the technology category that LightRAG belongs to as described in the text.", + "source_ids": [ + 208, + 5, + 207 + ], + "id": "Name: retrieval augmented generation\nType: TECHNOLOGY" + }, + { + "entity_name": "hierarchical structure aware index based approach", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "hierarchical structure aware index based approach is the specific method used by bookrag", + "source_ids": [ + 5 + ], + "id": "Name: hierarchical structure aware index based approach\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "complex documents", + "entity_type": "TASK_OR_PROBLEM", + "description": "complex documents are the type of documents that the bookrag approach is designed to handle", + "source_ids": [ + 5 + ], + "id": "Name: complex documents\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "artifact availability", + "entity_type": "TASK_OR_PROBLEM", + "description": "artifact availability refers to the status or process of making artifacts available as discussed in the text", + "source_ids": [ + 6 + ], + "id": "Name: artifact availability\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "bookrag", + "entity_type": "SOFTWARE", + "description": "Bookrag is a software project hosted on GitHub that employs an agent-based approach to address complex document queries through planning and execution on a book index. This extensible system performs classification and planning stages to handle queries, utilizing agent-based planning, retrieval, and generation processes. After reasoning, Bookrag obtains a retrieval set of highly relevant information blocks and executes a generated workflow that embodies cognitive principles of information foraging theory. 
The system is designed to resolve a broader range of query types and can generate correct content, which is highlighted in cyan text.", + "source_ids": [ + 96, + 7, + 79, + 82, + 83, + 85, + 181, + 124 + ], + "id": "Name: bookrag\nType: SOFTWARE" + }, + { + "entity_name": "github", + "entity_type": "ORGANIZATION", + "description": "github is the platform where the source code and data for bookrag are made available", + "source_ids": [ + 7 + ], + "id": "Name: github\nType: ORGANIZATION" + }, + { + "entity_name": "sam234990", + "entity_type": "PERSON", + "description": "sam234990 is the username associated with the BookRAG repository on GitHub, where the source code is hosted.", + "source_ids": [ + 238, + 7 + ], + "id": "Name: sam234990\nType: PERSON" + }, + { + "entity_name": "source code", + "entity_type": "PRODUCT", + "description": "Source code is a digital artifact made available as part of the BookRag project, referring specifically to the implementation files of BookRag that are provided for download.", + "source_ids": [ + 238, + 7 + ], + "id": "Name: source code\nType: PRODUCT" + }, + { + "entity_name": "data", + "entity_type": "PRODUCT", + "description": "data is a digital artifact made available as part of the bookrag project", + "source_ids": [ + 7 + ], + "id": "Name: data\nType: PRODUCT" + }, + { + "entity_name": "artifacts", + "entity_type": "PRODUCT", + "description": "artifacts are items made available alongside the source code and data for the bookrag project", + "source_ids": [ + 7 + ], + "id": "Name: artifacts\nType: PRODUCT" + }, + { + "entity_name": "1 introduction", + "entity_type": "SECTION_TITLE", + "description": "As the opening section of the paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section introduces the motivation behind Retrieval-Augmented Generation (RAG), highlights limitations in existing approaches regarding hierarchical documents, and presents 
the proposed BookRAG framework and its key components like BookIndex.", + "source_ids": [ + 8 + ], + "id": "Name: 1 introduction\nType: SECTION_TITLE" + }, + { + "entity_name": "large language models", + "entity_type": "TECHNOLOGY", + "description": "Large language models are a type of technology that has revolutionized question answering systems and serve as the subject of surveys and the technology being augmented.", + "source_ids": [ + 9, + 207 + ], + "id": "Name: large language models\nType: TECHNOLOGY" + }, + { + "entity_name": "qwen 3", + "entity_type": "PRODUCT", + "description": "qwen 3 is a specific large language model mentioned as an example", + "source_ids": [ + 9 + ], + "id": "Name: qwen 3\nType: PRODUCT" + }, + { + "entity_name": "gemini 2 5", + "entity_type": "PRODUCT", + "description": "Gemini 2.5 is a specific large language model product that pushes the frontier with advanced reasoning, multimodality, long context, and next-generation agentic capabilities.", + "source_ids": [ + 9, + 203 + ], + "id": "Name: gemini 2 5\nType: PRODUCT" + }, + { + "entity_name": "qa system", + "entity_type": "PRODUCT", + "description": "qa system is a product built using llms to assist users and reduce manual effort", + "source_ids": [ + 9 + ], + "id": "Name: qa system\nType: PRODUCT" + }, + { + "entity_name": "users", + "entity_type": "PERSON", + "description": "users are the individuals who are assisted by the qa systems built by the industry", + "source_ids": [ + 9 + ], + "id": "Name: users\nType: PERSON" + }, + { + "entity_name": "creative commons by nc nd 4 0 international license", + "entity_type": "LAW", + "description": "creative commons by nc nd 4 0 international license is the specific license under which this work is distributed", + "source_ids": [ + 10 + ], + "id": "Name: creative commons by nc nd 4 0 international license\nType: LAW" + }, + { + "entity_name": "vldb endowment", + "entity_type": "ORGANIZATION", + "description": "The VLDB Endowment is the 
organization that holds the publication rights for this work and is associated with the publication venue.", + "source_ids": [ + 10, + 191 + ], + "id": "Name: vldb endowment\nType: ORGANIZATION" + }, + { + "entity_name": "info vldb org", + "entity_type": "EMAIL", + "description": "info vldb org is the email address provided for obtaining permission for uses beyond the license", + "source_ids": [ + 10 + ], + "id": "Name: info vldb org\nType: EMAIL" + }, + { + "entity_name": "creative commons", + "entity_type": "ORGANIZATION", + "description": "creative commons is the organization that created the by nc nd 4 0 international license", + "source_ids": [ + 10 + ], + "id": "Name: creative commons\nType: ORGANIZATION" + }, + { + "entity_name": "owner author s", + "entity_type": "PERSON", + "description": "owner author s refers to the individuals or entities holding the copyright for the work", + "source_ids": [ + 10 + ], + "id": "Name: owner author s\nType: PERSON" + }, + { + "entity_name": "proceedings of the vldb endowment", + "entity_type": "PUBLICATION_VENUE", + "description": "Proceedings of the VLDB Endowment is the publication venue where the paper appeared and was published in 2025.", + "source_ids": [ + 11, + 197, + 191 + ], + "id": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "vol 19", + "entity_type": "MEASUREMENT", + "description": "vol 19 refers to the volume number of the publication", + "source_ids": [ + 11 + ], + "id": "Name: vol 19\nType: MEASUREMENT" + }, + { + "entity_name": "no 1", + "entity_type": "MEASUREMENT", + "description": "no 1 refers to the issue number of the publication", + "source_ids": [ + 11 + ], + "id": "Name: no 1\nType: MEASUREMENT" + }, + { + "entity_name": "issn 2150 8097", + "entity_type": "MEASUREMENT", + "description": "issn 2150 8097 is the international standard serial number assigned to the publication", + "source_ids": [ + 11 + ], + "id": "Name: issn 2150 8097\nType: MEASUREMENT" 
+ }, + { + "entity_name": "doi xx xx xxx xx", + "entity_type": "MEASUREMENT", + "description": "doi xx xx xxx xx is the digital object identifier assigned to the document", + "source_ids": [ + 11 + ], + "id": "Name: doi xx xx xxx xx\nType: MEASUREMENT" + }, + { + "entity_name": "figure 1", + "entity_type": "IMAGE", + "description": "Figure 1 is an image that presents a comparison of existing methods and BookRAG for complex document QA, while also illustrating the two paradigms of existing RAG approaches for document-level QA.", + "source_ids": [ + 12, + 15 + ], + "id": "Name: figure 1\nType: IMAGE" + }, + { + "entity_name": "existing methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "existing methods refers to current techniques used for complex document qa which are being compared to bookrag", + "source_ids": [ + 12 + ], + "id": "Name: existing methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "complex document qa", + "entity_type": "TASK_OR_PROBLEM", + "description": "Complex document QA is a specific task and research problem domain that involves the challenge of answering questions based on complex documents. 
It serves as the core research problem formalized in section 3.1 and is the context in which comparisons between methods and BookRags are conducted.", + "source_ids": [ + 35, + 12, + 36 + ], + "id": "Name: complex document qa\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "cref='#/texts/14'", + "entity_type": "IMAGE", + "description": "A diagram comparing three RAG (Retrieval-Augmented Generation) architectures: Text-Only RAG, Layout Segmented RAG, and BookRAG.", + "source_ids": [ + 13 + ], + "id": "Name: cref='#/texts/14'\nType: IMAGE" + }, + { + "entity_name": "complex query", + "entity_type": "TASK_OR_PROBLEM", + "description": "A complex query is a type of query that the decompose method breaks down into simpler sub queries, often represented by a user icon with a question mark to signify the initiation of the process.", + "source_ids": [ + 98, + 13 + ], + "id": "Name: complex query\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "complex multi-page document", + "entity_type": "PRODUCT", + "description": "The source document containing multiple pages that serves as the input data.", + "source_ids": [ + 13 + ], + "id": "Name: complex multi-page document\nType: PRODUCT" + }, + { + "entity_name": "text-only rag", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Section (a) of the diagram illustrating a Retrieval-Augmented Generation approach using plain text extraction.", + "source_ids": [ + 13 + ], + "id": "Name: text-only rag\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "plain text extraction (ocr)", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The first step in Text-Only RAG where text is extracted from the document images.", + "source_ids": [ + 13 + ], + "id": "Name: plain text extraction (ocr)\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "unstructured chunks", + "entity_type": "DATASET_OR_CORPUS", + "description": "The output of OCR processing, representing fragmented text segments without structural context.", + 
"source_ids": [ + 13 + ], + "id": "Name: unstructured chunks\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "text index (vector/graph/tree)", + "entity_type": "SYSTEM_COMPONENT", + "description": "The indexing structure created to store and organize the unstructured chunks for retrieval.", + "source_ids": [ + 13 + ], + "id": "Name: text index (vector/graph/tree)\nType: SYSTEM_COMPONENT" + }, + { + "entity_name": "fixed/ graph retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The retrieval mechanism used to find relevant information from the index.", + "source_ids": [ + 13 + ], + "id": "Name: fixed/ graph retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "llm", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "The Large Language Model (LLM), depicted as a robot head, is a type of model within the Qwen family used in various experiments and processing pipelines. It generates final answers based on retrieved information and is guided by prompts for decomposition and extraction tasks. 
The LLM analyzes retrieved blocks to determine hierarchical levels and node types of document candidates, reclassifies blocks by analyzing title candidates, and generates global questions from selected document elements.", + "source_ids": [ + 101, + 13, + 141, + 238, + 18, + 57, + 59 + ], + "id": "Name: llm\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "fails on structural dependencies", + "entity_type": "TASK_OR_PROBLEM", + "description": "A limitation identified in the Text-Only RAG approach regarding its inability to handle complex structures.", + "source_ids": [ + 13 + ], + "id": "Name: fails on structural dependencies\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "layout segmented rag", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Section (b) of the diagram illustrating a RAG approach that segments content based on layout analysis.", + "source_ids": [ + 13 + ], + "id": "Name: layout segmented rag\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "layout analysis & parsing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The initial step in this section where the document's visual layout is analyzed and parsed.", + "source_ids": [ + 13 + ], + "id": "Name: layout analysis & parsing\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "flattened chunks", + "entity_type": "DATASET_OR_CORPUS", + "description": "Chunks derived from layout analysis but flattened, losing some hierarchical relationships.", + "source_ids": [ + 13 + ], + "id": "Name: flattened chunks\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "flattened vector index", + "entity_type": "SYSTEM_COMPONENT", + "description": "An index built upon the flattened chunks.", + "source_ids": [ + 13 + ], + "id": "Name: flattened vector index\nType: SYSTEM_COMPONENT" + }, + { + "entity_name": "fixed retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The retrieval method used in the Layout Segmented RAG pipeline.", + "source_ids": [ + 13 + ], + 
"id": "Name: fixed retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "loses complex relationships", + "entity_type": "TASK_OR_PROBLEM", + "description": "A drawback noted for the Layout Segmented RAG approach due to flattening the data.", + "source_ids": [ + 13 + ], + "id": "Name: loses complex relationships\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "bookrag (natively structure-aware)", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Section (c) of the diagram presenting the proposed solution, a structure-aware RAG architecture.", + "source_ids": [ + 13 + ], + "id": "Name: bookrag (natively structure-aware)\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "hierarchical chunks", + "entity_type": "DATASET_OR_CORPUS", + "description": "Chunks that preserve the hierarchical structure of the document.", + "source_ids": [ + 13 + ], + "id": "Name: hierarchical chunks\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "bookindex", + "entity_type": "SYSTEM_COMPONENT", + "description": "A graph-based index representing the hierarchical relationships between chunks.", + "source_ids": [ + 13 + ], + "id": "Name: bookindex\nType: SYSTEM_COMPONENT" + }, + { + "entity_name": "agent-based retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A retrieval strategy utilizing an agent to navigate the BookIndex graph effectively.", + "source_ids": [ + 13 + ], + "id": "Name: agent-based retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "accurate, structured-grounded", + "entity_type": "EVALUATION_METRIC", + "description": "The positive outcome achieved by the BookRAG system, indicating high accuracy and structural awareness.", + "source_ids": [ + 13 + ], + "id": "Name: accurate, structured-grounded\nType: EVALUATION_METRIC" + }, + { + "entity_name": "financial auditing", + "entity_type": "TASK_OR_PROBLEM", + "description": "financial auditing is a task where llms are applied but may face challenges with domain 
knowledge", + "source_ids": [ + 14 + ], + "id": "Name: financial auditing\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "legal compliance", + "entity_type": "TASK_OR_PROBLEM", + "description": "legal compliance is a task where llms are applied but may face challenges with domain knowledge", + "source_ids": [ + 14 + ], + "id": "Name: legal compliance\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "scientific discovery", + "entity_type": "TASK_OR_PROBLEM", + "description": "scientific discovery is a task where llms are applied but may face challenges with domain knowledge", + "source_ids": [ + 14 + ], + "id": "Name: scientific discovery\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "llms", + "entity_type": "TECHNOLOGY", + "description": "LLMs are large language models used for high accuracy judgments in entity resolution, though they may lead to missing domain knowledge and generating outdated information; however, their hallucination can be mitigated by the naive RAG technique.", + "source_ids": [ + 33, + 66, + 14 + ], + "id": "Name: llms\nType: TECHNOLOGY" + }, + { + "entity_name": "retrieval augmented generation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Retrieval augmented generation, often abbreviated as RAG, is a method adopted to address the limitations of large language models by retrieving relevant domain knowledge. Additionally, it is the technique used by G Retriever for textual graph understanding.", + "source_ids": [ + 211, + 14 + ], + "id": "Name: retrieval augmented generation\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "rag", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "RAG, an abbreviation for retrieval augmented generation, is a method or technique introduced in section 3 alongside problem formulation and IFT. 
It is designed to guide large language models (LLMs) during response generation and has been proven to excel in various tasks, including question answering and data cleaning.", + "source_ids": [ + 33, + 29, + 14 + ], + "id": "Name: rag\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "enterprise scenarios", + "entity_type": "LOCATION", + "description": "enterprise scenarios are real world contexts where domain knowledge is stored in long form documents", + "source_ids": [ + 14 + ], + "id": "Name: enterprise scenarios\nType: LOCATION" + }, + { + "entity_name": "technical handbooks", + "entity_type": "PRODUCT", + "description": "technical handbooks are long form documents where domain knowledge is often stored", + "source_ids": [ + 14 + ], + "id": "Name: technical handbooks\nType: PRODUCT" + }, + { + "entity_name": "api reference manuals", + "entity_type": "PRODUCT", + "description": "api reference manuals are long form documents where domain knowledge is often stored", + "source_ids": [ + 14 + ], + "id": "Name: api reference manuals\nType: PRODUCT" + }, + { + "entity_name": "operational guidebooks", + "entity_type": "PRODUCT", + "description": "operational guidebooks are long form documents where domain knowledge is often stored", + "source_ids": [ + 14 + ], + "id": "Name: operational guidebooks\nType: PRODUCT" + }, + { + "entity_name": "books", + "entity_type": "PRODUCT", + "description": "books are a structure followed by long form documents characterized by intricate layouts and logical hierarchies", + "source_ids": [ + 14 + ], + "id": "Name: books\nType: PRODUCT" + }, + { + "entity_name": "tables of contents", + "entity_type": "PRODUCT", + "description": "tables of contents are explicit structural elements found in long form documents", + "source_ids": [ + 14 + ], + "id": "Name: tables of contents\nType: PRODUCT" + }, + { + "entity_name": "nested chapters", + "entity_type": "PRODUCT", + "description": "nested chapters are structural elements found in long 
form documents", + "source_ids": [ + 14 + ], + "id": "Name: nested chapters\nType: PRODUCT" + }, + { + "entity_name": "multi level sections", + "entity_type": "PRODUCT", + "description": "multi level sections are structural elements found in long form documents", + "source_ids": [ + 14 + ], + "id": "Name: multi level sections\nType: PRODUCT" + }, + { + "entity_name": "rag system", + "entity_type": "SOFTWARE", + "description": "a rag system is designed in this paper for qa over long and highly structured documents", + "source_ids": [ + 14 + ], + "id": "Name: rag system\nType: SOFTWARE" + }, + { + "entity_name": "qa", + "entity_type": "TASK_OR_PROBLEM", + "description": "QA refers to the question answering task, which is the specific problem the RAG system is designed for, the task for which official metrics are specified by each dataset, and the task being evaluated for performance in both the text and the figure.", + "source_ids": [ + 170, + 14, + 144, + 177, + 151 + ], + "id": "Name: qa\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "external sources", + "entity_type": "LOCATION", + "description": "external sources are referenced as the origin of relevant domain knowledge retrieved by rag", + "source_ids": [ + 14 + ], + "id": "Name: external sources\nType: LOCATION" + }, + { + "entity_name": "response generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "response generation is the process guided by rag to produce answers", + "source_ids": [ + 14 + ], + "id": "Name: response generation\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "domain knowledge", + "entity_type": "CONCEPT", + "description": "domain knowledge is the specific information retrieved from external sources to guide llms", + "source_ids": [ + 14 + ], + "id": "Name: domain knowledge\nType: CONCEPT" + }, + { + "entity_name": "long form documents", + "entity_type": "PRODUCT", + "description": "long form documents are the type of storage for domain knowledge in enterprise scenarios", + 
"source_ids": [ + 14 + ], + "id": "Name: long form documents\nType: PRODUCT" + }, + { + "entity_name": "intricate layouts", + "entity_type": "SHAPE", + "description": "intricate layouts are a feature of the structure of long form documents", + "source_ids": [ + 14 + ], + "id": "Name: intricate layouts\nType: SHAPE" + }, + { + "entity_name": "logical hierarchies", + "entity_type": "CONCEPT", + "description": "logical hierarchies are a feature of the structure of long form documents", + "source_ids": [ + 14 + ], + "id": "Name: logical hierarchies\nType: CONCEPT" + }, + { + "entity_name": "this paper", + "entity_type": "BOOK", + "description": "this paper is the document where the authors aim to design an effective rag system", + "source_ids": [ + 14 + ], + "id": "Name: this paper\nType: BOOK" + }, + { + "entity_name": "rag", + "entity_type": "TECHNOLOGY", + "description": "rag refers to retrieval augmented generation approaches for document level qa mentioned in the text", + "source_ids": [ + 15 + ], + "id": "Name: rag\nType: TECHNOLOGY" + }, + { + "entity_name": "ocr", + "entity_type": "TECHNOLOGY", + "description": "ocr stands for optical character recognition a technology used to convert documents into plain text", + "source_ids": [ + 15 + ], + "id": "Name: ocr\nType: TECHNOLOGY" + }, + { + "entity_name": "graph based rag", + "entity_type": "TECHNOLOGY", + "description": "Graph based rag is a text-based retrieval-augmented generation method that extracts textual content from documents and leverages graph data as an external knowledge source during the retrieval process.", + "source_ids": [ + 147, + 15 + ], + "id": "Name: graph based rag\nType: TECHNOLOGY" + }, + { + "entity_name": "graphrag", + "entity_type": "PRODUCT", + "description": "graphrag is a representative method that constructs a knowledge graph from a textual corpus", + "source_ids": [ + 15 + ], + "id": "Name: graphrag\nType: PRODUCT" + }, + { + "entity_name": "raptor", + "entity_type": "PRODUCT", + 
"description": "raptor is a representative method that builds a recursive tree structure by clustering document chunks", + "source_ids": [ + 15 + ], + "id": "Name: raptor\nType: PRODUCT" + }, + { + "entity_name": "leiden community detection algorithm", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the leiden community detection algorithm is used by graphrag to obtain hierarchical clusters", + "source_ids": [ + 15 + ], + "id": "Name: leiden community detection algorithm\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "table 1", + "entity_type": "TABLE", + "description": "Table 1 is a section containing experimental results that compares representative methods, specifically GraphRAG and Raptor, alongside BookRAG.", + "source_ids": [ + 16, + 182, + 15 + ], + "id": "Name: table 1\nType: TABLE" + }, + { + "entity_name": "document level qa", + "entity_type": "TASK_OR_PROBLEM", + "description": "document level qa is the specific task for which existing rag approaches are designed", + "source_ids": [ + 15 + ], + "id": "Name: document level qa\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "plain text", + "entity_type": "MATERIAL", + "description": "plain text is the output format produced by ocr when converting documents", + "source_ids": [ + 15 + ], + "id": "Name: plain text\nType: MATERIAL" + }, + { + "entity_name": "text based rag method", + "entity_type": "TECHNOLOGY", + "description": "text based rag methods are a category of approaches applied after ocr conversion", + "source_ids": [ + 15 + ], + "id": "Name: text based rag method\nType: TECHNOLOGY" + }, + { + "entity_name": "graph data", + "entity_type": "DATASET_OR_CORPUS", + "description": "graph data serves as an external knowledge source capturing semantic information and relational structures", + "source_ids": [ + 15 + ], + "id": "Name: graph data\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "knowledge graph", + "entity_type": "DATASET_OR_CORPUS", + "description": "A knowledge graph 
is a structured data repository constructed from a textual corpus by extracting entities and relations from document tree nodes.", + "source_ids": [ + 63, + 15 + ], + "id": "Name: knowledge graph\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "textual corpus", + "entity_type": "DATASET_OR_CORPUS", + "description": "a textual corpus is the source material from which graphrag constructs a knowledge graph", + "source_ids": [ + 15 + ], + "id": "Name: textual corpus\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "hierarchical clusters", + "entity_type": "TASK_OR_PROBLEM", + "description": "hierarchical clusters are the result of applying the leiden community detection algorithm", + "source_ids": [ + 15 + ], + "id": "Name: hierarchical clusters\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "summaries", + "entity_type": "PRODUCT", + "description": "summaries are generated for each community to provide a global overview of the corpus", + "source_ids": [ + 15 + ], + "id": "Name: summaries\nType: PRODUCT" + }, + { + "entity_name": "recursive tree structure", + "entity_type": "TASK_OR_PROBLEM", + "description": "a recursive tree structure is built by raptor through iterative clustering and summarization", + "source_ids": [ + 15 + ], + "id": "Name: recursive tree structure\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "document chunks", + "entity_type": "DATASET_OR_CORPUS", + "description": "document chunks are the units iteratively clustered by raptor", + "source_ids": [ + 15 + ], + "id": "Name: document chunks\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "fine grained semantic information", + "entity_type": "CONCEPT", + "description": "fine grained semantic information is a type of data captured by raptor across the corpus", + "source_ids": [ + 15 + ], + "id": "Name: fine grained semantic information\nType: CONCEPT" + }, + { + "entity_name": "high level semantic information", + "entity_type": "CONCEPT", + "description": "high level semantic 
information is a type of data captured by raptor across the corpus", + "source_ids": [ + 15 + ], + "id": "Name: high level semantic information\nType: CONCEPT" + }, + { + "entity_name": "representative methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "representative methods are the existing techniques being compared against bookrag in the text", + "source_ids": [ + 16 + ], + "id": "Name: representative methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "table: cref='#/texts/17'...", + "entity_type": "TABLE", + "description": "A data table described as: cref='#/texts/17'", + "source_ids": [ + 17 + ], + "id": "Name: table: cref='#/texts/17'...\nType: TABLE" + }, + { + "entity_name": "texts reference", + "entity_type": "SECTION_TITLE", + "description": "A reference identifier pointing to a specific text location within a document structure, indicated by the cref attribute '#/texts/17'.", + "source_ids": [ + 17 + ], + "id": "Name: texts reference\nType: SECTION_TITLE" + }, + { + "entity_name": "layout aware segmentation", + "entity_type": "TASK_OR_PROBLEM", + "description": "layout aware segmentation is a paradigm that parses documents into structured blocks to preserve original layout and information", + "source_ids": [ + 18 + ], + "id": "Name: layout aware segmentation\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "docetl", + "entity_type": "SOFTWARE", + "description": "Docetl is a state-of-the-art, declarative system designed for complex document processing and information extraction tasks. 
It provides a user-friendly interface for defining LLM-based processing pipelines and introduces an agentic framework to optimize these workflows, serving as a comprehensive Document Extraction, Transformation, and Loading tool.", + "source_ids": [ + 32, + 18, + 148, + 159 + ], + "id": "Name: docetl\nType: SOFTWARE" + }, + { + "entity_name": "multimodal retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "multimodal retrieval is a technique applied to obtain relevant content from blocks with multimodal characteristics", + "source_ids": [ + 18 + ], + "id": "Name: multimodal retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "paragraphs", + "entity_type": "TASK_OR_PROBLEM", + "description": "paragraphs are structural blocks within a document preserved by layout aware segmentation", + "source_ids": [ + 18 + ], + "id": "Name: paragraphs\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "tables", + "entity_type": "TASK_OR_PROBLEM", + "description": "tables are structural blocks within a document preserved by layout aware segmentation", + "source_ids": [ + 18 + ], + "id": "Name: tables\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "figures", + "entity_type": "TASK_OR_PROBLEM", + "description": "figures are structural blocks within a document preserved by layout aware segmentation", + "source_ids": [ + 18 + ], + "id": "Name: figures\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "equations", + "entity_type": "TASK_OR_PROBLEM", + "description": "equations are structural blocks within a document preserved by layout aware segmentation", + "source_ids": [ + 18 + ], + "id": "Name: equations\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "first paradigm", + "entity_type": "TASK_OR_PROBLEM", + "description": "the first paradigm is a method that uses fixed chunk sizes often leading to fragmented information", + "source_ids": [ + 18 + ], + "id": "Name: first paradigm\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "second paradigm", + 
"entity_type": "TASK_OR_PROBLEM", + "description": "the second paradigm refers to layout aware segmentation which preserves document structure", + "source_ids": [ + 18 + ], + "id": "Name: second paradigm\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "declarative interface", + "entity_type": "SOFTWARE", + "description": "the declarative interface is a feature provided by docetl that allows users to define processing pipelines", + "source_ids": [ + 18 + ], + "id": "Name: declarative interface\nType: SOFTWARE" + }, + { + "entity_name": "processing pipelines", + "entity_type": "TASK_OR_PROBLEM", + "description": "processing pipelines are sequences of operations defined by users to analyze retrieved blocks", + "source_ids": [ + 18 + ], + "id": "Name: processing pipelines\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "llm based processing pipelines", + "entity_type": "TASK_OR_PROBLEM", + "description": "llm based processing pipelines are pipelines that utilize large language models for analysis", + "source_ids": [ + 18 + ], + "id": "Name: llm based processing pipelines\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "llm powered operations", + "entity_type": "TASK_OR_PROBLEM", + "description": "llm powered operations are the specific tasks combined within the processing pipelines", + "source_ids": [ + 18 + ], + "id": "Name: llm powered operations\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "task specific optimizations", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "task specific optimizations are enhancements applied to the pipelines for specific tasks", + "source_ids": [ + 18 + ], + "id": "Name: task specific optimizations\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "fixed chunk size", + "entity_type": "MEASUREMENT", + "description": "fixed chunk size is a parameter used in the first paradigm that can cause information fragmentation", + "source_ids": [ + 18 + ], + "id": "Name: fixed chunk size\nType: MEASUREMENT" + }, + { + 
"entity_name": "document native structural information", + "entity_type": "CONCEPT", + "description": "document native structural information is the data retained by layout aware segmentation", + "source_ids": [ + 18 + ], + "id": "Name: document native structural information\nType: CONCEPT" + }, + { + "entity_name": "relevant content", + "entity_type": "CONCEPT", + "description": "relevant content is the information obtained through multimodal retrieval to answer queries", + "source_ids": [ + 18 + ], + "id": "Name: relevant content\nType: CONCEPT" + }, + { + "entity_name": "queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "Queries are the diverse types of tasks that the system is designed to handle, serving as the questions or requests for which relevant content is retrieved. They act as the inputs that the agent-based retrieval approach dynamically classifies and for which the workflow is applied, either with or without planning.", + "source_ids": [ + 18, + 26, + 172, + 166 + ], + "id": "Name: queries\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "l1", + "entity_type": "TASK_OR_PROBLEM", + "description": "l1 is a limitation of existing works described as the failure to capture the deep connection of document structure and semantics", + "source_ids": [ + 19 + ], + "id": "Name: l1\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "l2", + "entity_type": "TASK_OR_PROBLEM", + "description": "l2 is a limitation of existing works described as the static nature of query workflows", + "source_ids": [ + 19 + ], + "id": "Name: l2\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "text based approaches", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "text based approaches are methods that cannot capture the structural layout of the document", + "source_ids": [ + 19 + ], + "id": "Name: text based approaches\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "layout segmented methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": 
"layout segmented methods are approaches that preserve document structure but fail to capture relationships between different blocks", + "source_ids": [ + 19 + ], + "id": "Name: layout segmented methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "real world qa scenarios", + "entity_type": "EVENT", + "description": "real world qa scenarios are contexts where user queries are highly heterogeneous", + "source_ids": [ + 19 + ], + "id": "Name: real world qa scenarios\nType: EVENT" + }, + { + "entity_name": "static or manually predefined workflows", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "static or manually predefined workflows are uniform strategies applied to diverse query needs", + "source_ids": [ + 19 + ], + "id": "Name: static or manually predefined workflows\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "question decomposition", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "question decomposition is a method required for complex queries", + "source_ids": [ + 19 + ], + "id": "Name: question decomposition\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "document", + "entity_type": "PRODUCT", + "description": "A document is the object whose structure and semantics are being analyzed, represented as a sequence of pages containing content blocks organized within a logical chapter hierarchy. It serves as the source material containing logical hierarchy, entities, and relations, and is the complex object within which the bookindex captures these logical structures and entity relations. 
Additionally, a document is the object being organized into a hierarchical structure by the tree component.", + "source_ids": [ + 37, + 47, + 19, + 51, + 52 + ], + "id": "Name: document\nType: PRODUCT" + }, + { + "entity_name": "tables", + "entity_type": "TABLE", + "description": "Tables are document elements that serve as examples of hierarchical blocks nested within specific sections of a document and function as nodes within the document's explicit logical hierarchy. They are specific PDF blocks labeled to establish ground truth and act as the source from which large language models generate global questions.", + "source_ids": [ + 144, + 51, + 19, + 141 + ], + "id": "Name: tables\nType: TABLE" + }, + { + "entity_name": "section", + "entity_type": "SECTION_TITLE", + "description": "A section is a structural part of a document, often used as a filter type for components like chapters, where tables may be nested and where the formalization and review described in the text are contained.", + "source_ids": [ + 19, + 258, + 35 + ], + "id": "Name: section\nType: SECTION_TITLE" + }, + { + "entity_name": "user queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "User queries are inputs in real-world QA scenarios that range from simple to complex, serving as the input items that an agent classifies based on intent and complexity.", + "source_ids": [ + 19, + 22 + ], + "id": "Name: user queries\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "keyword lookups", + "entity_type": "TASK_OR_PROBLEM", + "description": "keyword lookups are simple types of user queries mentioned in the text", + "source_ids": [ + 19 + ], + "id": "Name: keyword lookups\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "multi hop questions", + "entity_type": "TASK_OR_PROBLEM", + "description": "multi hop questions are complex queries requiring evidence synthesis across different document parts", + "source_ids": [ + 19 + ], + "id": "Name: multi hop questions\nType: TASK_OR_PROBLEM" + }, + 
{ + "entity_name": "evidence", + "entity_type": "CONCEPT", + "description": "Evidence refers to information scattered across different parts of a document that is needed for multi-hop reasoning, as well as the highly relevant information located by the reasoner.", + "source_ids": [ + 19, + 22 + ], + "id": "Name: evidence\nType: CONCEPT" + }, + { + "entity_name": "hierarchical blocks", + "entity_type": "CONCEPT", + "description": "hierarchical blocks are structural elements of a document containing relationships", + "source_ids": [ + 19 + ], + "id": "Name: hierarchical blocks\nType: CONCEPT" + }, + { + "entity_name": "multi hop reasoning", + "entity_type": "TASK_OR_PROBLEM", + "description": "Multi hop reasoning is a task that relies on a high quality knowledge graph, yet its capability is limited by methods that cannot capture relationships between document blocks.", + "source_ids": [ + 19, + 21 + ], + "id": "Name: multi hop reasoning\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "overall performance", + "entity_type": "EVALUATION_METRIC", + "description": "overall performance is the metric affected by the limitations of existing methods", + "source_ids": [ + 19 + ], + "id": "Name: overall performance\nType: EVALUATION_METRIC" + }, + { + "entity_name": "complex queries", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 19 + ], + "id": "Name: complex queries\nType: UNKNOWN" + }, + { + "entity_name": "simple queries", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 19 + ], + "id": "Name: simple queries\nType: UNKNOWN" + }, + { + "entity_name": "bookrag", + "entity_type": "TECHNOLOGY", + "description": "bookrag is a retrieval augmented generation method introduced to bridge a gap in document qa tasks", + "source_ids": [ + 20 + ], + "id": "Name: bookrag\nType: TECHNOLOGY" + }, + { + "entity_name": "bookindex", + "entity_type": "PRODUCT", + "description": "Bookindex is a document-native structure and product developed by 
BookRag to organize information within complex documents using hierarchical and graph-based methods. Defined as a formally specified triplet structure composed of a tree (t), a graph (g), and metadata (m), it serves as the foundation for an agent-based retrieval approach by capturing logical hierarchy and entity relations. Presented in section 4, this system illustrates a final output data structure that combines organized tree and graph representations, where the graph links help complete the overall structure. As a proposed hierarchical structure-aware index, bookindex acts as the source of content ranges that selector operators filter, with specific operators designed to manipulate the tuple (t, g, m) for effective information retrieval.", + "source_ids": [ + 97, + 102, + 77, + 47, + 49, + 51, + 20, + 52, + 22, + 25, + 29 + ], + "id": "Name: bookindex\nType: PRODUCT" + }, + { + "entity_name": "document qa tasks", + "entity_type": "TASK_OR_PROBLEM", + "description": "Document QA tasks are the specific problems that BookRag and BookIndex are designed to address, serving as the benchmark on which the efficiency and accuracy of BookRag and baseline methods are compared.", + "source_ids": [ + 137, + 20 + ], + "id": "Name: document qa tasks\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "hierarchical tree structure", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the hierarchical tree structure is a method used to preserve the document s native logical hierarchy by organizing parsed content blocks", + "source_ids": [ + 20 + ], + "id": "Name: hierarchical tree structure\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "kg", + "entity_type": "TECHNOLOGY", + "description": "kg refers to a knowledge graph constructed to capture intricate relations within document blocks", + "source_ids": [ + 20 + ], + "id": "Name: kg\nType: TECHNOLOGY" + }, + { + "entity_name": "table of contents", + "entity_type": "PRODUCT", + "description": "the table of contents is 
the role served by the hierarchical tree structure in organizing the document s logical hierarchy", + "source_ids": [ + 20 + ], + "id": "Name: table of contents\nType: PRODUCT" + }, + { + "entity_name": "parsed content blocks", + "entity_type": "MATERIAL", + "description": "parsed content blocks are the units of document content organized into a hierarchical tree structure", + "source_ids": [ + 20 + ], + "id": "Name: parsed content blocks\nType: MATERIAL" + }, + { + "entity_name": "fine grained entities", + "entity_type": "DATASET_OR_CORPUS", + "description": "fine grained entities are the specific data points contained within the document blocks that are captured by the knowledge graph", + "source_ids": [ + 20 + ], + "id": "Name: fine grained entities\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "relation", + "entity_type": "CONCEPT", + "description": "the relation refers to the deep connections within the document that the method aims to capture", + "source_ids": [ + 20 + ], + "id": "Name: relation\nType: CONCEPT" + }, + { + "entity_name": "tree nodes", + "entity_type": "PRODUCT", + "description": "tree nodes are the specific components of the hierarchical tree structure to which kg entities are mapped", + "source_ids": [ + 20 + ], + "id": "Name: tree nodes\nType: PRODUCT" + }, + { + "entity_name": "kg", + "entity_type": "CONCEPT", + "description": "kg refers to a knowledge graph which is a data structure used for multi hop reasoning", + "source_ids": [ + 21 + ], + "id": "Name: kg\nType: CONCEPT" + }, + { + "entity_name": "llm", + "entity_type": "PRODUCT", + "description": "llm is an example of a distinct entity name mentioned in the context of entity ambiguity", + "source_ids": [ + 21 + ], + "id": "Name: llm\nType: PRODUCT" + }, + { + "entity_name": "large language model", + "entity_type": "PRODUCT", + "description": "large language model is an example of a distinct entity name mentioned in the context of entity ambiguity", + "source_ids": [ + 21 + ], + 
"id": "Name: large language model\nType: PRODUCT" + }, + { + "entity_name": "gradient based entity resolution method", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The gradient based entity resolution method is a novel approach proposed to address entity ambiguity by analyzing similarity distributions and is used to refine entity knowledge during graph construction.", + "source_ids": [ + 21, + 47 + ], + "id": "Name: gradient based entity resolution method\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "entity ambiguity", + "entity_type": "TASK_OR_PROBLEM", + "description": "entity ambiguity is a problem where distinct entities share similar names compromising the knowledge graph", + "source_ids": [ + 21 + ], + "id": "Name: entity ambiguity\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "similarity distribution", + "entity_type": "CONCEPT", + "description": "similarity distribution is the data pattern analyzed by the proposed method to identify sharp drops in scores", + "source_ids": [ + 21 + ], + "id": "Name: similarity distribution\nType: CONCEPT" + }, + { + "entity_name": "candidate entities", + "entity_type": "CONCEPT", + "description": "candidate entities are the potential matches analyzed to distinguish and merge coreferent entities", + "source_ids": [ + 21 + ], + "id": "Name: candidate entities\nType: CONCEPT" + }, + { + "entity_name": "coreferent entities", + "entity_type": "CONCEPT", + "description": "coreferent entities are distinct entities that refer to the same real world object and need to be merged", + "source_ids": [ + 21 + ], + "id": "Name: coreferent entities\nType: CONCEPT" + }, + { + "entity_name": "graph connectivity", + "entity_type": "CONCEPT", + "description": "graph connectivity is a property of the knowledge graph that is ensured by the proposed method", + "source_ids": [ + 21 + ], + "id": "Name: graph connectivity\nType: CONCEPT" + }, + { + "entity_name": "reasoning capabilities", + "entity_type": "CONCEPT", + 
"description": "reasoning capabilities are the skills of the system that are enhanced by the proposed method", + "source_ids": [ + 21 + ], + "id": "Name: reasoning capabilities\nType: CONCEPT" + }, + { + "entity_name": "selector", + "entity_type": "SOFTWARE", + "description": "The selector is a software component and operator within BookRag designed to narrow the search space by utilizing information scents, effectively navigating to and refining the search to a precise information patch.", + "source_ids": [ + 124, + 157, + 22 + ], + "id": "Name: selector\nType: SOFTWARE" + }, + { + "entity_name": "reasoner", + "entity_type": "SOFTWARE", + "description": "The reasoner is a software component within BookRag that functions as an operator to perform sensemaking within information patches. It locates highly relevant evidence and conducts analysis on the selected information to support its operations.", + "source_ids": [ + 124, + 157, + 22 + ], + "id": "Name: reasoner\nType: SOFTWARE" + }, + { + "entity_name": "retrieval workflows", + "entity_type": "TASK_OR_PROBLEM", + "description": "Retrieval workflows are the static processes being addressed and dynamically generated by the agent, as well as the processes configured by the approach to locate evidence.", + "source_ids": [ + 26, + 22 + ], + "id": "Name: retrieval workflows\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "search space", + "entity_type": "TASK_OR_PROBLEM", + "description": "the search space is the area narrowed down by the selector component", + "source_ids": [ + 22 + ], + "id": "Name: search space\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "information scents", + "entity_type": "CONCEPT", + "description": "Information scents are signals used by a selector to narrow down the search space, described as cues such as key entities in a question followed by selector operators.", + "source_ids": [ + 125, + 22 + ], + "id": "Name: information scents\nType: CONCEPT" + }, + { + "entity_name": "agent", + 
"entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 22 + ], + "id": "Name: agent\nType: UNKNOWN" + }, + { + "entity_name": "kg", + "entity_type": "PRODUCT", + "description": "kg refers to a high quality knowledge graph identified as a key feature contributing to the system s performance", + "source_ids": [ + 23 + ], + "id": "Name: kg\nType: PRODUCT" + }, + { + "entity_name": "agent based retrieval mechanism", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the agent based retrieval mechanism is a key feature of the system validated for its critical contributions", + "source_ids": [ + 23 + ], + "id": "Name: agent based retrieval mechanism\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "three widely adopted datasets", + "entity_type": "DATASET_OR_CORPUS", + "description": "three widely adopted datasets are the data sources used to conduct extensive experiments and validate the system", + "source_ids": [ + 23 + ], + "id": "Name: three widely adopted datasets\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "state of the art baselines", + "entity_type": "PRODUCT", + "description": "State of the art baselines are the existing systems used for comparison in the evaluation of Bookrag, serving as the reference against which Bookrag is compared in the experiments.", + "source_ids": [ + 151, + 23 + ], + "id": "Name: state of the art baselines\nType: PRODUCT" + }, + { + "entity_name": "our contributions", + "entity_type": "TASK_OR_PROBLEM", + "description": "our contributions refers to the summary of work or achievements presented in the text", + "source_ids": [ + 24 + ], + "id": "Name: our contributions\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "hierarchical tree", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "a hierarchical tree of document layout blocks is integrated by bookrag to construct the bookindex", + "source_ids": [ + 25 + ], + "id": "Name: hierarchical tree\nType: MODEL_OR_ARCHITECTURE" + }, + { + 
"entity_name": "kg", + "entity_type": "SOFTWARE", + "description": "kg is a knowledge graph that stores fine-grained entity relations and is used in the BookRAG method. It serves as a possible form of structured index during the offline indexing phase, where entities are linked together to form a cohesive network of information.", + "source_ids": [ + 25, + 98, + 45 + ], + "id": "Name: kg\nType: SOFTWARE" + }, + { + "entity_name": "document layout blocks", + "entity_type": "MATERIAL", + "description": "document layout blocks are the structural components of a document that are organized into a hierarchical tree", + "source_ids": [ + 25 + ], + "id": "Name: document layout blocks\nType: MATERIAL" + }, + { + "entity_name": "entity relations", + "entity_type": "CONCEPT", + "description": "Entity relations are the fine-grained connections between entities stored within a knowledge graph and also represent the intricate connections within complex documents that the bookindex is designed to capture.", + "source_ids": [ + 25, + 47 + ], + "id": "Name: entity relations\nType: CONCEPT" + }, + { + "entity_name": "agent based retrieval", + "entity_type": "TASK_OR_PROBLEM", + "description": "Agent based retrieval is a proposed approach and workflow designed to address users' queries systematically by dynamically classifying queries and configuring retrieval workflows.", + "source_ids": [ + 81, + 26 + ], + "id": "Name: agent based retrieval\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "documents", + "entity_type": "DATASET_OR_CORPUS", + "description": "documents are the source material within which highly relevant evidence is located", + "source_ids": [ + 26 + ], + "id": "Name: documents\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "evidence", + "entity_type": "TASK_OR_PROBLEM", + "description": "evidence refers to the highly relevant information sought within the documents", + "source_ids": [ + 26 + ], + "id": "Name: evidence\nType: TASK_OR_PROBLEM" + }, + { + 
"entity_name": "existing baselines", + "entity_type": "PRODUCT", + "description": "Existing baselines are the current methods or systems that BookRag outperforms in experiments and performance, characterized by their susceptibility to context fragmentation and reliance on static query workflows.", + "source_ids": [ + 152, + 27, + 188 + ], + "id": "Name: existing baselines\nType: PRODUCT" + }, + { + "entity_name": "complex document qa tasks", + "entity_type": "TASK_OR_PROBLEM", + "description": "Complex document QA tasks are the specific problems that BookRag is designed to solve, representing the general category of problems addressed by the three benchmarks and the specific problems being solved by the methods in the comparison.", + "source_ids": [ + 153, + 27, + 141 + ], + "id": "Name: complex document qa tasks\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "extensive experiments", + "entity_type": "EVENT", + "description": "extensive experiments are the tests conducted to evaluate the performance of bookrag", + "source_ids": [ + 27 + ], + "id": "Name: extensive experiments\nType: EVENT" + }, + { + "entity_name": "multiple benchmarks", + "entity_type": "BENCHMARK", + "description": "multiple benchmarks are the evaluation standards used in the experiments to measure performance", + "source_ids": [ + 27 + ], + "id": "Name: multiple benchmarks\nType: BENCHMARK" + }, + { + "entity_name": "state of the art performance", + "entity_type": "EVALUATION_METRIC", + "description": "state of the art performance is the high level of achievement attained by bookrag in the tasks", + "source_ids": [ + 27 + ], + "id": "Name: state of the art performance\nType: EVALUATION_METRIC" + }, + { + "entity_name": "competitive efficiency", + "entity_type": "EVALUATION_METRIC", + "description": "competitive efficiency is a metric indicating that bookrag maintains good efficiency while performing well", + "source_ids": [ + 27 + ], + "id": "Name: competitive efficiency\nType: 
EVALUATION_METRIC" + }, + { + "entity_name": "2", + "entity_type": "NUMBER", + "description": "2 is a numerical value appearing in the text though its specific context or meaning is not defined", + "source_ids": [ + 28 + ], + "id": "Name: 2\nType: NUMBER" + }, + { + "entity_name": "section 2", + "entity_type": "SECTION_TITLE", + "description": "section 2 is the part of the text where related work is reviewed", + "source_ids": [ + 29 + ], + "id": "Name: section 2\nType: SECTION_TITLE" + }, + { + "entity_name": "section 3", + "entity_type": "SECTION_TITLE", + "description": "section 3 introduces the problem formulation ift and rag workflow", + "source_ids": [ + 29 + ], + "id": "Name: section 3\nType: SECTION_TITLE" + }, + { + "entity_name": "ift", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "IFT is a method or technique introduced in section 3 alongside problem formulation and the RAG workflow. It serves as a principle within the structured execution mechanism of BookRAG, ensuring that the execution aligns with its intended design.", + "source_ids": [ + 125, + 29, + 79 + ], + "id": "Name: ift\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "section 4", + "entity_type": "SECTION_TITLE", + "description": "section 4 presents the structure of bookindex and its construction", + "source_ids": [ + 29 + ], + "id": "Name: section 4\nType: SECTION_TITLE" + }, + { + "entity_name": "section 6", + "entity_type": "SECTION_TITLE", + "description": "section 6 presents experimental results and detailed analysis", + "source_ids": [ + 29 + ], + "id": "Name: section 6\nType: SECTION_TITLE" + }, + { + "entity_name": "section 7", + "entity_type": "SECTION_TITLE", + "description": "section 7 is where the paper concludes", + "source_ids": [ + 29 + ], + "id": "Name: section 7\nType: SECTION_TITLE" + }, + { + "entity_name": "section 5", + "entity_type": "SECTION_TITLE", + "description": "section 5 is the part of the text where agent based retrieval is presented", + 
"source_ids": [ + 29 + ], + "id": "Name: section 5\nType: SECTION_TITLE" + }, + { + "entity_name": "query classification", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Query classification is a component of the agent-based retrieval elaborated in section 5 and serves as a step within the classification plan stage that categorizes queries.", + "source_ids": [ + 82, + 29 + ], + "id": "Name: query classification\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "operators", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "operators are used in the structured execution of bookrag as described in section 5", + "source_ids": [ + 29 + ], + "id": "Name: operators\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "structured execution", + "entity_type": "TASK_OR_PROBLEM", + "description": "structured execution refers to the process in bookrag that utilizes query classification and operators", + "source_ids": [ + 29 + ], + "id": "Name: structured execution\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "related work", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 29 + ], + "id": "Name: related work\nType: UNKNOWN" + }, + { + "entity_name": "experimental results", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 29 + ], + "id": "Name: experimental results\nType: UNKNOWN" + }, + { + "entity_name": "conclusion", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 29 + ], + "id": "Name: conclusion\nType: UNKNOWN" + }, + { + "entity_name": "2 related work", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of the main paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section provides a comprehensive review of existing literature, specifically focusing on Retrieval-Augmented Generation (RAG) methods and their limitations regarding hierarchical document structures.", + "source_ids": [ + 30 + 
], + "id": "Name: 2 related work\nType: SECTION_TITLE" + }, + { + "entity_name": "retrieval-augmented generation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Refers to the class of techniques discussed in section 2 that enhance Large Language Models by querying external information, serving as the primary context for the related work analysis.", + "source_ids": [ + 30 + ], + "id": "Name: retrieval-augmented generation\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "hierarchical document structures", + "entity_type": "TASK_OR_PROBLEM", + "description": "Refers to the specific structural characteristics of documents (e.g., books, handbooks) that existing RAG approaches often overlook, which is a key problem addressed in the literature review within section 2.", + "source_ids": [ + 30 + ], + "id": "Name: hierarchical document structures\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "llm", + "entity_type": "TECHNOLOGY", + "description": "LLM, which stands for large language model, is a technology primarily used in document analysis for robust semantic reasoning and finer-grained distinctions when identifying multiple similar entities. It functions as an operator within the formulator category and serves as a system to select specific sections in described processes. Additionally, LLM is utilized by treetraverse to navigate a document's tree structure and is noted for its generation nature, which is described as uncontrollable. 
The term is also recognized as an abbreviation for large language model in the context of entity name matching.", + "source_ids": [ + 32, + 98, + 227, + 104, + 74, + 267, + 148, + 31 + ], + "id": "Name: llm\nType: TECHNOLOGY" + }, + { + "entity_name": "rag approaches", + "entity_type": "TECHNOLOGY", + "description": "rag approaches are modern representative technologies reviewed in the text", + "source_ids": [ + 31 + ], + "id": "Name: rag approaches\nType: TECHNOLOGY" + }, + { + "entity_name": "document analysis", + "entity_type": "RESEARCH_FIELD", + "description": "document analysis is the field of study where llms and rag approaches are applied", + "source_ids": [ + 31 + ], + "id": "Name: document analysis\nType: RESEARCH_FIELD" + }, + { + "entity_name": "related works", + "entity_type": "SECTION_TITLE", + "description": "related works is the section of the text where the review of llm and rag approaches takes place", + "source_ids": [ + 31 + ], + "id": "Name: related works\nType: SECTION_TITLE" + }, + { + "entity_name": "html", + "entity_type": "FILE_TYPE", + "description": "html is an unstructured document format mentioned as a target for conversion into structured formats", + "source_ids": [ + 32 + ], + "id": "Name: html\nType: FILE_TYPE" + }, + { + "entity_name": "pdf", + "entity_type": "FILE_TYPE", + "description": "PDF is an unstructured document format that is often mentioned as a target for conversion into structured formats and is also referenced in the context of parsing failures.", + "source_ids": [ + 32, + 236 + ], + "id": "Name: pdf\nType: FILE_TYPE" + }, + { + "entity_name": "raw text", + "entity_type": "FILE_TYPE", + "description": "raw text is an unstructured document format mentioned as a target for conversion into structured formats", + "source_ids": [ + 32 + ], + "id": "Name: raw text\nType: FILE_TYPE" + }, + { + "entity_name": "relational tables", + "entity_type": "PRODUCT", + "description": "relational tables are structured formats that 
unstructured documents are converted into", + "source_ids": [ + 32 + ], + "id": "Name: relational tables\nType: PRODUCT" + }, + { + "entity_name": "evaporate", + "entity_type": "SOFTWARE", + "description": "evaporate is a system that utilizes llms to synthesize extraction code for converting semi structured web documents", + "source_ids": [ + 32 + ], + "id": "Name: evaporate\nType: SOFTWARE" + }, + { + "entity_name": "lotus", + "entity_type": "SOFTWARE", + "description": "lotus is a system that extends the relational model with semantic operators for querying unstructured text corpora", + "source_ids": [ + 32 + ], + "id": "Name: lotus\nType: SOFTWARE" + }, + { + "entity_name": "sql", + "entity_type": "PROGRAMMING_LANGUAGE", + "description": "sql is a query language referenced in the context of sql like queries executed by lotus", + "source_ids": [ + 32 + ], + "id": "Name: sql\nType: PROGRAMMING_LANGUAGE" + }, + { + "entity_name": "web documents", + "entity_type": "PRODUCT", + "description": "web documents are semi structured documents processed by systems like evaporate", + "source_ids": [ + 32 + ], + "id": "Name: web documents\nType: PRODUCT" + }, + { + "entity_name": "document pages", + "entity_type": "IMAGE", + "description": "document pages are viewed as images in research to preserve layout and visual information", + "source_ids": [ + 32 + ], + "id": "Name: document pages\nType: IMAGE" + }, + { + "entity_name": "semantic operators", + "entity_type": "TECHNOLOGY", + "description": "semantic operators are features added by lotus to extend the relational model", + "source_ids": [ + 32 + ], + "id": "Name: semantic operators\nType: TECHNOLOGY" + }, + { + "entity_name": "predicates", + "entity_type": "TECHNOLOGY", + "description": "predicates are llm powered functions like filter and join used in lotus", + "source_ids": [ + 32 + ], + "id": "Name: predicates\nType: TECHNOLOGY" + }, + { + "entity_name": "filter", + "entity_type": "TECHNOLOGY", + "description": "filter 
is an example of an llm powered predicate used in lotus", + "source_ids": [ + 32 + ], + "id": "Name: filter\nType: TECHNOLOGY" + }, + { + "entity_name": "join", + "entity_type": "TECHNOLOGY", + "description": "join is an example of an llm powered predicate used in lotus", + "source_ids": [ + 32 + ], + "id": "Name: join\nType: TECHNOLOGY" + }, + { + "entity_name": "agentic framework", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "an agentic framework is introduced by docetl to optimize information extraction", + "source_ids": [ + 32 + ], + "id": "Name: agentic framework\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "information extraction", + "entity_type": "TASK_OR_PROBLEM", + "description": "information extraction is the complex task optimized by docetl", + "source_ids": [ + 32 + ], + "id": "Name: information extraction\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "layout", + "entity_type": "CONCEPT", + "description": "layout refers to the visual structure of documents preserved when viewing pages as images", + "source_ids": [ + 32 + ], + "id": "Name: layout\nType: CONCEPT" + }, + { + "entity_name": "visual information", + "entity_type": "CONCEPT", + "description": "visual information refers to the content preserved when document pages are viewed as images", + "source_ids": [ + 32 + ], + "id": "Name: visual information\nType: CONCEPT" + }, + { + "entity_name": "semi structured web documents", + "entity_type": "PRODUCT", + "description": "semi structured web documents are the input type for evaporate", + "source_ids": [ + 32 + ], + "id": "Name: semi structured web documents\nType: PRODUCT" + }, + { + "entity_name": "structured databases", + "entity_type": "PRODUCT", + "description": "structured databases are the output format produced by evaporate", + "source_ids": [ + 32 + ], + "id": "Name: structured databases\nType: PRODUCT" + }, + { + "entity_name": "manual annotation", + "entity_type": "TASK_OR_PROBLEM", + "description": "manual 
annotation is a heavy process avoided by evaporate s cost effective conversion", + "source_ids": [ + 32 + ], + "id": "Name: manual annotation\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "unstructured text corpora", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 32 + ], + "id": "Name: unstructured text corpora\nType: UNKNOWN" + }, + { + "entity_name": "rag approaches", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "rag approaches are methods proven to excel in tasks like question answering and data cleaning", + "source_ids": [ + 33 + ], + "id": "Name: rag approaches\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "open ended question answering", + "entity_type": "TASK_OR_PROBLEM", + "description": "open ended question answering is a task where rag approaches have been proven to excel", + "source_ids": [ + 33 + ], + "id": "Name: open ended question answering\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "programming context", + "entity_type": "TASK_OR_PROBLEM", + "description": "programming context is a task where rag approaches have been proven to excel", + "source_ids": [ + 33 + ], + "id": "Name: programming context\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "sql rewrite", + "entity_type": "TASK_OR_PROBLEM", + "description": "sql rewrite is a task where rag approaches have been proven to excel", + "source_ids": [ + 33 + ], + "id": "Name: sql rewrite\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "data cleaning", + "entity_type": "TASK_OR_PROBLEM", + "description": "data cleaning is a task where rag approaches have been proven to excel", + "source_ids": [ + 33 + ], + "id": "Name: data cleaning\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "naive rag technique", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the naive rag technique relies on retrieving query relevant contexts from external knowledge bases to mitigate hallucination", + "source_ids": [ + 33 + ], + "id": "Name: naive rag 
technique\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "graph structures", + "entity_type": "TECHNOLOGY", + "description": "graph structures are adopted by many rag approaches to organize information and relationships within documents", + "source_ids": [ + 33 + ], + "id": "Name: graph structures\nType: TECHNOLOGY" + }, + { + "entity_name": "agentic rag paradigm", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the agentic rag paradigm employs autonomous agents to dynamically orchestrate and refine the rag pipeline", + "source_ids": [ + 33 + ], + "id": "Name: agentic rag paradigm\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "autonomous agents", + "entity_type": "TECHNOLOGY", + "description": "autonomous agents are employed by the agentic rag paradigm to orchestrate and refine the pipeline", + "source_ids": [ + 33 + ], + "id": "Name: autonomous agents\nType: TECHNOLOGY" + }, + { + "entity_name": "rag pipeline", + "entity_type": "TASK_OR_PROBLEM", + "description": "the rag pipeline is the process dynamically orchestrated and refined by the agentic rag paradigm", + "source_ids": [ + 33 + ], + "id": "Name: rag pipeline\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "recent survey of graph based rag methods", + "entity_type": "PUBLICATION_VENUE", + "description": "a recent survey of graph based rag methods is referenced for more details on the topic", + "source_ids": [ + 33 + ], + "id": "Name: recent survey of graph based rag methods\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "external knowledge bases", + "entity_type": "TECHNOLOGY", + "description": "external knowledge bases are sources from which the naive rag technique retrieves query relevant contexts", + "source_ids": [ + 33 + ], + "id": "Name: external knowledge bases\nType: TECHNOLOGY" + }, + { + "entity_name": "hallucination", + "entity_type": "TASK_OR_PROBLEM", + "description": "hallucination is a problem in llms that the naive rag technique aims to mitigate", + 
"source_ids": [ + 33 + ], + "id": "Name: hallucination\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "overall retrieval performance", + "entity_type": "EVALUATION_METRIC", + "description": "overall retrieval performance is improved by rag approaches that adopt graph structures", + "source_ids": [ + 33 + ], + "id": "Name: overall retrieval performance\nType: EVALUATION_METRIC" + }, + { + "entity_name": "reasoning robustness", + "entity_type": "EVALUATION_METRIC", + "description": "reasoning robustness is a metric significantly boosted by the agentic rag paradigm", + "source_ids": [ + 33 + ], + "id": "Name: reasoning robustness\nType: EVALUATION_METRIC" + }, + { + "entity_name": "generation fidelity", + "entity_type": "EVALUATION_METRIC", + "description": "generation fidelity is a metric significantly boosted by the agentic rag paradigm", + "source_ids": [ + 33 + ], + "id": "Name: generation fidelity\nType: EVALUATION_METRIC" + }, + { + "entity_name": "documents", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 33 + ], + "id": "Name: documents\nType: UNKNOWN" + }, + { + "entity_name": "3 preliminaries", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of the main paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section establishes the foundational concepts, definitions, and theoretical background necessary to understand the proposed BookRAG method and its context within Retrieval-Augmented Generation (RAG) for hierarchical documents.", + "source_ids": [ + 34 + ], + "id": "Name: 3 preliminaries\nType: SECTION_TITLE" + }, + { + "entity_name": "ift", + "entity_type": "SCIENTIFIC_THEORY", + "description": "ift is an abbreviation for information foraging theory a foundational theory introduced in the text", + "source_ids": [ + 35 + ], + "id": "Name: ift\nType: SCIENTIFIC_THEORY" + }, + { + "entity_name": "rag systems", + "entity_type": 
"TECHNOLOGY", + "description": "Rag systems are a type of technology known as Retrieval-Augmented Generation systems, and their general workflow is reviewed in the text, with detailed analysis provided in section 3.3.", + "source_ids": [ + 35, + 44 + ], + "id": "Name: rag systems\nType: TECHNOLOGY" + }, + { + "entity_name": "research problem", + "entity_type": "TASK_OR_PROBLEM", + "description": "the research problem is the subject being formalized in the text", + "source_ids": [ + 35 + ], + "id": "Name: research problem\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "general workflow", + "entity_type": "TASK_OR_PROBLEM", + "description": "the general workflow of rag systems is the subject being briefly reviewed in the text", + "source_ids": [ + 35 + ], + "id": "Name: general workflow\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "3.1 problem formulation", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Preliminaries' within the BookRAG paper, this section formalizes the research problem of complex document Question Answering (QA) and establishes the foundational context for the proposed approach.", + "source_ids": [ + 36 + ], + "id": "Name: 3.1 problem formulation\nType: SECTION_TITLE" + }, + { + "entity_name": "user query", + "entity_type": "TASK_OR_PROBLEM", + "description": "A user query is an input provided to a system to generate an accurate answer, serving as the input used in the online retrieval phase to retrieve relevant components. 
It represents a task or problem mentioned in the text that signifies a request for information or action, and in real data sections, it is the final input provided, often explicitly represented by the word \"query\" itself.", + "source_ids": [ + 45, + 252, + 37, + 255 + ], + "id": "Name: user query\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "answer", + "entity_type": "TASK_OR_PROBLEM", + "description": "An answer is the final output generated by the system, typically synthesized by an agent using the reduce method and ideally grounded in specific evidence blocks, and is symbolized by a lightbulb icon.", + "source_ids": [ + 84, + 124, + 37, + 135 + ], + "id": "Name: answer\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "evidence blocks", + "entity_type": "DATASET_OR_CORPUS", + "description": "evidence blocks are a specific set of content blocks from the document used to ground the generated answer", + "source_ids": [ + 37 + ], + "id": "Name: evidence blocks\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "method s", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "method s is a developed approach that maps a structured document and a query to a final answer", + "source_ids": [ + 37 + ], + "id": "Name: method s\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "pages", + "entity_type": "MEASUREMENT", + "description": "pages are the units that collectively form a document represented as a sequence", + "source_ids": [ + 37 + ], + "id": "Name: pages\nType: MEASUREMENT" + }, + { + "entity_name": "content blocks", + "entity_type": "DATASET_OR_CORPUS", + "description": "Content blocks are distinct elements within a document, such as text segments, section headers, tables, or images. 
These diverse structural units are identified and extracted from document pages as described in section 4.2.1.", + "source_ids": [ + 37, + 55 + ], + "id": "Name: content blocks\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "text segment", + "entity_type": "DATASET_OR_CORPUS", + "description": "a text segment is a type of content block within a document", + "source_ids": [ + 37 + ], + "id": "Name: text segment\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "section header", + "entity_type": "DATASET_OR_CORPUS", + "description": "a section header is a type of content block within a document", + "source_ids": [ + 37 + ], + "id": "Name: section header\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "table", + "entity_type": "DATASET_OR_CORPUS", + "description": "a table is a type of content block within a document", + "source_ids": [ + 37 + ], + "id": "Name: table\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "image", + "entity_type": "DATASET_OR_CORPUS", + "description": "an image is a type of content block within a document", + "source_ids": [ + 37 + ], + "id": "Name: image\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "logical chapter hierarchy", + "entity_type": "TASK_OR_PROBLEM", + "description": "a logical chapter hierarchy is the organizational structure within which content blocks are arranged", + "source_ids": [ + 37 + ], + "id": "Name: logical chapter hierarchy\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "n", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "The variable n represents different concepts depending on the context: it can denote the number of pages in a document, the set of nodes in a general tree structure, the set of nodes in a specific origin tree t, or the set of nodes in a tree. 
Additionally, n may refer to refined evidence utilized by a synthesizer operator.", + "source_ids": [ + 129, + 37, + 102, + 77, + 51 + ], + "id": "Name: n\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "m", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "m is a variable representing the number of content blocks in a document and serves as a component of the bookindex structure and the bookindex data structure.", + "source_ids": [ + 88, + 37, + 85 + ], + "id": "Name: m\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "p", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "The variable p serves multiple distinct roles depending on the context: it represents a specific page within a document sequence, denotes the power set of nodes in a tree structure used for graph tree link definitions, and signifies precision calculated as the intersection of extracted and ground truth tokens divided by the extracted tokens.", + "source_ids": [ + 51, + 37, + 231 + ], + "id": "Name: p\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "q", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "q is a variable representing the original user query mentioned in the text, which serves as the query or input used by the synthesizer operator.", + "source_ids": [ + 129, + 37, + 101 + ], + "id": "Name: q\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "a", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "a is a variable representing the coherent answer generated by the synthesizer operator.", + "source_ids": [ + 129, + 37 + ], + "id": "Name: a\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "e", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "e is a variable representing a subset of evidence blocks", + "source_ids": [ + 37 + ], + "id": "Name: e\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "b", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "b is a variable representing the sequence of 
all content blocks", + "source_ids": [ + 37 + ], + "id": "Name: b\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "d", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "d is a variable representing the document", + "source_ids": [ + 37 + ], + "id": "Name: d\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "equation 1", + "entity_type": "EQUATION_OR_FORMULA", + "description": "equation 1 is the mathematical formulation defining the task as a s d q", + "source_ids": [ + 37 + ], + "id": "Name: equation 1\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "references 5 11 33", + "entity_type": "PUBLICATION_VENUE", + "description": "references 5 11 and 33 are citations mentioned in the text regarding the problem of question answering", + "source_ids": [ + 37 + ], + "id": "Name: references 5 11 33\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "3", + "entity_type": "MEASUREMENT", + "description": "3 is a numerical value that serves as the issue number of a publication and also represents the starting page number in an example range.", + "source_ids": [ + 258, + 38, + 199 + ], + "id": "Name: 3\nType: MEASUREMENT" + }, + { + "entity_name": "formula (1)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the variable A as a function of D and q. 
LaTeX: 𝐴 = S( 𝐷,𝑞 ) (1)", + "source_ids": [ + 39 + ], + "id": "Name: formula (1)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "s", + "entity_type": "PERSON", + "description": "s is an entity described as needing to navigate sequential page content and logical hierarchy to synthesize a response", + "source_ids": [ + 40 + ], + "id": "Name: s\nType: PERSON" + }, + { + "entity_name": "d", + "entity_type": "TASK_OR_PROBLEM", + "description": "d represents the logical hierarchy that s must navigate to synthesize a response", + "source_ids": [ + 40 + ], + "id": "Name: d\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "3.2 information foraging theory", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Preliminaries' within the BookRAG paper, this section formalizes the foundational Information Foraging Theory (IFT) used to model user behavior in complex document QA tasks.", + "source_ids": [ + 41 + ], + "id": "Name: 3.2 information foraging theory\nType: SECTION_TITLE" + }, + { + "entity_name": "animal foraging", + "entity_type": "TASK_OR_PROBLEM", + "description": "animal foraging is the process used as an analogy to explain how users access information in the context of information foraging theory", + "source_ids": [ + 42 + ], + "id": "Name: animal foraging\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "information scent", + "entity_type": "CONCEPT", + "description": "Information scent refers to the cues, such as keywords, icons, and rich information provided by entities and relations, that guide users and experts in navigating content by leading them toward promising sections.", + "source_ids": [ + 51, + 42, + 43 + ], + "id": "Name: information scent\nType: CONCEPT" + }, + { + "entity_name": "information patches", + "entity_type": "CONCEPT", + "description": "Information patches are clusters of content, such as sections in handbooks, that serve as hierarchical tree nodes providing native contexts for information seeking. 
These promising sections enable users and experts to navigate effectively between different parts of a handbook.", + "source_ids": [ + 51, + 42, + 43 + ], + "id": "Name: information patches\nType: CONCEPT" + }, + { + "entity_name": "handbooks", + "entity_type": "PRODUCT", + "description": "handbooks are mentioned as containing sections that serve as information patches", + "source_ids": [ + 42 + ], + "id": "Name: handbooks\nType: PRODUCT" + }, + { + "entity_name": "keywords", + "entity_type": "CONCEPT", + "description": "keywords are identified as specific examples of information scent cues used by users", + "source_ids": [ + 42 + ], + "id": "Name: keywords\nType: CONCEPT" + }, + { + "entity_name": "icons", + "entity_type": "CONCEPT", + "description": "icons are identified as specific examples of information scent cues used by users", + "source_ids": [ + 42 + ], + "id": "Name: icons\nType: CONCEPT" + }, + { + "entity_name": "sections", + "entity_type": "CONCEPT", + "description": "sections are described as parts of handbooks that function as information patches", + "source_ids": [ + 42 + ], + "id": "Name: sections\nType: CONCEPT" + }, + { + "entity_name": "reference 42", + "entity_type": "PUBLICATION_VENUE", + "description": "reference 42 is the citation source for information foraging theory mentioned in the text", + "source_ids": [ + 42 + ], + "id": "Name: reference 42\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "experts", + "entity_type": "PERSON", + "description": "experts are individuals seeking a solution to a specific problem within a large technical handbook", + "source_ids": [ + 43 + ], + "id": "Name: experts\nType: PERSON" + }, + { + "entity_name": "large technical handbook", + "entity_type": "BOOK", + "description": "the large technical handbook is the source material containing the problem and information patches", + "source_ids": [ + 43 + ], + "id": "Name: large technical handbook\nType: BOOK" + }, + { + "entity_name": "key terms", + 
"entity_type": "CONCEPT", + "description": "key terms are extracted by experts to act as information scent", + "source_ids": [ + 43 + ], + "id": "Name: key terms\nType: CONCEPT" + }, + { + "entity_name": "final answer", + "entity_type": "CONCEPT", + "description": "the final answer is the result formulated by experts after analyzing the content within the information patches", + "source_ids": [ + 43 + ], + "id": "Name: final answer\nType: CONCEPT" + }, + { + "entity_name": "problem", + "entity_type": "TASK_OR_PROBLEM", + "description": "a specific problem is the target issue that experts are seeking to solve within the handbook", + "source_ids": [ + 43 + ], + "id": "Name: problem\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "diverse content", + "entity_type": "CONCEPT", + "description": "diverse content refers to the varied information found within the information patches that experts analyze", + "source_ids": [ + 43 + ], + "id": "Name: diverse content\nType: CONCEPT" + }, + { + "entity_name": "precise knowledge", + "entity_type": "CONCEPT", + "description": "precise knowledge is the specific information extracted from the diverse content to help formulate the answer", + "source_ids": [ + 43 + ], + "id": "Name: precise knowledge\nType: CONCEPT" + }, + { + "entity_name": "3.3 rag workflow", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Preliminaries' within the BookRAG paper, this section details the general operational workflow of Retrieval-Augmented Generation (RAG) systems, serving as a foundational context for the proposed hierarchical approach.", + "source_ids": [ + 44 + ], + "id": "Name: 3.3 rag workflow\nType: SECTION_TITLE" + }, + { + "entity_name": "retrieval augmented generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "retrieval augmented generation is a system framework described as operating in a two phase process", + "source_ids": [ + 45 + ], + "id": "Name: retrieval augmented generation\nType: 
TASK_OR_PROBLEM" + }, + { + "entity_name": "offline indexing phase", + "entity_type": "TASK_OR_PROBLEM", + "description": "offline indexing phase is the first phase where unstructured corpus data is organized into a structured index", + "source_ids": [ + 45 + ], + "id": "Name: offline indexing phase\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "online retrieval phase", + "entity_type": "TASK_OR_PROBLEM", + "description": "online retrieval phase is the second phase where the system retrieves relevant components based on a user query", + "source_ids": [ + 45 + ], + "id": "Name: online retrieval phase\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "vector databases", + "entity_type": "SOFTWARE", + "description": "vector databases are mentioned as a possible form of structured index in the offline indexing phase", + "source_ids": [ + 45 + ], + "id": "Name: vector databases\nType: SOFTWARE" + }, + { + "entity_name": "llm", + "entity_type": "SOFTWARE", + "description": "LLM is a software component that functions as a Large Language Model used to generate output informed by retrieved components during the online retrieval phase. 
It serves as a tool to extract entities and relations when processing text-only nodes and is also employed to select the canonical entity when multiple aliases are identified.", + "source_ids": [ + 75, + 45, + 63 + ], + "id": "Name: llm\nType: SOFTWARE" + }, + { + "entity_name": "document s native tree topology", + "entity_type": "TASK_OR_PROBLEM", + "description": "document s native tree topology is the logical structure that the proposed approach seeks to integrate with retrieval structures", + "source_ids": [ + 45 + ], + "id": "Name: document s native tree topology\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "unstructured corpus data", + "entity_type": "DATASET_OR_CORPUS", + "description": "unstructured corpus data is the input material organized into a structured index during the offline indexing phase", + "source_ids": [ + 45 + ], + "id": "Name: unstructured corpus data\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "text chunks", + "entity_type": "DATASET_OR_CORPUS", + "description": "text chunks are examples of relevant components retrieved during the online retrieval phase", + "source_ids": [ + 45 + ], + "id": "Name: text chunks\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "subgraphs", + "entity_type": "DATASET_OR_CORPUS", + "description": "subgraphs are examples of relevant components retrieved during the online retrieval phase", + "source_ids": [ + 45 + ], + "id": "Name: subgraphs\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "document", + "entity_type": "TASK_OR_PROBLEM", + "description": "The document serves as the source of the original logical hierarchy and native tree topology referenced in the text, and it is the text being processed by the select by entity and select by section methods.", + "source_ids": [ + 104, + 45 + ], + "id": "Name: document\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "4 bookindex", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of the main paper 'BookRAG: A Hierarchical 
Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section details the novel index structure named BookIndex. It explains how the approach extracts a hierarchical tree from documents to serve as a table of contents, utilizes graphs to capture entity relationships, and maps entities to tree nodes.", + "source_ids": [ + 46 + ], + "id": "Name: 4 bookindex\nType: SECTION_TITLE" + }, + { + "entity_name": "bookindex", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "Bookindex is a novel hierarchical structure-aware index introduced in this work that builds a tree from documents to act as a table of contents and uses graphs to capture intricate relationships between entities. Serving as the core subject of section 4.1, it is designed to capture explicit logical hierarchy and intricate entity relations within complex documents.", + "source_ids": [ + 50, + 46 + ], + "id": "Name: bookindex\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "hierarchical tree", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The structural method used within BookIndex to organize document content from different granularity levels, serving the role of a table of contents.", + "source_ids": [ + 46 + ], + "id": "Name: hierarchical tree\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "graph", + "entity_type": "TECHNOLOGY", + "description": "The data structure employed by BookIndex to capture and represent the intricate relationships between entities within the document hierarchy.", + "source_ids": [ + 46 + ], + "id": "Name: graph\nType: TECHNOLOGY" + }, + { + "entity_name": "tree construction", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Tree construction is the first stage of a two-stage process used to parse document layout and establish hierarchical nodes categorized by type. 
This specific sequential process, detailed in section 4.2, serves as the initial phase of the BookIndex construction process, where document layout is analyzed to create these hierarchical structures.", + "source_ids": [ + 50, + 53, + 47 + ], + "id": "Name: tree construction\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "graph construction", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Graph construction is the second stage of the BookIndex process, following tree construction, which focuses on extracting fine-grained entity knowledge from tree nodes and refining it through gradient-based entity resolution as detailed in section 4.1.", + "source_ids": [ + 50, + 61, + 47 + ], + "id": "Name: graph construction\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "logical hierarchy", + "entity_type": "CONCEPT", + "description": "Logical hierarchy is the explicit structure within complex documents that the bookindex is designed to capture, serving as the foundational framework that grounds semantic entities.", + "source_ids": [ + 52, + 47 + ], + "id": "Name: logical hierarchy\nType: CONCEPT" + }, + { + "entity_name": "hierarchical nodes", + "entity_type": "CONCEPT", + "description": "hierarchical nodes are the categorized units established by the tree construction process", + "source_ids": [ + 47 + ], + "id": "Name: hierarchical nodes\nType: CONCEPT" + }, + { + "entity_name": "fine grained entity knowledge", + "entity_type": "CONCEPT", + "description": "fine grained entity knowledge is the detailed information extracted from tree nodes during the graph construction process", + "source_ids": [ + 47 + ], + "id": "Name: fine grained entity knowledge\nType: CONCEPT" + }, + { + "entity_name": "figure 2", + "entity_type": "IMAGE", + "description": "Figure 2 is an image referenced in the text, serving as a visual representation to illustrate the bookindex construction process, provide an example of the bookindex, depict the layout parsing phase, and show 
the processing of a new entity in a knowledge graph, with its title being the subject of a question.", + "source_ids": [ + 76, + 48, + 52, + 245, + 59 + ], + "id": "Name: figure 2\nType: IMAGE" + }, + { + "entity_name": "bookindex construction process", + "entity_type": "TASK_OR_PROBLEM", + "description": "the bookindex construction process is a phase involving tree construction and graph construction", + "source_ids": [ + 48 + ], + "id": "Name: bookindex construction process\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "tree construction", + "entity_type": "TASK_OR_PROBLEM", + "description": "Tree construction is a component of the bookindex construction process derived from layout parsing and section filtering, representing the top section of the diagram that details the initial phase of building the index from document layouts.", + "source_ids": [ + 48, + 49 + ], + "id": "Name: tree construction\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "layout parsing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Layout parsing is a method used to derive tree construction by extracting visual elements such as tables, text, titles, and images from a document layout. 
While it identifies blocks as titles, it does not assign their hierarchical level.", + "source_ids": [ + 48, + 49, + 57 + ], + "id": "Name: layout parsing\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "section filtering", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Section filtering is a method used in tree construction, specifically serving as Step 2 in the process where parsed sections are filtered based on title properties such as FontSize and content type, distinguishing between sections and text.", + "source_ids": [ + 48, + 49 + ], + "id": "Name: section filtering\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "graph construction", + "entity_type": "TASK_OR_PROBLEM", + "description": "Graph construction is a component of the bookindex construction process that involves knowledge graph construction and gradient-based entity resolution, as illustrated in the bottom section of the diagram detailing the creation of a knowledge graph for this purpose.", + "source_ids": [ + 48, + 49 + ], + "id": "Name: graph construction\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "kg construction", + "entity_type": "TASK_OR_PROBLEM", + "description": "kg construction is a step involved in graph construction", + "source_ids": [ + 48 + ], + "id": "Name: kg construction\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "gradient based entity resolution", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Gradient based entity resolution is a method involved in graph construction and represents the specific technique or approach described for the algorithm.", + "source_ids": [ + 48, + 69 + ], + "id": "Name: gradient based entity resolution\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "bookindex construction", + "entity_type": "IMAGE", + "description": "A diagram illustrating the process of constructing a book index, divided into Tree Construction and Graph Construction phases.", + "source_ids": [ + 49 + ], + "id": "Name: bookindex 
construction\nType: IMAGE" + }, + { + "entity_name": "title: method", + "entity_type": "SECTION_TITLE", + "description": "A specific text label identified during parsing with FontSize 14.", + "source_ids": [ + 49 + ], + "id": "Name: title: method\nType: SECTION_TITLE" + }, + { + "entity_name": "title: experiment", + "entity_type": "SECTION_TITLE", + "description": "A specific text label identified during parsing with FontSize 14.", + "source_ids": [ + 49 + ], + "id": "Name: title: experiment\nType: SECTION_TITLE" + }, + { + "entity_name": "title: moe layer", + "entity_type": "SECTION_TITLE", + "description": "A specific text label identified during parsing with FontSize 20.", + "source_ids": [ + 49 + ], + "id": "Name: title: moe layer\nType: SECTION_TITLE" + }, + { + "entity_name": "level: 2 type: section", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "An attribute indicating that 'Method' and 'Experiment' titles are classified as Level 2 Sections.", + "source_ids": [ + 49 + ], + "id": "Name: level: 2 type: section\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "level: none type: text", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "An attribute associated with 'MOE Layer', marked with a red cross, indicating it was rejected or not treated as a section.", + "source_ids": [ + 49 + ], + "id": "Name: level: none type: text\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "tree node", + "entity_type": "HARDWARE", + "description": "Visual element representing nodes in the tree structure shown in the legend and the resulting BookIndex.", + "source_ids": [ + 49 + ], + "id": "Name: tree node\nType: HARDWARE" + }, + { + "entity_name": "gt-link", + "entity_type": "SOFTWARE", + "description": "Legend item representing Ground Truth links between entities in the diagram.", + "source_ids": [ + 49 + ], + "id": "Name: gt-link\nType: SOFTWARE" + }, + { + "entity_name": "relation", + "entity_type": "DATASET_OR_CORPUS", + "description": 
"Legend item representing relationships between entities.", + "source_ids": [ + 49 + ], + "id": "Name: relation\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "kg construction", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Step 1 in Graph Construction showing the generation of a Knowledge Graph from Tree Nodes.", + "source_ids": [ + 49 + ], + "id": "Name: kg construction\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "gradient-based entity resolution", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Gradient-based entity resolution is a novel algorithmic method used to refine extracted entity knowledge by resolving ambiguities or duplicates through gradient optimization. As a specific technique mentioned in the title, it plays a crucial role in resolving entity fragmentation by utilizing gradient-based approaches to refine raw Knowledge Graphs. This method is implemented as Step 2 in the Graph Construction process, where it facilitates similarity matching and the merging of entities to ensure data accuracy and coherence.", + "source_ids": [ + 65, + 61, + 49 + ], + "id": "Name: gradient-based entity resolution\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "similarity", + "entity_type": "EVALUATION_METRIC", + "description": "Y-axis label of the chart in the Gradient-based Entity Resolution step.", + "source_ids": [ + 49 + ], + "id": "Name: similarity\nType: EVALUATION_METRIC" + }, + { + "entity_name": "entity", + "entity_type": "DATASET_OR_CORPUS", + "description": "X-axis label of the chart in the Gradient-based Entity Resolution step.", + "source_ids": [ + 49 + ], + "id": "Name: entity\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "merge", + "entity_type": "TASK_OR_PROBLEM", + "description": "Action performed to combine similar entities (e.g., e2 and e9) into a single resolved entity.", + "source_ids": [ + 49 + ], + "id": "Name: merge\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "image cref='#/texts/52'", + 
"entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 49 + ], + "id": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "entity_name": "4.1 overview of bookindex", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'BOOKINDEX', this section provides a high-level introduction to the proposed BookIndex, defining its hierarchical structure-aware nature and outlining its two-stage construction process (Tree Construction and Graph Construction) for capturing logical hierarchies and entity relations in complex documents.", + "source_ids": [ + 50 + ], + "id": "Name: 4.1 overview of bookindex\nType: SECTION_TITLE" + }, + { + "entity_name": "tree structure", + "entity_type": "TASK_OR_PROBLEM", + "description": "tree structure represents the set of nodes derived from the document s explicit logical hierarchy", + "source_ids": [ + 51 + ], + "id": "Name: tree structure\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "knowledge graph", + "entity_type": "SOFTWARE", + "description": "knowledge graph is a structure that captures fine grained entities and their relations within the document", + "source_ids": [ + 51 + ], + "id": "Name: knowledge graph\nType: SOFTWARE" + }, + { + "entity_name": "graph tree link", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "graph tree link gt link is a mechanism that links entities to specific tree nodes from which they were extracted", + "source_ids": [ + 51 + ], + "id": "Name: graph tree link\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "titles", + "entity_type": "SECTION_TITLE", + "description": "titles are examples of nodes in the document s explicit logical hierarchy", + "source_ids": [ + 51 + ], + "id": "Name: titles\nType: SECTION_TITLE" + }, + { + "entity_name": "sections", + "entity_type": "SECTION_TITLE", + "description": "sections are examples of nodes in the document s explicit logical hierarchy", + "source_ids": [ + 51 + ], + "id": "Name: sections\nType: SECTION_TITLE" + }, 
+ { + "entity_name": "e t", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "e t denotes the nesting relationships in the tree structure and represents the set of edges in the tree.", + "source_ids": [ + 51, + 102 + ], + "id": "Name: e t\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "v", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "v represents the set of entities in the graph g, which correspond to the fine-grained entities within the knowledge graph.", + "source_ids": [ + 51, + 77 + ], + "id": "Name: v\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "e g", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "e g denotes the relations in the knowledge graph", + "source_ids": [ + 51 + ], + "id": "Name: e g\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "m v", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "m v is the graph tree link function linking entities to tree nodes", + "source_ids": [ + 51 + ], + "id": "Name: m v\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "", + "entity_type": "UNKNOWN", + "description": "The entity's description is empty, containing no information to synthesize.", + "source_ids": [ + 73, + 109, + 112, + 51, + 57, + 122 + ], + "id": "Name: \nType: UNKNOWN" + }, + { + "entity_name": "navigation", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 51 + ], + "id": "Name: navigation\nType: UNKNOWN" + }, + { + "entity_name": "tree component", + "entity_type": "SOFTWARE", + "description": "the tree component is a part of the bookindex that organizes documents into a hierarchical structure", + "source_ids": [ + 52 + ], + "id": "Name: tree component\nType: SOFTWARE" + }, + { + "entity_name": "graph component", + "entity_type": "SOFTWARE", + "description": "the graph component is a part of the bookindex composed of entities and relations extracted from document nodes", + "source_ids": [ + 52 + ], + "id": "Name: graph component\nType: SOFTWARE" + }, + 
{ + "entity_name": "gt link", + "entity_type": "TECHNOLOGY", + "description": "GT Link is a technology feature illustrated by blue dotted lines that connects entities to their corresponding tree nodes. It serves as a formalized mechanism, denoted as M, used to complete the book index and to link sections to entities.", + "source_ids": [ + 104, + 52, + 77 + ], + "id": "Name: gt link\nType: TECHNOLOGY" + }, + { + "entity_name": "text", + "entity_type": "PRODUCT", + "description": "Text is a type of content block that serves as a leaf node within the document structure and is identified as a final node type retained within the nodes of the tree.", + "source_ids": [ + 58, + 52 + ], + "id": "Name: text\nType: PRODUCT" + }, + { + "entity_name": "tables", + "entity_type": "PRODUCT", + "description": "tables are a type of content block serving as a leaf node within the document structure", + "source_ids": [ + 52 + ], + "id": "Name: tables\nType: PRODUCT" + }, + { + "entity_name": "images", + "entity_type": "PRODUCT", + "description": "images are a type of content block serving as a leaf node within the document structure", + "source_ids": [ + 52 + ], + "id": "Name: images\nType: PRODUCT" + }, + { + "entity_name": "section nodes", + "entity_type": "PRODUCT", + "description": "section nodes are hierarchical nodes within the document structure that contain content blocks", + "source_ids": [ + 52 + ], + "id": "Name: section nodes\nType: PRODUCT" + }, + { + "entity_name": "content blocks", + "entity_type": "PRODUCT", + "description": "content blocks are the items text tables images that serve as leaf nodes in the hierarchy", + "source_ids": [ + 52 + ], + "id": "Name: content blocks\nType: PRODUCT" + }, + { + "entity_name": "leaf nodes", + "entity_type": "PRODUCT", + "description": "leaf nodes are the terminal elements in the hierarchical structure containing content blocks", + "source_ids": [ + 52 + ], + "id": "Name: leaf nodes\nType: PRODUCT" + }, + { + "entity_name": 
"semantic entities", + "entity_type": "CONCEPT", + "description": "semantic entities are the extracted entities grounded within the document s logical hierarchy by gt link", + "source_ids": [ + 52 + ], + "id": "Name: semantic entities\nType: CONCEPT" + }, + { + "entity_name": "4.2 tree construction", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'BookIndex' and the first stage of its construction process, this section details the method for parsing document layouts to establish hierarchical nodes categorized by type.", + "source_ids": [ + 53 + ], + "id": "Name: 4.2 tree construction\nType: SECTION_TITLE" + }, + { + "entity_name": "t", + "entity_type": "TASK_OR_PROBLEM", + "description": "t is a structured hierarchical tree that is the result of transforming a raw document", + "source_ids": [ + 54 + ], + "id": "Name: t\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "raw document", + "entity_type": "PRODUCT", + "description": "raw document is the initial input that undergoes transformation into a structured hierarchical tree", + "source_ids": [ + 54 + ], + "id": "Name: raw document\nType: PRODUCT" + }, + { + "entity_name": "robust layout parsing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "robust layout parsing is a key step involved in transforming the raw document into a structured hierarchical tree", + "source_ids": [ + 54 + ], + "id": "Name: robust layout parsing\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "intelligent section filtering", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "intelligent section filtering is a key step involved in transforming the raw document into a structured hierarchical tree", + "source_ids": [ + 54 + ], + "id": "Name: intelligent section filtering\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "task or problem", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 54 + ], + "id": "Name: task or problem\nType: UNKNOWN" + }, + { + "entity_name": 
"4.2.1 layout parsing", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Tree Construction' within the 'BOOKINDEX' chapter, this section details the initial phase of transforming raw documents into structured hierarchical trees using layout analysis and recognition models to identify and organize diverse content blocks.", + "source_ids": [ + 55 + ], + "id": "Name: 4.2.1 layout parsing\nType: SECTION_TITLE" + }, + { + "entity_name": "layout analysis", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A specific technique employed in section 4.2.1 to understand the spatial arrangement of elements within document pages.", + "source_ids": [ + 55 + ], + "id": "Name: layout analysis\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "recognition models", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "The computational models utilized in section 4.2.1 to recognize and classify different types of content blocks such as text, tables, and images.", + "source_ids": [ + 55 + ], + "id": "Name: recognition models\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "document d", + "entity_type": "TASK_OR_PROBLEM", + "description": "The input data object (a collection of pages) that serves as the target for processing in section 4.2.1.", + "source_ids": [ + 55 + ], + "id": "Name: document d\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "the output", + "entity_type": "TASK_OR_PROBLEM", + "description": "the output is described as a sequence of primitive", + "source_ids": [ + 56 + ], + "id": "Name: the output\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "primitive", + "entity_type": "CONCEPT", + "description": "primitive is a term used to describe the components of the output sequence", + "source_ids": [ + 56 + ], + "id": "Name: primitive\nType: CONCEPT" + }, + { + "entity_name": "section filtering", + "entity_type": "TASK_OR_PROBLEM", + "description": "section filtering is a phase that processes an initial sequence to 
identify a document s logically hierarchical structure", + "source_ids": [ + 57 + ], + "id": "Name: section filtering\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "title", + "entity_type": "SECTION_TITLE", + "description": "title refers to blocks identified by layout parsing that require hierarchical level assignment", + "source_ids": [ + 57 + ], + "id": "Name: title\nType: SECTION_TITLE" + }, + { + "entity_name": "text", + "entity_type": "SECTION_TITLE", + "description": "text is a node type used to re classify erroneous title blocks such as descriptive text within images", + "source_ids": [ + 57 + ], + "id": "Name: text\nType: SECTION_TITLE" + }, + { + "entity_name": "image", + "entity_type": "IMAGE", + "description": "An image is a type of block identified during the layout parsing phase, serving as a specific node type that indicates the presence of visual elements requiring VLM-based extraction. It can also refer to a location within a document where descriptive text might be erroneously parsed as a title, and it functions as a filter type used for visual elements.", + "source_ids": [ + 57, + 258, + 59, + 63 + ], + "id": "Name: image\nType: IMAGE" + }, + { + "entity_name": "table", + "entity_type": "TABLE", + "description": "A table is a document element that can refer to borderless table headers, which might be erroneously parsed as a title, or it can serve as a filter type used for tabular data.", + "source_ids": [ + 57, + 258 + ], + "id": "Name: table\nType: TABLE" + }, + { + "entity_name": "b", + "entity_type": "DATASET_OR_CORPUS", + "description": "b represents the candidate subset of blocks selected for llm based analysis", + "source_ids": [ + 57 + ], + "id": "Name: b\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "c", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "c represents the content of the candidates analyzed by the LLM and also denotes explicit constraints such as modal types and page ranges generated during a plan.", + 
"source_ids": [ + 57, + 102 + ], + "id": "Name: c\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "f", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "f represents the layout features of the candidates analyzed by the llm", + "source_ids": [ + 57 + ], + "id": "Name: f\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "4 2 2", + "entity_type": "SECTION_TITLE", + "description": "4 2 2 is the section identifier for the section filtering phase", + "source_ids": [ + 57 + ], + "id": "Name: 4 2 2\nType: SECTION_TITLE" + }, + { + "entity_name": "b title", + "entity_type": "DATASET_OR_CORPUS", + "description": "b title is a candidate subset of blocks where the type is title selected for llm based analysis", + "source_ids": [ + 57 + ], + "id": "Name: b title\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "l", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "l represents the actual hierarchical level of a block ranging from 1 to infinity", + "source_ids": [ + 57 + ], + "id": "Name: l\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "none", + "entity_type": "SECTION_TITLE", + "description": "none is a value indicating that a block has no hierarchical level", + "source_ids": [ + 57 + ], + "id": "Name: none\nType: SECTION_TITLE" + }, + { + "entity_name": "tree", + "entity_type": "TASK_OR_PROBLEM", + "description": "A tree is a definitive structure constructed from blocks consisting of nodes and edges that represent content and relationships, serving as the data structure on which operators operate.", + "source_ids": [ + 58, + 102 + ], + "id": "Name: tree\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "node set", + "entity_type": "TASK_OR_PROBLEM", + "description": "the node set is composed of all blocks from the filtering and re classification process retaining content and final node types", + "source_ids": [ + 58 + ], + "id": "Name: node set\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "edge set", + "entity_type": 
"TASK_OR_PROBLEM", + "description": "the edge set represents the parent child nesting relationships within the tree structure", + "source_ids": [ + 58 + ], + "id": "Name: edge set\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "section", + "entity_type": "PRODUCT", + "description": "section is identified as a final node type retained within the nodes of the tree", + "source_ids": [ + 58 + ], + "id": "Name: section\nType: PRODUCT" + }, + { + "entity_name": "table", + "entity_type": "PRODUCT", + "description": "A table is a specific item type being filtered for in the document and is identified as a final node type retained within the nodes of the tree. It is a specific logical type mentioned in the text that requires the preservation of structural semantics.", + "source_ids": [ + 64, + 58, + 251 + ], + "id": "Name: table\nType: PRODUCT" + }, + { + "entity_name": "image", + "entity_type": "PRODUCT", + "description": "image is identified as a final node type retained within the nodes of the tree", + "source_ids": [ + 58 + ], + "id": "Name: image\nType: PRODUCT" + }, + { + "entity_name": "filtering", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "filtering is a process mentioned as part of the generation of blocks for the node set", + "source_ids": [ + 58 + ], + "id": "Name: filtering\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "re classification", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "re classification is a process mentioned alongside filtering in the creation of the node set", + "source_ids": [ + 58 + ], + "id": "Name: re classification\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "hierarchical levels", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "hierarchical levels are determined values used to infer parent child relationships for section nodes", + "source_ids": [ + 58 + ], + "id": "Name: hierarchical levels\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "document order", + "entity_type": 
"PARAMETER_OR_VARIABLE", + "description": "Document order is a sequential arrangement and factor used to assemble nodes into the final tree structure.", + "source_ids": [ + 58, + 59 + ], + "id": "Name: document order\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "parent child nesting relationships", + "entity_type": "TASK_OR_PROBLEM", + "description": "parent child nesting relationships are the specific connections established by the edge set", + "source_ids": [ + 58 + ], + "id": "Name: parent child nesting relationships\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "content", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "content is an attribute retained by each node in the node set", + "source_ids": [ + 58 + ], + "id": "Name: content\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "final node type", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "final node type is an attribute retained by each node in the node set", + "source_ids": [ + 58 + ], + "id": "Name: final node type\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "node", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 58 + ], + "id": "Name: node\nType: UNKNOWN" + }, + { + "entity_name": "layout parsing phase", + "entity_type": "TASK_OR_PROBLEM", + "description": "layout parsing phase is a process that identifies diverse blocks in a document", + "source_ids": [ + 59 + ], + "id": "Name: layout parsing phase\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "title text table", + "entity_type": "PRODUCT", + "description": "title text table is a type of block identified during the layout parsing phase", + "source_ids": [ + 59 + ], + "id": "Name: title text table\nType: PRODUCT" + }, + { + "entity_name": "section filtering phase", + "entity_type": "TASK_OR_PROBLEM", + "description": "section filtering phase is a process where title candidates are analyzed by the llm", + "source_ids": [ + 59 + ], + "id": "Name: section filtering 
phase\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "method", + "entity_type": "SECTION_TITLE", + "description": "method is a title candidate analyzed during the section filtering phase", + "source_ids": [ + 59 + ], + "id": "Name: method\nType: SECTION_TITLE" + }, + { + "entity_name": "experiment", + "entity_type": "SECTION_TITLE", + "description": "experiment is a title candidate analyzed during the section filtering phase", + "source_ids": [ + 59 + ], + "id": "Name: experiment\nType: SECTION_TITLE" + }, + { + "entity_name": "moe layer", + "entity_type": "SECTION_TITLE", + "description": "moe layer is a title candidate that was erroneously tagged as a title but re classified as a text node", + "source_ids": [ + 59 + ], + "id": "Name: moe layer\nType: SECTION_TITLE" + }, + { + "entity_name": "section nodes", + "entity_type": "SECTION_TITLE", + "description": "section nodes are blocks identified as having a specific level in the document hierarchy", + "source_ids": [ + 59 + ], + "id": "Name: section nodes\nType: SECTION_TITLE" + }, + { + "entity_name": "text node", + "entity_type": "SECTION_TITLE", + "description": "text node is a classification for blocks that do not have a specific level in the document hierarchy", + "source_ids": [ + 59 + ], + "id": "Name: text node\nType: SECTION_TITLE" + }, + { + "entity_name": "final tree structure", + "entity_type": "TASK_OR_PROBLEM", + "description": "final tree structure is the result of assembling filtered and classified nodes based on their levels and order", + "source_ids": [ + 59 + ], + "id": "Name: final tree structure\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "fontsize", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "fontsize is a parameter used to describe the size of text blocks such as 14 or 20", + "source_ids": [ + 59 + ], + "id": "Name: fontsize\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "14", + "entity_type": "MEASUREMENT", + "description": "14 is a numerical value mentioned 
in the text, potentially representing a measurement or count, and serves as the specific font size value associated with the method and experiment blocks.", + "source_ids": [ + 59, + 219 + ], + "id": "Name: 14\nType: MEASUREMENT" + }, + { + "entity_name": "20", + "entity_type": "MEASUREMENT", + "description": "20 is the specific font size value associated with the moe layer block", + "source_ids": [ + 59 + ], + "id": "Name: 20\nType: MEASUREMENT" + }, + { + "entity_name": "level", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "level is a parameter used to define the hierarchy depth of document nodes", + "source_ids": [ + 59 + ], + "id": "Name: level\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "2", + "entity_type": "MEASUREMENT", + "description": "The value 2 serves multiple roles across different contexts: it is the specific level assigned to method and experiment blocks, represents the speedup factor achieved by BookRag compared to Docetl in query latency, and denotes both the issue number and the volume number of a publication.", + "source_ids": [ + 160, + 199, + 59, + 191 + ], + "id": "Name: 2\nType: MEASUREMENT" + }, + { + "entity_name": "none", + "entity_type": "MEASUREMENT", + "description": "none is the specific level value assigned to the moe layer block indicating no hierarchy level", + "source_ids": [ + 59 + ], + "id": "Name: none\nType: MEASUREMENT" + }, + { + "entity_name": "4", + "entity_type": "MEASUREMENT", + "description": "4 is a numerical value mentioned in the text though its specific context or unit is not provided", + "source_ids": [ + 60 + ], + "id": "Name: 4\nType: MEASUREMENT" + }, + { + "entity_name": "4.3 graph construction", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'BOOKINDEX' and the second stage of the proposed BookIndex construction process, this section details the method for extracting fine-grained entity knowledge from hierarchical tree nodes and refining it using a novel 
gradient-based entity resolution technique.", + "source_ids": [ + 61 + ], + "id": "Name: 4.3 graph construction\nType: SECTION_TITLE" + }, + { + "entity_name": "tree t", + "entity_type": "TASK_OR_PROBLEM", + "description": "tree t is a structure that is established before proceeding to the next step", + "source_ids": [ + 62 + ], + "id": "Name: tree t\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "knowledge graph g", + "entity_type": "TASK_OR_PROBLEM", + "description": "knowledge graph g is a structure that is populated by extracting and refining entities from the tree nodes", + "source_ids": [ + 62 + ], + "id": "Name: knowledge graph g\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "tree nodes", + "entity_type": "TASK_OR_PROBLEM", + "description": "tree nodes are the components within tree t from which entities are extracted and refined", + "source_ids": [ + 62 + ], + "id": "Name: tree nodes\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "4.3.1 kg construction", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Graph Construction' within the 'BOOKINDEX' chapter, this section details the specific algorithm for populating the Knowledge Graph by iterating through tree nodes and extracting subgraphs based on content modality (text or visual).", + "source_ids": [ + 63 + ], + "id": "Name: 4.3.1 kg construction\nType: SECTION_TITLE" + }, + { + "entity_name": "tree t", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "The hierarchical structure previously established that serves as the source of nodes to be processed for graph construction.", + "source_ids": [ + 63 + ], + "id": "Name: tree t\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "vision language model", + "entity_type": "SOFTWARE", + "description": "VLM employed specifically to extract visual knowledge from nodes containing image elements.", + "source_ids": [ + 63 + ], + "id": "Name: vision language model\nType: SOFTWARE" + }, + { + "entity_name": "mapping m", + 
"entity_type": "EQUATION_OR_FORMULA", + "description": "Mapping m is the final aggregation process defined as v to p n, which links entities to structural locations by constructing a mapping structure that records the origin tree node for every extracted entity.", + "source_ids": [ + 77, + 63 + ], + "id": "Name: mapping m\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "formula", + "entity_type": "PRODUCT", + "description": "formula is a specific logical type mentioned in the text that requires preservation of structural semantics", + "source_ids": [ + 64 + ], + "id": "Name: formula\nType: PRODUCT" + }, + { + "entity_name": "v table", + "entity_type": "PRODUCT", + "description": "v table is a distinct typed entity representing the table itself created to preserve structural semantics", + "source_ids": [ + 64 + ], + "id": "Name: v table\nType: PRODUCT" + }, + { + "entity_name": "row", + "entity_type": "PRODUCT", + "description": "row is a component of table nodes that is explicitly extracted as a distinct entity", + "source_ids": [ + 64 + ], + "id": "Name: row\nType: PRODUCT" + }, + { + "entity_name": "column", + "entity_type": "PRODUCT", + "description": "column is a component of table nodes that is explicitly extracted as a distinct entity", + "source_ids": [ + 64 + ], + "id": "Name: column\nType: PRODUCT" + }, + { + "entity_name": "header", + "entity_type": "PRODUCT", + "description": "header refers to row and column headers in table nodes that are explicitly extracted as distinct entities", + "source_ids": [ + 64 + ], + "id": "Name: header\nType: PRODUCT" + }, + { + "entity_name": "structural semantics", + "entity_type": "CONCEPT", + "description": "structural semantics refers to the meaning preserved for specific logical types in the described process", + "source_ids": [ + 64 + ], + "id": "Name: structural semantics\nType: CONCEPT" + }, + { + "entity_name": "logical types", + "entity_type": "CONCEPT", + "description": "logical types are categories of 
entities such as table and formula that require specific handling", + "source_ids": [ + 64 + ], + "id": "Name: logical types\nType: CONCEPT" + }, + { + "entity_name": "node", + "entity_type": "CONCEPT", + "description": "node refers to a specific point in the data structure where content is extracted", + "source_ids": [ + 64 + ], + "id": "Name: node\nType: CONCEPT" + }, + { + "entity_name": "vertex", + "entity_type": "CONCEPT", + "description": "vertex refers to the primary node v table to which other entities are linked", + "source_ids": [ + 64 + ], + "id": "Name: vertex\nType: CONCEPT" + }, + { + "entity_name": "containedin", + "entity_type": "RELATIONSHIP_TYPE", + "description": "containedin is the specific relationship type used to link row and column headers to the table entity", + "source_ids": [ + 64 + ], + "id": "Name: containedin\nType: RELATIONSHIP_TYPE" + }, + { + "entity_name": "4.3.2 gradient-based entity resolution", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Graph Construction' within the 'BOOKINDEX' chapter, this section details a robust Entity Resolution (ER) process designed to identify and merge fragmented conceptual entities in a Knowledge Graph, addressing challenges like abbreviations and co-references.", + "source_ids": [ + 65 + ], + "id": "Name: 4.3.2 gradient-based entity resolution\nType: SECTION_TITLE" + }, + { + "entity_name": "entity resolution", + "entity_type": "TASK_OR_PROBLEM", + "description": "Entity resolution is the task or problem addressed by the prompt in figure 13, serving as a process during which GT Link is refined by merging entities into canonical entities. 
It addresses the core challenge of identifying and merging fragmented entities caused by abbreviations, co-references, or varied occurrences to ensure a well-constructed Knowledge Graph.", + "source_ids": [ + 65, + 284, + 77 + ], + "id": "Name: entity resolution\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "er methods", + "entity_type": "TASK_OR_PROBLEM", + "description": "er methods are conventional methods for entity resolution that are computationally expensive", + "source_ids": [ + 66 + ], + "id": "Name: er methods\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "dirty er", + "entity_type": "TASK_OR_PROBLEM", + "description": "dirty er is a term used to describe batch processing across multiple data sources for entity resolution", + "source_ids": [ + 66 + ], + "id": "Name: dirty er\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "a", + "entity_type": "TASK_OR_PROBLEM", + "description": "a is an example entity used to illustrate the merging of multiple entities in the entity resolution process", + "source_ids": [ + 66 + ], + "id": "Name: a\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "b", + "entity_type": "TASK_OR_PROBLEM", + "description": "b is an example entity used to illustrate the merging of multiple entities in the entity resolution process", + "source_ids": [ + 66 + ], + "id": "Name: b\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "c", + "entity_type": "TASK_OR_PROBLEM", + "description": "c is an example entity used to illustrate the merging of multiple entities in the entity resolution process", + "source_ids": [ + 66 + ], + "id": "Name: c\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "12", + "entity_type": "PUBLICATION_VENUE", + "description": "12 is a citation reference mentioned in the text regarding entity resolution methods", + "source_ids": [ + 66 + ], + "id": "Name: 12\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "a b", + "entity_type": "TASK_OR_PROBLEM", + "description": "a b is a specific pairwise comparison example 
between entities a and b", + "source_ids": [ + 66 + ], + "id": "Name: a b\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "a c", + "entity_type": "TASK_OR_PROBLEM", + "description": "a c is a specific pairwise comparison example between entities a and c", + "source_ids": [ + 66 + ], + "id": "Name: a c\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "b c", + "entity_type": "TASK_OR_PROBLEM", + "description": "b c is a specific pairwise comparison example between entities b and c", + "source_ids": [ + 66 + ], + "id": "Name: b c\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "o n 2", + "entity_type": "MEASUREMENT", + "description": "o n 2 represents the quadratic complexity of the number of pairwise comparisons required", + "source_ids": [ + 66 + ], + "id": "Name: o n 2\nType: MEASUREMENT" + }, + { + "entity_name": "gradient based er method", + "entity_type": "TECHNOLOGY", + "description": "a gradient based entity resolution method employed to process a single document incrementally", + "source_ids": [ + 67 + ], + "id": "Name: gradient based er method\nType: TECHNOLOGY" + }, + { + "entity_name": "clean er", + "entity_type": "TASK_OR_PROBLEM", + "description": "a simplified version of the entity resolution task used as the basis for the incremental process", + "source_ids": [ + 67 + ], + "id": "Name: clean er\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "database", + "entity_type": "SOFTWARE", + "description": "a storage system containing already processed entities against which new entities are compared", + "source_ids": [ + 67 + ], + "id": "Name: database\nType: SOFTWARE" + }, + { + "entity_name": "top k most relevant candidates", + "entity_type": "EVALUATION_METRIC", + "description": "a set of the most relevant entities used for reranking a new entity in the incremental process", + "source_ids": [ + 67 + ], + "id": "Name: top k most relevant candidates\nType: EVALUATION_METRIC" + }, + { + "entity_name": "entity", + "entity_type": "PARAMETER_OR_VARIABLE", 
+ "description": "a single new entity being extracted in the incremental process", + "source_ids": [ + 67 + ], + "id": "Name: entity\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "quadratic batch problem", + "entity_type": "TASK_OR_PROBLEM", + "description": "the original complex problem that the incremental method transforms into a simpler task", + "source_ids": [ + 67 + ], + "id": "Name: quadratic batch problem\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "repeated lookup task", + "entity_type": "TASK_OR_PROBLEM", + "description": "the simplified task resulting from transforming the quadratic batch problem", + "source_ids": [ + 67 + ], + "id": "Name: repeated lookup task\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "scoring patterns", + "entity_type": "EVALUATION_METRIC", + "description": "distinct observable patterns yielded by the incremental process when reranking entities", + "source_ids": [ + 67 + ], + "id": "Name: scoring patterns\nType: EVALUATION_METRIC" + }, + { + "entity_name": "incremental process", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 67 + ], + "id": "Name: incremental process\nType: UNKNOWN" + }, + { + "entity_name": "5", + "entity_type": "MEASUREMENT", + "description": "5 is a numerical value mentioned in the text potentially representing a count score or measurement", + "source_ids": [ + 68 + ], + "id": "Name: 5\nType: MEASUREMENT" + }, + { + "entity_name": "algorithm 1", + "entity_type": "TASK_OR_PROBLEM", + "description": "Algorithm 1 is a gradient-based entity resolution method and the entity resolution process described in the text.", + "source_ids": [ + 75, + 69 + ], + "id": "Name: algorithm 1\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "kg g", + "entity_type": "TASK_OR_PROBLEM", + "description": "kg g is a knowledge graph that serves as the input for the described process", + "source_ids": [ + 70 + ], + "id": "Name: kg g\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "new entity v 
n", + "entity_type": "TASK_OR_PROBLEM", + "description": "new entity v n is a new entity being introduced into the system", + "source_ids": [ + 70 + ], + "id": "Name: new entity v n\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "rerank model r", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "rerank model r is a model used to rerank entities in the process", + "source_ids": [ + 70 + ], + "id": "Name: rerank model r\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "entity vector database db", + "entity_type": "DATASET_OR_CORPUS", + "description": "entity vector database db is a database storing entity vectors", + "source_ids": [ + 70 + ], + "id": "Name: entity vector database db\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "vector search number top k", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "vector search number top k is a parameter defining the number of top results for vector search", + "source_ids": [ + 70 + ], + "id": "Name: vector search number top k\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "threshold of gradient g", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "threshold of gradient g is a threshold value used for gradient calculations", + "source_ids": [ + 70 + ], + "id": "Name: threshold of gradient g\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "kg", + "entity_type": "TASK_OR_PROBLEM", + "description": "kg is the abbreviation for the knowledge graph mentioned as input, referring to the system where entities are processed, compared, and merged.", + "source_ids": [ + 76, + 70 + ], + "id": "Name: kg\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "g", + "entity_type": "TASK_OR_PROBLEM", + "description": "g is a specific instance or variable name for the knowledge graph and serves as a data structure or set that is updated and returned at the end of the process.", + "source_ids": [ + 75, + 70 + ], + "id": "Name: g\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "v", + 
"entity_type": "TASK_OR_PROBLEM", + "description": "v is the variable representing the new entity", + "source_ids": [ + 70 + ], + "id": "Name: v\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "n", + "entity_type": "TASK_OR_PROBLEM", + "description": "In the context of the task or problem labeled n, this identifier serves as a subscript or reference to a new entity denoted as v, while also representing the set of nodes within the document structure.", + "source_ids": [ + 104, + 70 + ], + "id": "Name: n\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "r", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "r is the specific variable name for the rerank model", + "source_ids": [ + 70 + ], + "id": "Name: r\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "db", + "entity_type": "DATASET_OR_CORPUS", + "description": "db is the specific variable name for the entity vector database", + "source_ids": [ + 70 + ], + "id": "Name: db\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "top k", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "top k is the specific variable name for the vector search number", + "source_ids": [ + 70 + ], + "id": "Name: top k\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "g", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "g is a parameter or variable representing the threshold of gradient, specifically used to check for score drops, and it serves as a component within the bookindex data structure, which contains relevant entities and is organized with fields such as b, t, g, and m.", + "source_ids": [ + 88, + 75, + 85, + 70 + ], + "id": "Name: g\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "vector search", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ], + "id": "Name: vector search\nType: UNKNOWN" + }, + { + "entity_name": "db", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ], + "id": "Name: db\nType: UNKNOWN" + }, + { + 
"entity_name": "search", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ], + "id": "Name: search\nType: UNKNOWN" + }, + { + "entity_name": "r", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ], + "id": "Name: r\nType: UNKNOWN" + }, + { + "entity_name": "e", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ], + "id": "Name: e\nType: UNKNOWN" + }, + { + "entity_name": "sort", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ], + "id": "Name: sort\nType: UNKNOWN" + }, + { + "entity_name": "gradient select", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ], + "id": "Name: gradient select\nType: UNKNOWN" + }, + { + "entity_name": "top k", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ], + "id": "Name: top k\nType: UNKNOWN" + }, + { + "entity_name": "v n", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ], + "id": "Name: v n\nType: UNKNOWN" + }, + { + "entity_name": "e c", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ], + "id": "Name: e c\nType: UNKNOWN" + }, + { + "entity_name": "v cn", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ], + "id": "Name: v cn\nType: UNKNOWN" + }, + { + "entity_name": "s", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ], + "id": "Name: s\nType: UNKNOWN" + }, + { + "entity_name": "c", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ], + "id": "Name: c\nType: UNKNOWN" + }, + { + "entity_name": "score", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ], + "id": "Name: score\nType: UNKNOWN" + }, + { + "entity_name": "s 0", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ], + "id": "Name: s 0\nType: UNKNOWN" + }, + { + "entity_name": "sel", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ 
+ 71 + ], + "id": "Name: sel\nType: UNKNOWN" + }, + { + "entity_name": "case a", + "entity_type": "TASK_OR_PROBLEM", + "description": "Case A is a scenario involving a new conceptual entity where the LLM helps differentiate it from a set identified by an algorithm. This situation arises when all candidates pass the gradient check, indicating that the scores lacked discriminative power.", + "source_ids": [ + 72, + 74, + 75 + ], + "id": "Name: case a\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "new entity", + "entity_type": "TASK_OR_PROBLEM", + "description": "A new entity is a conceptual entity, often recently extracted from text or representing a unique concept in a knowledge graph, that requires evaluation for relevance against existing entities. This task involves analyzing the entity's name, type, and description to determine if it can be matched with candidate entities, a process that demands strong evidence to confirm its distinctiveness and validity.", + "source_ids": [ + 72, + 265, + 274, + 262 + ], + "id": "Name: new entity\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "existing entities", + "entity_type": "TASK_OR_PROBLEM", + "description": "existing entities are the set of entities against which the relevance of a new conceptual entity is measured", + "source_ids": [ + 72 + ], + "id": "Name: existing entities\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "relevance scores", + "entity_type": "EVALUATION_METRIC", + "description": "relevance scores are the metrics used to measure the relationship between the new entity and existing entities", + "source_ids": [ + 72 + ], + "id": "Name: relevance scores\nType: EVALUATION_METRIC" + }, + { + "entity_name": "gradient", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "gradient refers to a mathematical pattern or value that is absent in the relevance scores for new entities", + "source_ids": [ + 72 + ], + "id": "Name: gradient\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": 
"discriminative pattern", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "discriminative pattern refers to a distinguishing feature or trend that is not present in the relevance scores for new entities", + "source_ids": [ + 72 + ], + "id": "Name: discriminative pattern\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "case b", + "entity_type": "TASK_OR_PROBLEM", + "description": "Case b refers to a scenario involving an existing entity where an alias is being evaluated for relevance. It is characterized by a sharp decline that the gradient-based ER algorithm is designed to detect, occurring specifically when a gradient is found signaling a sharp score drop.", + "source_ids": [ + 73, + 74, + 75 + ], + "id": "Name: case b\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "reranker", + "entity_type": "TECHNOLOGY", + "description": "the reranker is a system or component described as having inherent discriminative limitations", + "source_ids": [ + 73 + ], + "id": "Name: reranker\nType: TECHNOLOGY" + }, + { + "entity_name": "existing entity", + "entity_type": "TASK_OR_PROBLEM", + "description": "existing entity refers to an entity that is already present in the system being discussed", + "source_ids": [ + 73 + ], + "id": "Name: existing entity\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "alias", + "entity_type": "CONCEPT", + "description": "An alias is a term used to describe an alternative name for an existing entity and is considered a valid form of similarity for entity names alongside direct abbreviations.", + "source_ids": [ + 73, + 267 + ], + "id": "Name: alias\nType: CONCEPT" + }, + { + "entity_name": "scores", + "entity_type": "EVALUATION_METRIC", + "description": "scores are the numerical values indicating the relevance of an alias to a true match", + "source_ids": [ + 73 + ], + "id": "Name: scores\nType: EVALUATION_METRIC" + }, + { + "entity_name": "true match", + "entity_type": "CONCEPT", + "description": "true match refers to the 
correct entity that an alias is being compared against", + "source_ids": [ + 73 + ], + "id": "Name: true match\nType: CONCEPT" + }, + { + "entity_name": "equivalent aliases", + "entity_type": "CONCEPT", + "description": "equivalent aliases refers to a small set of aliases that are considered the same as the true match", + "source_ids": [ + 73 + ], + "id": "Name: equivalent aliases\nType: CONCEPT" + }, + { + "entity_name": "gradient", + "entity_type": "MEASUREMENT", + "description": "gradient refers to the sharp decline in relevance scores mentioned in the text", + "source_ids": [ + 73 + ], + "id": "Name: gradient\nType: MEASUREMENT" + }, + { + "entity_name": "irrelevant entities", + "entity_type": "TASK_OR_PROBLEM", + "description": "irrelevant entities are the entities that follow the sharp decline in relevance scores", + "source_ids": [ + 73 + ], + "id": "Name: irrelevant entities\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "gradient based er algorithm", + "entity_type": "TECHNOLOGY", + "description": "the gradient based er algorithm is a method designed to detect sharp declines characteristic of case b and isolate high relevance sets", + "source_ids": [ + 74 + ], + "id": "Name: gradient based er algorithm\nType: TECHNOLOGY" + }, + { + "entity_name": "high relevance set", + "entity_type": "DATASET_OR_CORPUS", + "description": "the high relevance set is a collection of entities isolated by the gradient based er algorithm for further processing", + "source_ids": [ + 74 + ], + "id": "Name: high relevance set\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "similar entities", + "entity_type": "DATASET_OR_CORPUS", + "description": "similar entities are a group of items identified within the high relevance set that require finer grained distinction", + "source_ids": [ + 74 + ], + "id": "Name: similar entities\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "v n", + "entity_type": "TASK_OR_PROBLEM", + "description": "v n is a new entity being processed in 
the entity resolution process", + "source_ids": [ + 75 + ], + "id": "Name: v n\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "e c", + "entity_type": "TASK_OR_PROBLEM", + "description": "e c represents the top k candidates retrieved for the new entity v n", + "source_ids": [ + 75 + ], + "id": "Name: e c\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "db", + "entity_type": "TASK_OR_PROBLEM", + "description": "db is the vector database from which candidates are retrieved", + "source_ids": [ + 75 + ], + "id": "Name: db\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "r", + "entity_type": "TASK_OR_PROBLEM", + "description": "r is the reranker used to re rank candidates against v n", + "source_ids": [ + 75 + ], + "id": "Name: r\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "s", + "entity_type": "TASK_OR_PROBLEM", + "description": "s represents the scores assigned to the candidates", + "source_ids": [ + 75 + ], + "id": "Name: s\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "sel", + "entity_type": "TASK_OR_PROBLEM", + "description": "sel is the selection set initialized with the top scoring candidate", + "source_ids": [ + 75 + ], + "id": "Name: sel\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "v sel", + "entity_type": "TASK_OR_PROBLEM", + "description": "v sel is the canonical entity selected from the selection set sel", + "source_ids": [ + 75 + ], + "id": "Name: v sel\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "lines 1 3", + "entity_type": "SECTION_TITLE", + "description": "lines 1 3 describe the initial retrieval and reranking steps of the algorithm", + "source_ids": [ + 75 + ], + "id": "Name: lines 1 3\nType: SECTION_TITLE" + }, + { + "entity_name": "line 4", + "entity_type": "SECTION_TITLE", + "description": "line 4 describes the initialization of the selection set and the initial score", + "source_ids": [ + 75 + ], + "id": "Name: line 4\nType: SECTION_TITLE" + }, + { + "entity_name": "lines 5 8", + "entity_type": "SECTION_TITLE", + 
"description": "lines 5 8 describe the iteration through remaining candidates and the gradient threshold check", + "source_ids": [ + 75 + ], + "id": "Name: lines 5 8\nType: SECTION_TITLE" + }, + { + "entity_name": "lines 7 8", + "entity_type": "SECTION_TITLE", + "description": "lines 7 8 detail the logic for adding candidates to the selection set and updating scores", + "source_ids": [ + 75 + ], + "id": "Name: lines 7 8\nType: SECTION_TITLE" + }, + { + "entity_name": "line 8", + "entity_type": "SECTION_TITLE", + "description": "line 8 describes the condition where the loop breaks upon detecting a sharp score drop", + "source_ids": [ + 75 + ], + "id": "Name: line 8\nType: SECTION_TITLE" + }, + { + "entity_name": "lines 9 14", + "entity_type": "SECTION_TITLE", + "description": "lines 9 14 describe the final decision making logic of the algorithm", + "source_ids": [ + 75 + ], + "id": "Name: lines 9 14\nType: SECTION_TITLE" + }, + { + "entity_name": "line 9 10", + "entity_type": "SECTION_TITLE", + "description": "lines 9 10 describe the action taken in case a where a new entity is added", + "source_ids": [ + 75 + ], + "id": "Name: line 9 10\nType: SECTION_TITLE" + }, + { + "entity_name": "lines 12 14", + "entity_type": "SECTION_TITLE", + "description": "lines 12 14 describe the merging of the new entity with the canonical entity in case b", + "source_ids": [ + 75 + ], + "id": "Name: lines 12 14\nType: SECTION_TITLE" + }, + { + "entity_name": "line 13", + "entity_type": "SECTION_TITLE", + "description": "line 13 describes the use of an llm to select a canonical entity when multiple aliases exist", + "source_ids": [ + 75 + ], + "id": "Name: line 13\nType: SECTION_TITLE" + }, + { + "entity_name": "line 15", + "entity_type": "SECTION_TITLE", + "description": "line 15 describes the return of the updated g and db structures", + "source_ids": [ + 75 + ], + "id": "Name: line 15\nType: SECTION_TITLE" + }, + { + "entity_name": "score", + "entity_type": "PARAMETER_OR_VARIABLE", + 
"description": "The score is a parameter or variable updated during iteration to track the current score value, and it also serves as the Y-axis label indicating the numerical value being measured in both charts.", + "source_ids": [ + 178, + 75 + ], + "id": "Name: score\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "v c", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "v c represents the current candidate being evaluated in the iteration", + "source_ids": [ + 75 + ], + "id": "Name: v c\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "e 9", + "entity_type": "TASK_OR_PROBLEM", + "description": "e 9 is a new entity being processed and compared against existing entities in the kg", + "source_ids": [ + 76 + ], + "id": "Name: e 9\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "e 6", + "entity_type": "TASK_OR_PROBLEM", + "description": "e 6 is an existing entity in the kg that shows a sharp decline in similarity with e 9", + "source_ids": [ + 76 + ], + "id": "Name: e 6\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "e 8", + "entity_type": "TASK_OR_PROBLEM", + "description": "e 8 is an existing entity in the kg that shows a sharp decline in similarity with e 9", + "source_ids": [ + 76 + ], + "id": "Name: e 8\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "e 5", + "entity_type": "TASK_OR_PROBLEM", + "description": "e 5 is an existing entity in the kg that shows a sharp decline in similarity with e 9", + "source_ids": [ + 76 + ], + "id": "Name: e 5\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "e 7", + "entity_type": "TASK_OR_PROBLEM", + "description": "e 7 is the final merged entity resulting from the consolidation of e 9 and e 7", + "source_ids": [ + 76 + ], + "id": "Name: e 7\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "similarity curve", + "entity_type": "IMAGE", + "description": "the similarity curve is a visual depiction orange line showing the similarity levels between entities", + "source_ids": [ + 76 + ], + "id": 
"Name: similarity curve\nType: IMAGE" + }, + { + "entity_name": "gradient based selection process", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the gradient based selection process is the method used to identify high confidence matches between entities", + "source_ids": [ + 76 + ], + "id": "Name: gradient based selection process\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "orange line", + "entity_type": "IMAGE", + "description": "the orange line is a specific visual element within the similarity curve mentioned in the text", + "source_ids": [ + 76 + ], + "id": "Name: orange line\nType: IMAGE" + }, + { + "entity_name": "unique high confidence match", + "entity_type": "CONCEPT", + "description": "a unique high confidence match is the result of the gradient based selection process identifying e 7 for e 9", + "source_ids": [ + 76 + ], + "id": "Name: unique high confidence match\nType: CONCEPT" + }, + { + "entity_name": "consolidated information", + "entity_type": "CONCEPT", + "description": "consolidated information refers to the enriched data resulting from merging entities in the kg", + "source_ids": [ + 76 + ], + "id": "Name: consolidated information\nType: CONCEPT" + }, + { + "entity_name": "kg construction phase", + "entity_type": "TASK_OR_PROBLEM", + "description": "kg construction phase is a specific stage described in the text where origin tree nodes are recorded for extracted entities", + "source_ids": [ + 77 + ], + "id": "Name: kg construction phase\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "origin tree node", + "entity_type": "HARDWARE", + "description": "origin tree node is a structural location recorded for every newly extracted entity", + "source_ids": [ + 77 + ], + "id": "Name: origin tree node\nType: HARDWARE" + }, + { + "entity_name": "canonical entity", + "entity_type": "CONCEPT", + "description": "canonical entity is the target of merging during entity resolution receiving updated origin node sets", + "source_ids": [ + 
77 + ], + "id": "Name: canonical entity\nType: CONCEPT" + }, + { + "entity_name": "g", + "entity_type": "CONCEPT", + "description": "g is a component of the bookindex structure b", + "source_ids": [ + 77 + ], + "id": "Name: g\nType: CONCEPT" + }, + { + "entity_name": "t", + "entity_type": "CONCEPT", + "description": "t is a component of the bookindex structure b and represents the set of structural locations nodes", + "source_ids": [ + 77 + ], + "id": "Name: t\nType: CONCEPT" + }, + { + "entity_name": "v i", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "v i represents a newly extracted entity for which an origin tree node is recorded", + "source_ids": [ + 77 + ], + "id": "Name: v i\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "v n", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "v n represents an entity that is merged into a canonical entity during entity resolution", + "source_ids": [ + 77 + ], + "id": "Name: v n\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "v sel", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "v sel represents the canonical entity into which v n is merged", + "source_ids": [ + 77 + ], + "id": "Name: v sel\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "p n", + "entity_type": "MATHEMATICAL_CONCEPT", + "description": "p n represents the power set of nodes n used in the definition of the mapping m", + "source_ids": [ + 77 + ], + "id": "Name: p n\nType: MATHEMATICAL_CONCEPT" + }, + { + "entity_name": "m", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 77 + ], + "id": "Name: m\nType: UNKNOWN" + }, + { + "entity_name": "5 agent-based retrieval", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of the main paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section details the proposed agent-based query method inspired by Information Foraging Theory, which 
dynamically classifies queries and employs a tailored retrieval workflow.", + "source_ids": [ + 78 + ], + "id": "Name: 5 agent-based retrieval\nType: SECTION_TITLE" + }, + { + "entity_name": "agent-based query method", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Refers to the specific retrieval strategy introduced in section 5, which utilizes agents to dynamically classify queries based on Information Foraging Theory.", + "source_ids": [ + 78 + ], + "id": "Name: agent-based query method\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "bookindex", + "entity_type": "DATABASE", + "description": "bookindex is the data structure or system on which bookrag executes operations for document queries", + "source_ids": [ + 79 + ], + "id": "Name: bookindex\nType: DATABASE" + }, + { + "entity_name": "agent based planning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Agent based planning is a method used for planning that serves as a core mechanism in Bookrag to formulate strategies for operations and acts as a component of the workflow that classifies queries. 
It is also a mechanism assessed for its necessity in the system's performance, and its removal in a specific scenario leads to the adoption of a default workflow.", + "source_ids": [ + 166, + 172, + 79, + 157, + 93 + ], + "id": "Name: agent based planning\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "structured execution", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "structured execution is a core mechanism in bookrag that includes the retrieval process based on ift and generation principles", + "source_ids": [ + 79 + ], + "id": "Name: structured execution\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "modal type filtering", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "modal type filtering is an operation mentioned as necessary for addressing complex real world document queries", + "source_ids": [ + 79 + ], + "id": "Name: modal type filtering\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "semantic selection", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "semantic selection is an operation mentioned as necessary for addressing complex real world document queries", + "source_ids": [ + 79 + ], + "id": "Name: semantic selection\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "multi hop reasoning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "multi hop reasoning is an operation mentioned as necessary for addressing complex real world document queries", + "source_ids": [ + 79 + ], + "id": "Name: multi hop reasoning\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "generation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "generation is a process included within the structured execution mechanism of bookrag", + "source_ids": [ + 79 + ], + "id": "Name: generation\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "real world document queries", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 79 + ], + "id": "Name: real world document queries\nType: 
UNKNOWN" + }, + { + "entity_name": "5.1 overall workflow", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Agent-Based Retrieval', this section introduces the general operational flow of the BookRAG system, outlining how it intelligently plans and executes operations on the BookIndex to handle complex document queries.", + "source_ids": [ + 80 + ], + "id": "Name: 5.1 overall workflow\nType: SECTION_TITLE" + }, + { + "entity_name": "figure 3", + "entity_type": "IMAGE", + "description": "Figure 3 is an illustration depicting the general workflow of agent-based retrieval in BookRag.", + "source_ids": [ + 81, + 83 + ], + "id": "Name: figure 3\nType: IMAGE" + }, + { + "entity_name": "three stage pipeline", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the three stage pipeline is the structure of the workflow used to address users queries", + "source_ids": [ + 81 + ], + "id": "Name: three stage pipeline\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "agent based planning", + "entity_type": "TASK_OR_PROBLEM", + "description": "Agent based planning is a process component within the agent based retrieval workflow of BookRag, serving as a stage that involves classification and planning for queries.", + "source_ids": [ + 82, + 83 + ], + "id": "Name: agent based planning\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "classification plan", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "classification plan is the specific stage within agent based planning aimed at distinguishing query types", + "source_ids": [ + 82 + ], + "id": "Name: classification plan\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "transformer", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "transformer is a model architecture mentioned as an example in a query regarding long range dependencies", + "source_ids": [ + 82 + ], + "id": "Name: transformer\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "rnns", + "entity_type": 
"MODEL_OR_ARCHITECTURE", + "description": "rnns are model architectures mentioned as an example in a query regarding long range dependencies", + "source_ids": [ + 82 + ], + "id": "Name: rnns\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "bookindex", + "entity_type": "DATASET_OR_CORPUS", + "description": "Bookindex is a dataset or corpus that serves as a predefined set of operators used to generate plans for retrieval and generation strategies. It functions as a data structure, represented as b t g m, which is navigated during the retrieval process, including by the system known as bookrag.", + "source_ids": [ + 88, + 82, + 85 + ], + "id": "Name: bookindex\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "operators plan", + "entity_type": "TASK_OR_PROBLEM", + "description": "an operators plan is generated to guide retrieval and generation strategies", + "source_ids": [ + 82 + ], + "id": "Name: operators plan\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "retrieval", + "entity_type": "TASK_OR_PROBLEM", + "description": "Retrieval is a task or problem area where errors are identified as the dominant failure mode. It functions as a strategy guided by an operator's plan and serves as a process component within the agent-based retrieval workflow of BookRag. In the context of graph-based RAG methods, retrieval is the process performed after extracting textual content, while in other contexts, it refers to the effectiveness of retrieving information, which is evaluated. Additionally, retrieval involves the process of finding evidence, a task that is improved by layout parsing.", + "source_ids": [ + 82, + 83, + 147, + 151, + 152, + 185 + ], + "id": "Name: retrieval\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "Generation is a strategy guided by the operators plan and serves as a process component within the agent-based retrieval workflow of Bookrag. 
It is the generative component integrated into the structured execution workflow, where both selection paths proceed to this stage. As a task or problem area, generation involves the process of creating output, which is made accurate by Bookrag, and it is notable as the second most common failure mode where errors are identified.", + "source_ids": [ + 82, + 115, + 83, + 152, + 185, + 123 + ], + "id": "Name: generation\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "agent based retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "agent based retrieval is a workflow containing planning retrieval and generation processes used in bookrag", + "source_ids": [ + 83 + ], + "id": "Name: agent based retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "workflow", + "entity_type": "TASK_OR_PROBLEM", + "description": "Workflow refers to the general process flow of agent-based retrieval in BookRag and is defined as the generated sequence of operations executed by BookRag.", + "source_ids": [ + 83, + 124 + ], + "id": "Name: workflow\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "planning", + "entity_type": "TASK_OR_PROBLEM", + "description": "Planning is a specific step within the agent-based retrieval process where a plan is formulated to solve the query, and it represents the task or problem component that is removed in the described scenario.", + "source_ids": [ + 83, + 166, + 94 + ], + "id": "Name: planning\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "generation processes", + "entity_type": "TASK_OR_PROBLEM", + "description": "generation processes are a component of the agent based retrieval workflow in bookrag", + "source_ids": [ + 83 + ], + "id": "Name: generation processes\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "cref='#/texts/89'", + "entity_type": "IMAGE", + "description": "A flowchart diagram illustrating a three-stage process involving planning, retrieval, and generation to answer a question.", + "source_ids": [ + 84 
+ ], + "id": "Name: cref='#/texts/89'\nType: IMAGE" + }, + { + "entity_name": "question", + "entity_type": "TASK_OR_PROBLEM", + "description": "A question serves as the input trigger for the system, often represented by an icon of a person with a question mark. It is recognized as a source of key entities used to identify information scents and functions as the item that needs to be answered in a single hop task.", + "source_ids": [ + 243, + 84, + 125 + ], + "id": "Name: question\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "agent-based planning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Agent-based planning is a methodology where an agent formulates strategies to handle complex retrieval tasks involving modal filtering and multi-hop reasoning. It serves as the first stage of a process that manages classification and planning tasks, functioning as a strategy where operators are selected to decompose or handle specific queries.", + "source_ids": [ + 84, + 182, + 87 + ], + "id": "Name: agent-based planning\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "retrieval process", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The retrieval process is the second stage of the overall procedure, which employs scent or filter-based mechanisms to retrieve information. 
It specifically refers to the mechanism for retrieving information from the BookIndex as described in section 5.3.", + "source_ids": [ + 123, + 84 + ], + "id": "Name: retrieval process\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "generation process", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The third stage of the process, responsible for analysis and merging data to form the output.", + "source_ids": [ + 84 + ], + "id": "Name: generation process\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "retrieval process", + "entity_type": "TASK_OR_PROBLEM", + "description": "retrieval process is a stage guided by an operator plan that executes scent filter based retrieval", + "source_ids": [ + 85 + ], + "id": "Name: retrieval process\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "scent filter based retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "scent filter based retrieval is the specific method executed during the retrieval process to find information", + "source_ids": [ + 85 + ], + "id": "Name: scent filter based retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "t", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "t is a component of the bookindex data structure, which includes the elements b, t, g, and m.", + "source_ids": [ + 88, + 85 + ], + "id": "Name: t\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "operator plan", + "entity_type": "TASK_OR_PROBLEM", + "description": "An operator plan is the guiding document or set of instructions for the retrieval process, representing the final task of an agent to generate an executable plan after classifying a query. 
It consists of the specific sequence of operators chosen to solve the problem, such as Extract, Select, Reason, Skyline, and Map.", + "source_ids": [ + 112, + 85, + 94 + ], + "id": "Name: operator plan\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "modal type", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "modal type is a specific filter used to refine the selection of information during retrieval", + "source_ids": [ + 85 + ], + "id": "Name: modal type\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "relevant entities", + "entity_type": "DATASET_OR_CORPUS", + "description": "relevant entities are the items found in g that are followed during scent based retrieval", + "source_ids": [ + 85 + ], + "id": "Name: relevant entities\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "information blocks", + "entity_type": "DATASET_OR_CORPUS", + "description": "information blocks are the highly relevant units of data retrieved by bookrag", + "source_ids": [ + 85 + ], + "id": "Name: information blocks\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "generation process", + "entity_type": "TASK_OR_PROBLEM", + "description": "generation process is the final stage where retrieved information is synthesized and analyzed to formulate a coherent response", + "source_ids": [ + 86 + ], + "id": "Name: generation process\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "analysis merging", + "entity_type": "TASK_OR_PROBLEM", + "description": "analysis merging is the specific activity within the generation stage that synthesizes fragmented evidence", + "source_ids": [ + 86 + ], + "id": "Name: analysis merging\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "retrieved information", + "entity_type": "DATASET_OR_CORPUS", + "description": "retrieved information refers to the data collected and brought into the generation stage for processing", + "source_ids": [ + 86 + ], + "id": "Name: retrieved information\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": 
"fragmented pieces of evidence", + "entity_type": "DATASET_OR_CORPUS", + "description": "fragmented pieces of evidence are the specific incomplete data items that are synthesized during the process", + "source_ids": [ + 86 + ], + "id": "Name: fragmented pieces of evidence\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "coherent response", + "entity_type": "PRODUCT", + "description": "coherent response is the final output formulated by the generation stage after analysis", + "source_ids": [ + 86 + ], + "id": "Name: coherent response\nType: PRODUCT" + }, + { + "entity_name": "5.2 agent-based planning", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Agent-Based Retrieval' (Section 5), this section details the strategy formulation mechanism within the BookRAG framework, explaining how an agent intelligently plans operations for complex document queries.", + "source_ids": [ + 87 + ], + "id": "Name: 5.2 agent-based planning\nType: SECTION_TITLE" + }, + { + "entity_name": "formulator", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "The formulator is one of four types of operators defined to support flexible retrieval in BookRag and is one of the four operator types depicted in the BookRag operator library.", + "source_ids": [ + 88, + 93 + ], + "id": "Name: formulator\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "selector", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "Selector is one of four types of operators defined to support flexible retrieval in BookRag and is one of the four operator types depicted in the BookRag operator library.", + "source_ids": [ + 88, + 93 + ], + "id": "Name: selector\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "reasoner", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "Reasoner is one of the four operator types defined in the BookRag operator library to support flexible retrieval.", + "source_ids": [ + 88, + 93 + ], + "id": "Name: reasoner\nType: 
MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "synthesizer", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "The synthesizer is one of four operator types defined within the BookRag operator library to support flexible retrieval.", + "source_ids": [ + 88, + 93 + ], + "id": "Name: synthesizer\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "agent", + "entity_type": "TASK_OR_PROBLEM", + "description": "The agent is an entity that performs the first step of the sequential process in BookRAG and employs operators for diverse query categories.", + "source_ids": [ + 88, + 97 + ], + "id": "Name: agent\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "query categories", + "entity_type": "TASK_OR_PROBLEM", + "description": "Query categories are specific requirements that BookRags adapts to using its operators, representing the diverse groups of queries for which the agent employs these operators.", + "source_ids": [ + 88, + 97 + ], + "id": "Name: query categories\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "execution pipelines", + "entity_type": "TASK_OR_PROBLEM", + "description": "execution pipelines are formed by combining operators to support flexible retrieval", + "source_ids": [ + 88 + ], + "id": "Name: execution pipelines\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "adjustable parameters", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "adjustable parameters are attributes of the execution pipelines that can be configured", + "source_ids": [ + 88 + ], + "id": "Name: adjustable parameters\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "table 2", + "entity_type": "TABLE", + "description": "Table 2 is a reference in the text that lists and defines three common query categories addressed in BookRags.", + "source_ids": [ + 96, + 89 + ], + "id": "Name: table 2\nType: TABLE" + }, + { + "entity_name": "table: cref='#/texts/95'...", + "entity_type": "TABLE", + "description": "A data table described as: 
cref='#/texts/95'", + "source_ids": [ + 90 + ], + "id": "Name: table: cref='#/texts/95'...\nType: TABLE" + }, + { + "entity_name": "6", + "entity_type": "MEASUREMENT", + "description": "6 is a numerical value mentioned in the text, potentially representing a count or measurement, and specifically serves as the issue number of the journal ACM Computing Surveys where the paper was published.", + "source_ids": [ + 202, + 91 + ], + "id": "Name: 6\nType: MEASUREMENT" + }, + { + "entity_name": "operator set", + "entity_type": "TASK_OR_PROBLEM", + "description": "operator set is a task or problem mentioned in the text likely referring to a specific set of operators in a technical context", + "source_ids": [ + 92 + ], + "id": "Name: operator set\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "figure 4", + "entity_type": "IMAGE", + "description": "Figure 4 is a visual element referenced in the text, specifically part A, that depicts the BookRAG operator library along with an execution example.", + "source_ids": [ + 97, + 93 + ], + "id": "Name: figure 4\nType: IMAGE" + }, + { + "entity_name": "bookrag operator library", + "entity_type": "SOFTWARE", + "description": "the bookrag operator library is a software component containing four operator types", + "source_ids": [ + 93 + ], + "id": "Name: bookrag operator library\nType: SOFTWARE" + }, + { + "entity_name": "mmlongbench dataset", + "entity_type": "DATASET_OR_CORPUS", + "description": "the mmlongbench dataset is the source of the execution example shown in the text", + "source_ids": [ + 93 + ], + "id": "Name: mmlongbench dataset\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "single hop", + "entity_type": "TASK_OR_PROBLEM", + "description": "Single hop is a type of query and task where an agent first attempts to extract an entity, often used to evaluate the performance of systems like BookRag and in QA performance breakdowns. 
It refers to a specific query case where the reasoning space is significantly reduced, such as from 134 to 24 nodes, and is characterized by the ability to answer a question by retrieving information from a single location, with execution traces often demonstrated to illustrate its mechanics.", + "source_ids": [ + 135, + 177, + 115, + 243, + 179, + 186, + 93 + ], + "id": "Name: single hop\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "operator", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "operator is a general term for the components formulator selector reasoner synthesizer within the bookrag system", + "source_ids": [ + 93 + ], + "id": "Name: operator\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "execution trace", + "entity_type": "TASK_OR_PROBLEM", + "description": "execution trace is the step by step record of the agent based planning and operator execution shown in the text", + "source_ids": [ + 93 + ], + "id": "Name: execution trace\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "step by step operator execution", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "step by step operator execution is the method of executing operators demonstrated in the text", + "source_ids": [ + 93 + ], + "id": "Name: step by step operator execution\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "operator-set", + "entity_type": "IMAGE", + "description": "A diagram illustrating a framework for processing queries, divided into an 'Operators' section and an 'Execution example' section.", + "source_ids": [ + 94 + ], + "id": "Name: operator-set\nType: IMAGE" + }, + { + "entity_name": "extract", + "entity_type": "TASK_OR_PROBLEM", + "description": "The initial step in the operator set where questions are decomposed to identify entities.", + "source_ids": [ + 94 + ], + "id": "Name: extract\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "decompose", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Decompose is a method and 
specific operator used within the Extract phase that breaks down a complex query into simpler, actionable sub-queries, a technique leveraged by BookRag to prune search spaces.", + "source_ids": [ + 186, + 94, + 98 + ], + "id": "Name: decompose\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "entities", + "entity_type": "DATASET_OR_CORPUS", + "description": "The output of the Extract phase, representing distinct items identified from the input text.", + "source_ids": [ + 94 + ], + "id": "Name: entities\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "sub-queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "Smaller queries generated during the decomposition process.", + "source_ids": [ + 94 + ], + "id": "Name: sub-queries\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "formulator", + "entity_type": "SYSTEM_COMPONENT", + "description": "The component or agent responsible for the extraction and decomposition steps.", + "source_ids": [ + 94 + ], + "id": "Name: formulator\nType: SYSTEM_COMPONENT" + }, + { + "entity_name": "filter", + "entity_type": "TASK_OR_PROBLEM", + "description": "An operator that processes data structures like trees to select relevant information.", + "source_ids": [ + 94 + ], + "id": "Name: filter\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "select", + "entity_type": "TASK_OR_PROBLEM", + "description": "The action performed by the Filter operator to choose specific elements.", + "source_ids": [ + 94 + ], + "id": "Name: select\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "selector", + "entity_type": "SYSTEM_COMPONENT", + "description": "The component responsible for filtering and selecting data based on criteria.", + "source_ids": [ + 94 + ], + "id": "Name: selector\nType: SYSTEM_COMPONENT" + }, + { + "entity_name": "reason", + "entity_type": "TASK_OR_PROBLEM", + "description": "An operator that takes Graph and Text inputs to perform reasoning tasks.", + "source_ids": [ + 94 + ], + "id": "Name: reason\nType: 
TASK_OR_PROBLEM" + }, + { + "entity_name": "graph", + "entity_type": "DATA_STRUCTURE", + "description": "A visual representation of data used as input for the Reason operator.", + "source_ids": [ + 94 + ], + "id": "Name: graph\nType: DATA_STRUCTURE" + }, + { + "entity_name": "text", + "entity_type": "DATA_STRUCTURE", + "description": "Raw textual data used as input for the Reason operator.", + "source_ids": [ + 94 + ], + "id": "Name: text\nType: DATA_STRUCTURE" + }, + { + "entity_name": "s:", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "A label indicating a score or similarity matrix with values such as 0.6, 0.5, 0.4.", + "source_ids": [ + 94 + ], + "id": "Name: s:\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "skyline", + "entity_type": "TASK_OR_PROBLEM", + "description": "An operator that processes ranked lists (S1, S2) to find optimal solutions.", + "source_ids": [ + 94 + ], + "id": "Name: skyline\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "reasoner", + "entity_type": "SYSTEM_COMPONENT", + "description": "The component executing the Reason and Skyline operations.", + "source_ids": [ + 94 + ], + "id": "Name: reasoner\nType: SYSTEM_COMPONENT" + }, + { + "entity_name": "map", + "entity_type": "TASK_OR_PROBLEM", + "description": "The map is an operator that transforms data using icons representing different formats and performs analysis on specific retrieved information segments to generate partial responses.", + "source_ids": [ + 94, + 111 + ], + "id": "Name: map\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "reduce", + "entity_type": "TASK_OR_PROBLEM", + "description": "Reduce is an operator that combines multiple inputs into a single result by synthesizing a final coherent answer through the aggregation of information from various sources.", + "source_ids": [ + 94, + 111 + ], + "id": "Name: reduce\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "synthesizer", + "entity_type": "SYSTEM_COMPONENT", + "description": "The final 
component that aggregates results into a coherent answer.", + "source_ids": [ + 94 + ], + "id": "Name: synthesizer\nType: SYSTEM_COMPONENT" + }, + { + "entity_name": "execution example", + "entity_type": "SECTION_TITLE", + "description": "A subsection of the diagram showing a concrete application of the operator set.", + "source_ids": [ + 94 + ], + "id": "Name: execution example\nType: SECTION_TITLE" + }, + { + "entity_name": "q: what is the type of car in the ranking prompt example?", + "entity_type": "TASK_OR_PROBLEM", + "description": "The specific user question being processed in the execution example.", + "source_ids": [ + 94 + ], + "id": "Name: q: what is the type of car in the ranking prompt example?\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "simple query...", + "entity_type": "TASK_OR_PROBLEM", + "description": "A classification of the input query.", + "source_ids": [ + 94 + ], + "id": "Name: simple query...\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "car", + "entity_type": "PRODUCT", + "description": "A car is a key entity identified in the query regarding the type of car in the ranking prompt example.", + "source_ids": [ + 94, + 135 + ], + "id": "Name: car\nType: PRODUCT" + }, + { + "entity_name": "ranking prompt", + "entity_type": "BOOK", + "description": "An entity mentioned in the question context.", + "source_ids": [ + 94 + ], + "id": "Name: ranking prompt\nType: BOOK" + }, + { + "entity_name": "method", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A node in the planning graph representing the method to be used.", + "source_ids": [ + 94 + ], + "id": "Name: method\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "method and its descendants", + "entity_type": "SECTION_TITLE", + "description": "A grouping of nodes related to the Method in the execution flow.", + "source_ids": [ + 94 + ], + "id": "Name: method and its descendants\nType: SECTION_TITLE" + }, + { + "entity_name": "a: based on the provided information...", + 
"entity_type": "TASK_OR_PROBLEM", + "description": "The final answer generated by the system.", + "source_ids": [ + 94 + ], + "id": "Name: a: based on the provided information...\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "mercedes-benz e-class sedan", + "entity_type": "VEHICLE", + "description": "The specific car type identified as the correct answer in the example.", + "source_ids": [ + 94 + ], + "id": "Name: mercedes-benz e-class sedan\nType: VEHICLE" + }, + { + "entity_name": "image cref='#/texts/98'", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 94 + ], + "id": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "entity_name": "query classification", + "entity_type": "TASK_OR_PROBLEM", + "description": "Query classification is a task designed to determine the appropriate solution strategy by categorizing queries based on their complexity, thereby enabling effective agent strategy selection, and it is the specific problem for which the prompt in figure 10 is intended.", + "source_ids": [ + 96, + 253, + 95 + ], + "id": "Name: query classification\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "operator plan", + "entity_type": "PRODUCT", + "description": "operator plan is a specific output generated after determining the solution strategy", + "source_ids": [ + 95 + ], + "id": "Name: operator plan\nType: PRODUCT" + }, + { + "entity_name": "single hop", + "entity_type": "EVENT", + "description": "single hop is a query category requiring a single piece of information retrieved via a scent based retrieval operation", + "source_ids": [ + 96 + ], + "id": "Name: single hop\nType: EVENT" + }, + { + "entity_name": "multi hop", + "entity_type": "EVENT", + "description": "multi hop is a query category defined by its intrinsic complexity and operational demands", + "source_ids": [ + 96 + ], + "id": "Name: multi hop\nType: EVENT" + }, + { + "entity_name": "global aggregation", + "entity_type": "EVENT", + "description": "global 
aggregation is a query category necessitating analysis under multiple filtering conditions", + "source_ids": [ + 96 + ], + "id": "Name: global aggregation\nType: EVENT" + }, + { + "entity_name": "scent based retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "scent based retrieval is a method used to retrieve a single piece of information for single hop queries", + "source_ids": [ + 96 + ], + "id": "Name: scent based retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "filter aggregation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "filter aggregation is a sequence of operations used to analyze content under multiple filtering conditions for global aggregation queries", + "source_ids": [ + 96 + ], + "id": "Name: filter aggregation\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "agent strategy selection", + "entity_type": "TASK_OR_PROBLEM", + "description": "agent strategy selection is a process enabled by query classification to determine the appropriate solution strategy", + "source_ids": [ + 96 + ], + "id": "Name: agent strategy selection\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "intrinsic complexity", + "entity_type": "CONCEPT", + "description": "intrinsic complexity is an attribute used to define the query categories", + "source_ids": [ + 96 + ], + "id": "Name: intrinsic complexity\nType: CONCEPT" + }, + { + "entity_name": "operational demands", + "entity_type": "CONCEPT", + "description": "operational demands are factors used to define the query categories alongside intrinsic complexity", + "source_ids": [ + 96 + ], + "id": "Name: operational demands\nType: CONCEPT" + }, + { + "entity_name": "solution strategy", + "entity_type": "CONCEPT", + "description": "solution strategy refers to the different approaches required for each query category", + "source_ids": [ + 96 + ], + "id": "Name: solution strategy\nType: CONCEPT" + }, + { + "entity_name": "filtering conditions", + "entity_type": "CONCEPT", + 
"description": "filtering conditions are multiple criteria used in the analysis of global aggregation queries", + "source_ids": [ + 96 + ], + "id": "Name: filtering conditions\nType: CONCEPT" + }, + { + "entity_name": "document", + "entity_type": "OBJECT", + "description": "document refers to the source material where content is analyzed during global aggregation queries", + "source_ids": [ + 96 + ], + "id": "Name: document\nType: OBJECT" + }, + { + "entity_name": "additional operators", + "entity_type": "SOFTWARE", + "description": "additional operators are components integrated into bookrag to extend its capabilities", + "source_ids": [ + 96 + ], + "id": "Name: additional operators\nType: SOFTWARE" + }, + { + "entity_name": "bookindex operators", + "entity_type": "TASK_OR_PROBLEM", + "description": "bookindex operators are a set of strategies designed to execute tasks identified by classification", + "source_ids": [ + 97 + ], + "id": "Name: bookindex operators\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "o", + "entity_type": "TASK_OR_PROBLEM", + "description": "o represents the set of operators tailored for the bookindex", + "source_ids": [ + 97 + ], + "id": "Name: o\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "figure 4 a", + "entity_type": "IMAGE", + "description": "figure 4 a is a visual depiction of the operators", + "source_ids": [ + 97 + ], + "id": "Name: figure 4 a\nType: IMAGE" + }, + { + "entity_name": "table 3", + "entity_type": "TABLE", + "description": "Table 3 provides detailed information about the operators utilized in BookRAG by categorizing them according to their function.", + "source_ids": [ + 97, + 131 + ], + "id": "Name: table 3\nType: TABLE" + }, + { + "entity_name": "classification", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "classification is the method used to identify the strategies executed by the operators", + "source_ids": [ + 97 + ], + "id": "Name: classification\nType: METHOD_OR_TECHNIQUE" + }, + { + 
"entity_name": "formulator", + "entity_type": "TASK_OR_PROBLEM", + "description": "formulator is a category of llm based operators that prepare queries for execution", + "source_ids": [ + 98 + ], + "id": "Name: formulator\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "extract", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Extract is a method that employs a large language model to identify key entities from query text and link them to a knowledge graph, such as identifying entities like \"car\".", + "source_ids": [ + 98, + 135 + ], + "id": "Name: extract\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "qs", + "entity_type": "TASK_OR_PROBLEM", + "description": "qs represents the set of simpler actionable sub queries generated by the decompose method", + "source_ids": [ + 98 + ], + "id": "Name: qs\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "eq", + "entity_type": "TASK_OR_PROBLEM", + "description": "eq represents the set of key entities identified by the extract method", + "source_ids": [ + 98 + ], + "id": "Name: eq\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "pdec", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "pdec is a parameter used in the llm function to generate sub queries", + "source_ids": [ + 98 + ], + "id": "Name: pdec\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "pext", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "pext is a parameter used in the llm function to identify key entities", + "source_ids": [ + 98 + ], + "id": "Name: pext\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "sub queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "sub queries are the simpler actionable components resulting from breaking down a complex query", + "source_ids": [ + 98 + ], + "id": "Name: sub queries\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "query text", + "entity_type": "TASK_OR_PROBLEM", + "description": "query text is the source material from which the extract method 
identifies key entities", + "source_ids": [ + 98 + ], + "id": "Name: query text\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "entities", + "entity_type": "TASK_OR_PROBLEM", + "description": "entities are the key items identified in the query text and linked to the knowledge graph", + "source_ids": [ + 98 + ], + "id": "Name: entities\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "formula (2)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the output Q(s) as a set of query vectors generated by an LLM. LaTeX: 𝑄 𝑠 = LLM ( 𝑃 𝐷𝑒𝑐 , 𝑞 ) = { 𝑞 , 𝑞 1 2 , . . . , 𝑞 𝑘 } (2)", + "source_ids": [ + 99 + ], + "id": "Name: formula (2)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "formula (3)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the output of an LLM function as a set of elements. LaTeX: 𝐸 𝑞 = LLM ( 𝑃 𝐸𝑥𝑡 , 𝑞 ) = { 𝑒 1 , 𝑒 2 , . . . , 𝑒 𝑚 } (3)", + "source_ids": [ + 100 + ], + "id": "Name: formula (3)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "p dec", + "entity_type": "SOFTWARE", + "description": "p dec represents a prompt used to guide the llm for the decomposition task", + "source_ids": [ + 101 + ], + "id": "Name: p dec\nType: SOFTWARE" + }, + { + "entity_name": "p ext", + "entity_type": "SOFTWARE", + "description": "p ext represents a prompt used to guide the llm for the extraction task", + "source_ids": [ + 101 + ], + "id": "Name: p ext\nType: SOFTWARE" + }, + { + "entity_name": "decomposition", + "entity_type": "TASK_OR_PROBLEM", + "description": "decomposition is a task for which the prompt p dec is used to guide the llm", + "source_ids": [ + 101 + ], + "id": "Name: decomposition\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "extraction", + "entity_type": "TASK_OR_PROBLEM", + "description": "extraction is a task for which the prompt p ext is used to guide the llm", + "source_ids": [ + 101 + ], + "id": "Name: extraction\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": 
"prompt", + "entity_type": "SOFTWARE", + "description": "A prompt is a software component consisting of instructions used to guide large language models for specific tasks and is also utilized for entity resolution judgement.", + "source_ids": [ + 284, + 101 + ], + "id": "Name: prompt\nType: SOFTWARE" + }, + { + "entity_name": "selector", + "entity_type": "TECHNOLOGY", + "description": "A selector is an operator or component used to filter or select specific content ranges from a bookindex, and it can also be removed to force reasoners to score all candidate nodes.", + "source_ids": [ + 102, + 167 + ], + "id": "Name: selector\nType: TECHNOLOGY" + }, + { + "entity_name": "filter modal", + "entity_type": "TECHNOLOGY", + "description": "filter modal is an operator that applies explicit constraints to the bookindex", + "source_ids": [ + 102 + ], + "id": "Name: filter modal\nType: TECHNOLOGY" + }, + { + "entity_name": "filter range", + "entity_type": "TECHNOLOGY", + "description": "filter range is an operator that applies explicit constraints to the bookindex", + "source_ids": [ + 102 + ], + "id": "Name: filter range\nType: TECHNOLOGY" + }, + { + "entity_name": "n f", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "n f is the filtered subset of nodes produced by the operators", + "source_ids": [ + 102 + ], + "id": "Name: n f\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "c n", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "c n is a predicate that holds true for each node in the filtered subset", + "source_ids": [ + 102 + ], + "id": "Name: c n\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "modal types", + "entity_type": "CONCEPT", + "description": "modal types are a specific type of explicit constraint c mentioned in the text", + "source_ids": [ + 102 + ], + "id": "Name: modal types\nType: CONCEPT" + }, + { + "entity_name": "page ranges", + "entity_type": "CONCEPT", + "description": "page ranges are a specific type of explicit 
constraint c mentioned in the text", + "source_ids": [ + 102 + ], + "id": "Name: page ranges\nType: CONCEPT" + }, + { + "entity_name": "plan", + "entity_type": "TASK_OR_PROBLEM", + "description": "Plan refers to the planning aspect of the process where errors are analyzed and during which explicit constraints are generated.", + "source_ids": [ + 185, + 102 + ], + "id": "Name: plan\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "nodes", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "nodes are the individual elements within the tree t that are evaluated by the predicate", + "source_ids": [ + 102 + ], + "id": "Name: nodes\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "edges", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "edges are the connections within the tree t denoted as e t", + "source_ids": [ + 102 + ], + "id": "Name: edges\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "formula (4)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the set N_f as a subset of N based on condition C. 
LaTeX: 𝑁 𝑓 = { 𝑛 ∈ 𝑁 | 𝐶 𝑛 ( )} (4)", + "source_ids": [ + 103 + ], + "id": "Name: formula (4)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "select by entity", + "entity_type": "TECHNOLOGY", + "description": "select by entity is a method that targets contiguous document segments by retrieving subtrees rooted at specific section nodes", + "source_ids": [ + 104 + ], + "id": "Name: select by entity\nType: TECHNOLOGY" + }, + { + "entity_name": "select by section", + "entity_type": "TECHNOLOGY", + "description": "select by section is a method that targets contiguous document segments by retrieving subtrees rooted at specific section nodes", + "source_ids": [ + 104 + ], + "id": "Name: select by section\nType: TECHNOLOGY" + }, + { + "entity_name": "s target", + "entity_type": "TASK_OR_PROBLEM", + "description": "s target represents a set of target section nodes at a specified depth", + "source_ids": [ + 104 + ], + "id": "Name: s target\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "e q", + "entity_type": "TASK_OR_PROBLEM", + "description": "e q represents the entities linked to sections via gt link", + "source_ids": [ + 104 + ], + "id": "Name: e q\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "n s", + "entity_type": "TASK_OR_PROBLEM", + "description": "n s represents the selected node set formed by retrieving descendants of target sections", + "source_ids": [ + 104 + ], + "id": "Name: n s\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "subtree", + "entity_type": "TASK_OR_PROBLEM", + "description": "subtree refers to the data structure rooted at specific section nodes that is retrieved by the methods", + "source_ids": [ + 104 + ], + "id": "Name: subtree\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "section node", + "entity_type": "TASK_OR_PROBLEM", + "description": "section node is a specific node within the document structure that serves as a root for subtrees", + "source_ids": [ + 104 + ], + "id": "Name: section node\nType: TASK_OR_PROBLEM" + }, + 
{ + "entity_name": "depth", + "entity_type": "MEASUREMENT", + "description": "depth is a specified parameter determining the level of the target section nodes", + "source_ids": [ + 104 + ], + "id": "Name: depth\nType: MEASUREMENT" + }, + { + "entity_name": "descendant", + "entity_type": "TASK_OR_PROBLEM", + "description": "descendant refers to the nodes below the target section nodes that are retrieved to form the selected node set", + "source_ids": [ + 104 + ], + "id": "Name: descendant\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "formula (5)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the variable N_s as a glyph value for an element s in set S within a target subtree. LaTeX: 𝑁 𝑠 = GLYPH<216> 𝑠 ∈ 𝑆 target Subtree ( 𝑠 ) (5)", + "source_ids": [ + 105 + ], + "id": "Name: formula (5)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "reasoner", + "entity_type": "TASK_OR_PROBLEM", + "description": "reasoner is described as a component that analyzes and refines selected tree nodes", + "source_ids": [ + 106 + ], + "id": "Name: reasoner\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "graph reasoning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "graph reasoning is a method that performs multi hop inference on a subgraph starting from an entity", + "source_ids": [ + 106 + ], + "id": "Name: graph reasoning\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "pagerank algorithm", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "pagerank algorithm is used to compute an entity importance vector over a subgraph", + "source_ids": [ + 106 + ], + "id": "Name: pagerank algorithm\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "gt link matrix", + "entity_type": "SOFTWARE", + "description": "gt link matrix is a matrix used to map entity scores to tree nodes to derive importance scores", + "source_ids": [ + 106 + ], + "id": "Name: gt link matrix\nType: SOFTWARE" + }, + { + "entity_name": "entity importance 
vector", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "entity importance vector is a vector computed over a subgraph representing the importance of entities", + "source_ids": [ + 106 + ], + "id": "Name: entity importance vector\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "tree node importance scores vector", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "tree node importance scores vector is the final vector derived by mapping entity scores to tree nodes", + "source_ids": [ + 106 + ], + "id": "Name: tree node importance scores vector\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "subgraph", + "entity_type": "TASK_OR_PROBLEM", + "description": "subgraph is a portion of a graph extracted from selected nodes on which inference is performed", + "source_ids": [ + 106 + ], + "id": "Name: subgraph\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "entity", + "entity_type": "TASK_OR_PROBLEM", + "description": "Entity serves as the starting point for the multi-hop inference process in graph reasoning and is also the object that the agent attempts to extract during the single-hop process.", + "source_ids": [ + 106, + 115 + ], + "id": "Name: entity\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "selected nodes", + "entity_type": "TASK_OR_PROBLEM", + "description": "selected nodes are the nodes from which a subgraph is extracted for graph reasoning", + "source_ids": [ + 106 + ], + "id": "Name: selected nodes\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "20", + "entity_type": "PUBLICATION_VENUE", + "description": "20 is a citation reference associated with the pagerank algorithm", + "source_ids": [ + 106 + ], + "id": "Name: 20\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "6", + "entity_type": "EQUATION_OR_FORMULA", + "description": "6 is the label for the equation defining the entity importance vector", + "source_ids": [ + 106 + ], + "id": "Name: 6\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "7", + 
"entity_type": "EQUATION_OR_FORMULA", + "description": "7 is the label for the equation defining the tree node importance scores vector", + "source_ids": [ + 106 + ], + "id": "Name: 7\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "selected tree nodes", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 106 + ], + "id": "Name: selected tree nodes\nType: UNKNOWN" + }, + { + "entity_name": "formula (6)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the PageRank of a graph G with respect to a vector e'. LaTeX: 𝐼 𝐺 = PageRank ( 𝐺 , 𝑒 ' ) (6)", + "source_ids": [ + 107 + ], + "id": "Name: formula (6)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "formula (7)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the product of S and G as equal to the product of I, G, and M. LaTeX: 𝑆 𝐺 = 𝐼 𝐺 × 𝑀 (7)", + "source_ids": [ + 108 + ], + "id": "Name: formula (7)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "text ranker", + "entity_type": "SOFTWARE", + "description": "text ranker is a system that evaluates the semantic relevance of a tree node s content to a query", + "source_ids": [ + 109 + ], + "id": "Name: text ranker\nType: SOFTWARE" + }, + { + "entity_name": "skyline ranker", + "entity_type": "SOFTWARE", + "description": "Skyline Ranker is a software system that employs the skyline operator to filter nodes based on multiple criteria and retains a specific number of nodes after analysis. 
It functions as a component that is disabled when the graph reasoning operator is removed, resulting in single-dimensional scoring, and is also disabled when text reasoning is removed, causing it to rely solely on graph-based scores.", + "source_ids": [ + 168, + 169, + 109, + 157 + ], + "id": "Name: skyline ranker\nType: SOFTWARE" + }, + { + "entity_name": "skyline operator", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the skyline operator is a method used by skyline ranker to filter nodes based on scoring dimensions", + "source_ids": [ + 109 + ], + "id": "Name: skyline operator\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "query", + "entity_type": "TASK_OR_PROBLEM", + "description": "The query serves as the input for which semantic relevance is evaluated by the text ranker. It is also the input that the agent classifies into a category to generate a plan, a process handled by the agent based planning component. Additionally, a query is a specific question for which retrieval recall is recorded, particularly when PDF parsing errors occur.", + "source_ids": [ + 112, + 109, + 144, + 157 + ], + "id": "Name: query\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "relevance score", + "entity_type": "EVALUATION_METRIC", + "description": "the relevance score is a metric assigned to each node to indicate its semantic relevance to the query", + "source_ids": [ + 109 + ], + "id": "Name: relevance score\nType: EVALUATION_METRIC" + }, + { + "entity_name": "tree node", + "entity_type": "TASK_OR_PROBLEM", + "description": "the tree node is the content unit being evaluated for relevance and filtered based on scoring dimensions", + "source_ids": [ + 109 + ], + "id": "Name: tree node\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "nodes", + "entity_type": "TASK_OR_PROBLEM", + "description": "nodes are the data elements being evaluated for relevance and filtered by the ranking systems", + "source_ids": [ + 109 + ], + "id": "Name: nodes\nType: 
TASK_OR_PROBLEM" + }, + { + "entity_name": "scoring dimensions", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "scoring dimensions are the specified criteria used to determine if nodes are dominated by others", + "source_ids": [ + 109 + ], + "id": "Name: scoring dimensions\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "7", + "entity_type": "NUMBER", + "description": "7 is a number mentioned in the text", + "source_ids": [ + 110 + ], + "id": "Name: 7\nType: NUMBER" + }, + { + "entity_name": "synthesizer", + "entity_type": "TASK_OR_PROBLEM", + "description": "synthesizer is described as an operator responsible for content generation", + "source_ids": [ + 111 + ], + "id": "Name: synthesizer\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "content generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "content generation is the primary responsibility of the synthesizer operators mentioned in the text", + "source_ids": [ + 111 + ], + "id": "Name: content generation\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "analysis", + "entity_type": "TASK_OR_PROBLEM", + "description": "analysis is the specific action performed by the map operator on retrieved information segments", + "source_ids": [ + 111 + ], + "id": "Name: analysis\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "partial responses", + "entity_type": "PRODUCT", + "description": "partial responses are the output generated by the map operator from specific retrieved information segments", + "source_ids": [ + 111 + ], + "id": "Name: partial responses\nType: PRODUCT" + }, + { + "entity_name": "final coherent answer", + "entity_type": "PRODUCT", + "description": "a final coherent answer is the result synthesized by the reduce operator by aggregating information from multiple sources", + "source_ids": [ + 111 + ], + "id": "Name: final coherent answer\nType: PRODUCT" + }, + { + "entity_name": "retrieved information segments", + "entity_type": "DATASET_OR_CORPUS", + 
"description": "retrieved information segments are the specific data parts that the map operator analyzes", + "source_ids": [ + 111 + ], + "id": "Name: retrieved information segments\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "multiple sources", + "entity_type": "DATASET_OR_CORPUS", + "description": "multiple sources refer to the various origins of information such as partial answers or retrieved evidence that the reduce operator aggregates", + "source_ids": [ + 111 + ], + "id": "Name: multiple sources\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "partial answers", + "entity_type": "PRODUCT", + "description": "partial answers are one of the types of information collected from multiple sources by the reduce operator", + "source_ids": [ + 111 + ], + "id": "Name: partial answers\nType: PRODUCT" + }, + { + "entity_name": "retrieved evidence", + "entity_type": "DATASET_OR_CORPUS", + "description": "retrieved evidence is one of the types of information collected from multiple sources by the reduce operator", + "source_ids": [ + 111 + ], + "id": "Name: retrieved evidence\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "agent", + "entity_type": "PERSON", + "description": "The agent is an entity that classifies queries, generates executable plans and workflows, attempts to extract entities, executes selection strategies, and performs the decomposition of problems along with the synthesis of results.", + "source_ids": [ + 112, + 115, + 118, + 135 + ], + "id": "Name: agent\nType: PERSON" + }, + { + "entity_name": "category", + "entity_type": "TASK_OR_PROBLEM", + "description": "the category is the classification result of the query used by the agent", + "source_ids": [ + 112 + ], + "id": "Name: category\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "library", + "entity_type": "ORGANIZATION", + "description": "the library is a collection of operators from which the agent selects a sequence", + "source_ids": [ + 112 + ], + "id": "Name: library\nType: 
ORGANIZATION" + }, + { + "entity_name": "operators", + "entity_type": "TASK_OR_PROBLEM", + "description": "operators are the specific sequence elements selected from the library to form the plan", + "source_ids": [ + 112 + ], + "id": "Name: operators\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "parameters", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "parameters are dynamically instantiated based on the query to configure the operators", + "source_ids": [ + 112 + ], + "id": "Name: parameters\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "1", + "entity_type": "TASK_OR_PROBLEM", + "description": "1 represents the specific sequence of operators selected for the plan", + "source_ids": [ + 112 + ], + "id": "Name: 1\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "agent plan", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "agent plan is the specific formulation or function used to generate the plan from the query category and library", + "source_ids": [ + 112 + ], + "id": "Name: agent plan\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "equation 8", + "entity_type": "EQUATION_OR_FORMULA", + "description": "equation 8 is the mathematical formulation agent plan describing the plan generation process", + "source_ids": [ + 112 + ], + "id": "Name: equation 8\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "formula (8)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the variable P as a function of Agent Plan with inputs q, c, and O. 
LaTeX: 𝑃 = Agent Plan ( 𝑞, 𝑐, O) (8)", + "source_ids": [ + 113 + ], + "id": "Name: formula (8)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "the plan", + "entity_type": "TASK_OR_PROBLEM", + "description": "the plan is a structured workflow tailored to each category", + "source_ids": [ + 114 + ], + "id": "Name: the plan\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "workflow", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the workflow is a structured process followed by the plan", + "source_ids": [ + 114 + ], + "id": "Name: workflow\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "category", + "entity_type": "CONCEPT", + "description": "category refers to the classifications to which the plan s workflow is tailored", + "source_ids": [ + 114 + ], + "id": "Name: category\nType: CONCEPT" + }, + { + "entity_name": "scent based", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "scent based is a selection strategy used by the agent if entity extraction is successful", + "source_ids": [ + 115 + ], + "id": "Name: scent based\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "section based", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "section based is a fallback strategy used by the agent if entity extraction fails", + "source_ids": [ + 115 + ], + "id": "Name: section based\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "standard reasoning", + "entity_type": "TASK_OR_PROBLEM", + "description": "standard reasoning is a process that both selection paths proceed to", + "source_ids": [ + 115 + ], + "id": "Name: standard reasoning\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "p std", + "entity_type": "EQUATION_OR_FORMULA", + "description": "p std denotes the standard reasoning and generation process", + "source_ids": [ + 115 + ], + "id": "Name: p std\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "formula (9)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the variable 
P_s based on extraction success or failure conditions. LaTeX: 𝑃 s = ( Extract success - - - - -→ Select_by_Entity → 𝑃 std Extract fail - -→ Select_by_Section → 𝑃 std (9)", + "source_ids": [ + 116 + ], + "id": "Name: formula (9)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "formula (10)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the standard probability P as a process involving graph and text inputs leading to a skyline reduction. LaTeX: 𝑃 std = ( Graph ∥ Text ) → Skyline → Reduce (10)", + "source_ids": [ + 117 + ], + "id": "Name: formula (10)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "single hop workflow", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "single hop workflow is a method denoted as ps used to solve sub problems", + "source_ids": [ + 118 + ], + "id": "Name: single hop workflow\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "ps", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "ps is the specific notation or identifier for the single hop workflow applied to sub problems", + "source_ids": [ + 118 + ], + "id": "Name: ps\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "complex", + "entity_type": "TASK_OR_PROBLEM", + "description": "Complex is one of the three categories used to classify user questions and refers to a problem that is decomposed by the agent.", + "source_ids": [ + 241, + 118 + ], + "id": "Name: complex\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "formula (11)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation describing a decomposition process involving mapping and reduction. 
LaTeX: 𝑃 complex = Decompose → 𝑃 s → Map → Reduce (11)", + "source_ids": [ + 119 + ], + "id": "Name: formula (11)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "global aggregation", + "entity_type": "TASK_OR_PROBLEM", + "description": "Global aggregation is a workflow involving a sequence of filters followed by synthesis, and it is also a type of query used to evaluate the performance of BookRag.", + "source_ids": [ + 120, + 179 + ], + "id": "Name: global aggregation\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "formula (12)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the global probability P as a composition of filtering and mapping operations. LaTeX: 𝑃 global = GLYPH<214> ( Filter_Modal | Filter_Range ) → Map → Reduce (12)", + "source_ids": [ + 121 + ], + "id": "Name: formula (12)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "modal filter", + "entity_type": "TECHNOLOGY", + "description": "modal filter is a type of filter applied at each step of the nested composition", + "source_ids": [ + 122 + ], + "id": "Name: modal filter\nType: TECHNOLOGY" + }, + { + "entity_name": "range filter", + "entity_type": "TECHNOLOGY", + "description": "range filter is a type of filter applied at each step of the nested composition", + "source_ids": [ + 122 + ], + "id": "Name: range filter\nType: TECHNOLOGY" + }, + { + "entity_name": "nested composition", + "entity_type": "TASK_OR_PROBLEM", + "description": "nested composition refers to the process of applying filters at each step", + "source_ids": [ + 122 + ], + "id": "Name: nested composition\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "5.3 structured execution", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Agent-Based Retrieval', this section details the retrieval process within the BookRAG framework, specifically focusing on operations executed under the principles of In-Context Few-Shot Training (IFT) and generation.", + "source_ids": [ + 
123 + ], + "id": "Name: 5.3 structured execution\nType: SECTION_TITLE" + }, + { + "entity_name": "ift principles", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Refers to the In-Context Few-Shot Training principles that guide the execution logic detailed in section 5.3.", + "source_ids": [ + 123 + ], + "id": "Name: ift principles\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "synthesizer", + "entity_type": "SOFTWARE", + "description": "The synthesizer is an operator within BookRag that generates a coherent final answer by aggregating and processing refined evidence.", + "source_ids": [ + 129, + 124 + ], + "id": "Name: synthesizer\nType: SOFTWARE" + }, + { + "entity_name": "p", + "entity_type": "TASK_OR_PROBLEM", + "description": "p represents the specific generated workflow executed by bookrag", + "source_ids": [ + 124 + ], + "id": "Name: p\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "abstract textual queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "abstract textual queries are the input that bookrag translates into concrete operations", + "source_ids": [ + 124 + ], + "id": "Name: abstract textual queries\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "concrete operations", + "entity_type": "TASK_OR_PROBLEM", + "description": "concrete operations are the result of translating abstract textual queries within bookrag", + "source_ids": [ + 124 + ], + "id": "Name: concrete operations\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "information patches", + "entity_type": "TASK_OR_PROBLEM", + "description": "information patches are specific scopes within the document space that the selector navigates to", + "source_ids": [ + 124 + ], + "id": "Name: information patches\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "document space", + "entity_type": "TASK_OR_PROBLEM", + "description": "document space is the vast area of documents that is narrowed down by the selector", + "source_ids": [ + 124 + ], + "id": "Name: document 
space\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "relevant scopes", + "entity_type": "TASK_OR_PROBLEM", + "description": "relevant scopes are the focused areas within the document space identified by the selector", + "source_ids": [ + 124 + ], + "id": "Name: relevant scopes\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "sensemaking", + "entity_type": "TASK_OR_PROBLEM", + "description": "sensemaking is the process performed by the reasoner to analyze and refine information", + "source_ids": [ + 124 + ], + "id": "Name: sensemaking\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "processed evidence", + "entity_type": "TASK_OR_PROBLEM", + "description": "processed evidence is the refined information used by the synthesizer to generate the answer", + "source_ids": [ + 124 + ], + "id": "Name: processed evidence\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "cost of attention", + "entity_type": "TASK_OR_PROBLEM", + "description": "cost of attention is a metric minimized by bookrag s design to focus computational resources", + "source_ids": [ + 124 + ], + "id": "Name: cost of attention\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "computational resources", + "entity_type": "TASK_OR_PROBLEM", + "description": "computational resources are the assets focused by bookrag on high value data patches", + "source_ids": [ + 124 + ], + "id": "Name: computational resources\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "high value data patches", + "entity_type": "TASK_OR_PROBLEM", + "description": "high value data patches are the specific data areas where bookrag focuses its computational resources", + "source_ids": [ + 124 + ], + "id": "Name: high value data patches\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "scent filter based retrieval", + "entity_type": "TASK_OR_PROBLEM", + "description": "scent filter based retrieval is a process described as the execution that begins by narrowing the scope", + "source_ids": [ + 125 + ], + "id": "Name: scent filter 
based retrieval\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "selector operators", + "entity_type": "SOFTWARE", + "description": "selector operators are components that identify relevant patches by following information scents or applying explicit filter constraints", + "source_ids": [ + 125 + ], + "id": "Name: selector operators\nType: SOFTWARE" + }, + { + "entity_name": "node set n", + "entity_type": "DATASET_OR_CORPUS", + "description": "node set n represents the full set of nodes that is reduced by the process", + "source_ids": [ + 125 + ], + "id": "Name: node set n\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "focused node subset ns", + "entity_type": "DATASET_OR_CORPUS", + "description": "focused node subset ns is the result of the reduction process applied to the full node set n", + "source_ids": [ + 125 + ], + "id": "Name: focused node subset ns\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "params sel", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "params sel are parameters used in the selector function to define the focused node subset", + "source_ids": [ + 125 + ], + "id": "Name: params sel\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "patches", + "entity_type": "PRODUCT", + "description": "patches are relevant units identified by selector operators within the retrieval process", + "source_ids": [ + 125 + ], + "id": "Name: patches\nType: PRODUCT" + }, + { + "entity_name": "explicit filter constraints", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "explicit filter constraints are rules applied by selector operators to identify relevant patches", + "source_ids": [ + 125 + ], + "id": "Name: explicit filter constraints\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "equation 13", + "entity_type": "EQUATION_OR_FORMULA", + "description": "equation 13 defines the mathematical relationship for the selector function reducing the node set", + "source_ids": [ + 125 + ], + "id": "Name: equation 13\nType: 
EQUATION_OR_FORMULA" + }, + { + "entity_name": "formula (13)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the variable Ns as a selector function applied to N and parameters. LaTeX: 𝑁 𝑠 = Selector ( 𝑁, params sel ) (13)", + "source_ids": [ + 126 + ], + "id": "Name: formula (13)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "reasoner operators", + "entity_type": "TASK_OR_PROBLEM", + "description": "reasoner operators are components that evaluate nodes using multiple dimensions such as graph topology and semantic relevance", + "source_ids": [ + 127 + ], + "id": "Name: reasoner operators\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "skyline ranker", + "entity_type": "TASK_OR_PROBLEM", + "description": "skyline ranker is a method employed to obtain the final retrieval set by retaining the pareto frontier of nodes", + "source_ids": [ + 127 + ], + "id": "Name: skyline ranker\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "skyline operator", + "entity_type": "TASK_OR_PROBLEM", + "description": "the skyline operator is a mechanism that retains valuable nodes in at least one dimension while discarding dominated ones", + "source_ids": [ + 127 + ], + "id": "Name: skyline operator\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "n r", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "n r represents the final retrieval set derived from the skyline ranker process", + "source_ids": [ + 127 + ], + "id": "Name: n r\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "s g n s", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "s g n s is a function or metric used within the skyline ranker equation to evaluate nodes", + "source_ids": [ + 127 + ], + "id": "Name: s g n s\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "t n", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "t n is a function or metric used within the skyline ranker equation to evaluate nodes", + "source_ids": [ + 127 + ], + 
"id": "Name: t n\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "n s", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "n s represents the set of nodes from which the final retrieval set is derived", + "source_ids": [ + 127 + ], + "id": "Name: n s\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "equation 14", + "entity_type": "EQUATION_OR_FORMULA", + "description": "equation 14 defines the mathematical relationship for calculating the final retrieval set n r using the skyline ranker", + "source_ids": [ + 127 + ], + "id": "Name: equation 14\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "graph topology", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "graph topology is a dimension used by reasoner operators to evaluate nodes", + "source_ids": [ + 127 + ], + "id": "Name: graph topology\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "semantic relevance", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "semantic relevance is a dimension used by reasoner operators to evaluate nodes", + "source_ids": [ + 127 + ], + "id": "Name: semantic relevance\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "pareto frontier", + "entity_type": "CONCEPT", + "description": "the pareto frontier is the set of nodes retained by the skyline operator that are valuable in at least one dimension", + "source_ids": [ + 127 + ], + "id": "Name: pareto frontier\nType: CONCEPT" + }, + { + "entity_name": "fixed top retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "fixed top retrieval is a method contrasted with the skyline operator for its inability to retain the pareto frontier", + "source_ids": [ + 127 + ], + "id": "Name: fixed top retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "noise", + "entity_type": "CONCEPT", + "description": "noise is a factor minimized by the pre selection process to optimize foraging cost", + "source_ids": [ + 127 + ], + "id": "Name: noise\nType: CONCEPT" + }, + { 
+ "entity_name": "foraging cost", + "entity_type": "MEASUREMENT", + "description": "foraging cost is the metric optimized by minimizing noise and focusing on relevant contexts", + "source_ids": [ + 127 + ], + "id": "Name: foraging cost\nType: MEASUREMENT" + }, + { + "entity_name": "pre selection", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "pre selection is a process that minimizes noise and ensures reasoning is applied only to highly relevant contexts", + "source_ids": [ + 127 + ], + "id": "Name: pre selection\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "nodes", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 127 + ], + "id": "Name: nodes\nType: UNKNOWN" + }, + { + "entity_name": "final retrieval set", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 127 + ], + "id": "Name: final retrieval set\nType: UNKNOWN" + }, + { + "entity_name": "formula (14)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the variable NR as a Skyline Ranker applied to a set of SG and T values. LaTeX: 𝑁 𝑅 = Skyline_Ranker ({ 𝑆 𝐺 ( 𝑛 , 𝑆 ) 𝑇 ( 𝑛 ) | 𝑛 ∈ 𝑁 𝑠 }) (14)", + "source_ids": [ + 128 + ], + "id": "Name: formula (14)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "analysis merging generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "analysis merging generation is described as the final stage of a process involving the synthesizer operator", + "source_ids": [ + 129 + ], + "id": "Name: analysis merging generation\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "15", + "entity_type": "EQUATION_OR_FORMULA", + "description": "15 is the label or identifier for the equation describing the synthesizer operator s function", + "source_ids": [ + 129 + ], + "id": "Name: 15\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "formula (15)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the variable A as the output of a Synthesizer function. 
LaTeX: 𝐴 = Synthesizer ( 𝑞, 𝑁 𝑅 ) (15)", + "source_ids": [ + 130 + ], + "id": "Name: formula (15)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "table: cref='#/texts/136'...", + "entity_type": "TABLE", + "description": "A data table described as: cref='#/texts/136'", + "source_ids": [ + 132 + ], + "id": "Name: table: cref='#/texts/136'...\nType: TABLE" + }, + { + "entity_name": "cref", + "entity_type": "EQUATION_OR_FORMULA", + "description": "A cross-reference identifier or formula string found in the description, pointing to a specific text location ('#/texts/136').", + "source_ids": [ + 132 + ], + "id": "Name: cref\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "8", + "entity_type": "MEASUREMENT", + "description": "8 is a numerical value mentioned in the text likely representing a count or identifier", + "source_ids": [ + 133 + ], + "id": "Name: 8\nType: MEASUREMENT" + }, + { + "entity_name": "map operator", + "entity_type": "TASK_OR_PROBLEM", + "description": "the map operator is a component that performs fine grained analysis on individual evidence blocks or sub problems", + "source_ids": [ + 134 + ], + "id": "Name: map operator\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "decompose", + "entity_type": "TASK_OR_PROBLEM", + "description": "decompose is a process that generates sub problems which are analyzed by the map operator", + "source_ids": [ + 134 + ], + "id": "Name: decompose\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "reduce operator", + "entity_type": "TASK_OR_PROBLEM", + "description": "the reduce operator is a component that aggregates partial results to construct the final response", + "source_ids": [ + 134 + ], + "id": "Name: reduce operator\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "global filter", + "entity_type": "TASK_OR_PROBLEM", + "description": "the global filter is a mechanism used to generate statistical counts as partial results", + "source_ids": [ + 134 + ], + "id": "Name: global filter\nType: 
TASK_OR_PROBLEM" + }, + { + "entity_name": "evidence blocks", + "entity_type": "TASK_OR_PROBLEM", + "description": "evidence blocks are the individual units of content that the map operator analyzes", + "source_ids": [ + 134 + ], + "id": "Name: evidence blocks\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "sub problems", + "entity_type": "TASK_OR_PROBLEM", + "description": "sub problems are specific issues derived from decompose that are analyzed by the map operator", + "source_ids": [ + 134 + ], + "id": "Name: sub problems\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "intermediate insights", + "entity_type": "TASK_OR_PROBLEM", + "description": "intermediate insights are the outputs generated by the map operator during its analysis", + "source_ids": [ + 134 + ], + "id": "Name: intermediate insights\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "partial results", + "entity_type": "TASK_OR_PROBLEM", + "description": "partial results are the outputs from the map operator that are aggregated by the reduce operator", + "source_ids": [ + 134 + ], + "id": "Name: partial results\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "answers to decomposed sub queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "answers to decomposed sub queries are a type of partial result aggregated by the reduce operator", + "source_ids": [ + 134 + ], + "id": "Name: answers to decomposed sub queries\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "statistical counts", + "entity_type": "TASK_OR_PROBLEM", + "description": "statistical counts are a type of partial result derived from a global filter and aggregated by the reduce operator", + "source_ids": [ + 134 + ], + "id": "Name: statistical counts\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "final response", + "entity_type": "TASK_OR_PROBLEM", + "description": "the final response is the constructed output created by the reduce operator", + "source_ids": [ + 134 + ], + "id": "Name: final response\nType: 
TASK_OR_PROBLEM" + }, + { + "entity_name": "detailed content extraction", + "entity_type": "TASK_OR_PROBLEM", + "description": "detailed content extraction is a capability handled by the system s separation of map and reduce operators", + "source_ids": [ + 134 + ], + "id": "Name: detailed content extraction\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "high level reasoning synthesis", + "entity_type": "TASK_OR_PROBLEM", + "description": "high level reasoning synthesis is a capability handled by the system s separation of map and reduce operators", + "source_ids": [ + 134 + ], + "id": "Name: high level reasoning synthesis\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "figure 4 b", + "entity_type": "IMAGE", + "description": "figure 4 b is an image presenting an execution trace for a single hop query", + "source_ids": [ + 135 + ], + "id": "Name: figure 4 b\nType: IMAGE" + }, + { + "entity_name": "ranking prompt example", + "entity_type": "TASK_OR_PROBLEM", + "description": "ranking prompt example is a specific example context mentioned in the query regarding the type of car", + "source_ids": [ + 135 + ], + "id": "Name: ranking prompt example\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "select by entity", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "select by entity is a method used to retrieve relevant nodes after entity identification", + "source_ids": [ + 135 + ], + "id": "Name: select by entity\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "skyline filtering", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "skyline filtering is a technique used to refine nodes during the process", + "source_ids": [ + 135 + ], + "id": "Name: skyline filtering\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "reduce", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "reduce is a method used to synthesize the final answer", + "source_ids": [ + 135 + ], + "id": "Name: reduce\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": 
"planning phase", + "entity_type": "TASK_OR_PROBLEM", + "description": "the planning phase is the initial stage where the agent classifies the query and generates a workflow", + "source_ids": [ + 135 + ], + "id": "Name: planning phase\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "reasoning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Reasoning is a cognitive process of drawing conclusions that is identified as a challenge in the text and is also used as a step to refine nodes in the process.", + "source_ids": [ + 179, + 135 + ], + "id": "Name: reasoning\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "6 experiments", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of the paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section details the empirical validation of the proposed BookRAG method, including experimental setup, benchmarks used, and performance results compared to baselines.", + "source_ids": [ + 136 + ], + "id": "Name: 6 experiments\nType: SECTION_TITLE" + }, + { + "entity_name": "experiments", + "entity_type": "TASK_OR_PROBLEM", + "description": "The experiments refer to the systematic computational procedures and evaluations conducted to validate the effectiveness of the BookRAG approach, as described in section 6, and represent the activities for which the datasets in table 4 were used.", + "source_ids": [ + 136, + 139 + ], + "id": "Name: experiments\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "baseline methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "baseline methods are the strong existing approaches used for comparison against bookrag in the experiments", + "source_ids": [ + 137 + ], + "id": "Name: baseline methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "accuracy", + "entity_type": "EVALUATION_METRIC", + "description": "Accuracy is a primary evaluation metric used to assess performance 
across various contexts, including the evaluation of bookrag and baseline methods, as well as the w o selector variant despite its high computational costs. In the Qasper dataset, it is represented by blue bars and is often contrasted with exact match, implying it is a less strict measure. Specific definitions include the proportion of cases where the set of named entities in a model's response is a subset of those in the ground truth, as well as a calculation of correctness based on whether the normalized ground truth is a substring of the normalized raw response.", + "source_ids": [ + 226, + 229, + 137, + 172, + 144, + 177, + 221 + ], + "id": "Name: accuracy\nType: EVALUATION_METRIC" + }, + { + "entity_name": "6.1 setup", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Experiments' within the BookRAG paper, this section details the experimental configuration, including baseline methods, evaluation metrics (efficiency and accuracy), and the document QA tasks used to assess the proposed approach.", + "source_ids": [ + 138 + ], + "id": "Name: 6.1 setup\nType: SECTION_TITLE" + }, + { + "entity_name": "table 4", + "entity_type": "TABLE", + "description": "Table 4 is a table that lists the datasets used in experiments and presents the statistics of those mentioned datasets.", + "source_ids": [ + 139, + 141 + ], + "id": "Name: table 4\nType: TABLE" + }, + { + "entity_name": "em", + "entity_type": "EVALUATION_METRIC", + "description": "em, short for exact match, is an evaluation metric used in experiments to measure question answering performance.", + "source_ids": [ + 170, + 139 + ], + "id": "Name: em\nType: EVALUATION_METRIC" + }, + { + "entity_name": "f1", + "entity_type": "EVALUATION_METRIC", + "description": "F1 denotes the F1 score, an evaluation metric used in experiments to measure question answering performance.", + "source_ids": [ + 170, + 139 + ], + "id": "Name: f1\nType: EVALUATION_METRIC" + }, + { + "entity_name": "exact match", + 
"entity_type": "EVALUATION_METRIC", + "description": "Exact match, abbreviated as EM, is a strict evaluation metric used to assess performance in text-based tasks by measuring whether the normalized extracted answer is character-for-character identical to the ground truth. It serves as a primary metric for evaluating performance, including comparisons of BookRAG against baselines, and is represented by blue bars in the MMLongBench benchmark.", + "source_ids": [ + 229, + 170, + 139, + 144, + 177, + 152 + ], + "id": "Name: exact match\nType: EVALUATION_METRIC" + }, + { + "entity_name": "f1 score", + "entity_type": "EVALUATION_METRIC", + "description": "The F1 score is an evaluation metric used to measure the performance of text span answers by comparing extracted answers to ground truth, and it is often represented by red bars.", + "source_ids": [ + 177, + 170, + 139, + 231 + ], + "id": "Name: f1 score\nType: EVALUATION_METRIC" + }, + { + "entity_name": "datasets", + "entity_type": "DATASET_OR_CORPUS", + "description": "Datasets refer to the collection of data used in experiments, serving as various collections to evaluate the performance of methods, including the gradient-based ER method.", + "source_ids": [ + 176, + 153, + 139 + ], + "id": "Name: datasets\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "our", + "entity_type": "ORGANIZATION", + "description": "our refers to the research group or team conducting the experiments mentioned in the text", + "source_ids": [ + 139 + ], + "id": "Name: our\nType: ORGANIZATION" + }, + { + "entity_name": "table: cref='#/texts/143'...", + "entity_type": "TABLE", + "description": "A data table described as: cref='#/texts/143'", + "source_ids": [ + 140 + ], + "id": "Name: table: cref='#/texts/143'...\nType: TABLE" + }, + { + "entity_name": "texts/143", + "entity_type": "SECTION_TITLE", + "description": "A reference identifier extracted from the description string 'cref='#/texts/143'', likely pointing to a specific section or 
text element within a document structure.", + "source_ids": [ + 140 + ], + "id": "Name: texts/143\nType: SECTION_TITLE" + }, + { + "entity_name": "mmlongbench", + "entity_type": "DATASET_OR_CORPUS", + "description": "mmlongbench is a comprehensive benchmark and dataset designed to evaluate question-answering capabilities on long-form documents, providing page numbers for filtering candidate blocks and measuring exact match performance. It is utilized to assess the token consumption of systems like DocETL and BookRAG, generate query types for case studies, and support global aggregation analyses. Additionally, the dataset served as the basis for sampling 200 queries for error analysis and was featured in comparative evaluations and charts.", + "source_ids": [ + 160, + 141, + 175, + 144, + 177, + 181, + 182, + 183, + 159 + ], + "id": "Name: mmlongbench\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "m3docvqa", + "entity_type": "DATASET_OR_CORPUS", + "description": "m3docvqa is an open-domain benchmark dataset designed to test retrieval-augmented generation (RAG) systems on HTML-type documents from Wikipedia. It is specifically used to evaluate the exact match performance and retrieval recall of the BookRAG system, serving as the second dataset in comparative evaluations.", + "source_ids": [ + 152, + 141, + 157, + 159 + ], + "id": "Name: m3docvqa\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "qasper", + "entity_type": "DATASET_OR_CORPUS", + "description": "Qasper is a question-answering dataset focused on scientific papers that requires evidence retrieval from the entire document and provides evidence statements for filtering candidate blocks. It is used to measure computational cost in tokens, evaluate accuracy performance, and generate query types in case studies. 
Additionally, Qasper serves as the source dataset for single-hop and multi-hop case studies, and 200 sampled queries were taken from it for error analysis.", + "source_ids": [ + 172, + 141, + 175, + 144, + 177, + 181, + 182, + 183, + 159 + ], + "id": "Name: qasper\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "human annotators", + "entity_type": "PERSON", + "description": "human annotators are individuals who answer and refine the synthesized qa pairs", + "source_ids": [ + 141 + ], + "id": "Name: human annotators\nType: PERSON" + }, + { + "entity_name": "20", + "entity_type": "PERCENTAGE", + "description": "20 represents the proportion of the final QA pairs that are synthesized additional pairs and is also the percentage increase in graph density achieved by the gradient-based ER method across datasets.", + "source_ids": [ + 176, + 141 + ], + "id": "Name: 20\nType: PERCENTAGE" + }, + { + "entity_name": "html type documents", + "entity_type": "PRODUCT", + "description": "html type documents are the source material for the m3docvqa benchmark", + "source_ids": [ + 141 + ], + "id": "Name: html type documents\nType: PRODUCT" + }, + { + "entity_name": "wikipedia pages", + "entity_type": "LOCATION", + "description": "wikipedia pages are the specific source of the html type documents used in m3docvqa", + "source_ids": [ + 141 + ], + "id": "Name: wikipedia pages\nType: LOCATION" + }, + { + "entity_name": "guidebooks", + "entity_type": "PRODUCT", + "description": "guidebooks are one of the diverse categories of long form documents covered by mmlongbench", + "source_ids": [ + 141 + ], + "id": "Name: guidebooks\nType: PRODUCT" + }, + { + "entity_name": "financial reports", + "entity_type": "PRODUCT", + "description": "financial reports are one of the diverse categories of long form documents covered by mmlongbench", + "source_ids": [ + 141 + ], + "id": "Name: financial reports\nType: PRODUCT" + }, + { + "entity_name": "industry files", + "entity_type": "PRODUCT", + 
"description": "industry files are one of the diverse categories of long form documents covered by mmlongbench", + "source_ids": [ + 141 + ], + "id": "Name: industry files\nType: PRODUCT" + }, + { + "entity_name": "scientific papers", + "entity_type": "PRODUCT", + "description": "scientific papers are the focus of the qasper dataset", + "source_ids": [ + 141 + ], + "id": "Name: scientific papers\nType: PRODUCT" + }, + { + "entity_name": "figures", + "entity_type": "IMAGE", + "description": "Figures are visual elements mentioned in example queries regarding counting and serve as document elements from which the LLM generates global questions.", + "source_ids": [ + 258, + 141 + ], + "id": "Name: figures\nType: IMAGE" + }, + { + "entity_name": "global level questions", + "entity_type": "TASK_OR_PROBLEM", + "description": "global level questions are the specific type of questions synthesized to address scarcity in original benchmarks", + "source_ids": [ + 141 + ], + "id": "Name: global level questions\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "qa pairs", + "entity_type": "TASK_OR_PROBLEM", + "description": "qa pairs are the output units generated by the llm and refined by human annotators", + "source_ids": [ + 141 + ], + "id": "Name: qa pairs\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "rag systems", + "entity_type": "SOFTWARE", + "description": "rag systems are the target systems tested by the m3docvqa benchmark", + "source_ids": [ + 141 + ], + "id": "Name: rag systems\nType: SOFTWARE" + }, + { + "entity_name": "wikipedia", + "entity_type": "ORGANIZATION", + "description": "wikipedia is an organization associated with the url provided in the text", + "source_ids": [ + 142 + ], + "id": "Name: wikipedia\nType: ORGANIZATION" + }, + { + "entity_name": "https www wikipedia org", + "entity_type": "LOCATION", + "description": "https www wikipedia org is a web address mentioned in the text", + "source_ids": [ + 142 + ], + "id": "Name: https www wikipedia 
org\nType: LOCATION" + }, + { + "entity_name": "token based f1 score", + "entity_type": "EVALUATION_METRIC", + "description": "token based f1 score is a primary evaluation metric used to assess performance in the text", + "source_ids": [ + 144 + ], + "id": "Name: token based f1 score\nType: EVALUATION_METRIC" + }, + { + "entity_name": "time cost", + "entity_type": "EVALUATION_METRIC", + "description": "time cost is a metric used to assess efficiency during the response phase", + "source_ids": [ + 144 + ], + "id": "Name: time cost\nType: EVALUATION_METRIC" + }, + { + "entity_name": "token usage", + "entity_type": "EVALUATION_METRIC", + "description": "token usage is a metric used to assess efficiency during the response phase", + "source_ids": [ + 144 + ], + "id": "Name: token usage\nType: EVALUATION_METRIC" + }, + { + "entity_name": "pdf parsing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "pdf parsing is a method mentioned in the text that is evaluated using retrieval recall", + "source_ids": [ + 144 + ], + "id": "Name: pdf parsing\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "texts", + "entity_type": "TABLE", + "description": "texts are specific pdf blocks labeled to establish ground truth", + "source_ids": [ + 144 + ], + "id": "Name: texts\nType: TABLE" + }, + { + "entity_name": "titles", + "entity_type": "TABLE", + "description": "titles are specific pdf blocks labeled to establish ground truth", + "source_ids": [ + 144 + ], + "id": "Name: titles\nType: TABLE" + }, + { + "entity_name": "images", + "entity_type": "TABLE", + "description": "images are specific pdf blocks labeled to establish ground truth", + "source_ids": [ + 144 + ], + "id": "Name: images\nType: TABLE" + }, + { + "entity_name": "formulas", + "entity_type": "TABLE", + "description": "formulas are specific pdf blocks labeled to establish ground truth", + "source_ids": [ + 144 + ], + "id": "Name: formulas\nType: TABLE" + }, + { + "entity_name": "ground truth", + 
"entity_type": "CONCEPT", + "description": "Ground truth is the established standard used to evaluate retrieval recall and guide manual labeling, referring to the correct or expected answer used as a benchmark for evaluation.", + "source_ids": [ + 144, + 224 + ], + "id": "Name: ground truth\nType: CONCEPT" + }, + { + "entity_name": "metadata", + "entity_type": "CONCEPT", + "description": "metadata refers to the ground truth evidence information provided in each dataset that guides the labeling process", + "source_ids": [ + 144 + ], + "id": "Name: metadata\nType: CONCEPT" + }, + { + "entity_name": "modality", + "entity_type": "CONCEPT", + "description": "modality is a given attribute used to filter candidate blocks across all datasets", + "source_ids": [ + 144 + ], + "id": "Name: modality\nType: CONCEPT" + }, + { + "entity_name": "pdf blocks", + "entity_type": "TABLE", + "description": "pdf blocks are the specific units of content texts titles tables images formulas that are manually labeled", + "source_ids": [ + 144 + ], + "id": "Name: pdf blocks\nType: TABLE" + }, + { + "entity_name": "candidate blocks", + "entity_type": "TABLE", + "description": "candidate blocks are the set of blocks filtered using modality page numbers and evidence statements before manual annotation", + "source_ids": [ + 144 + ], + "id": "Name: candidate blocks\nType: TABLE" + }, + { + "entity_name": "response phase", + "entity_type": "TIME", + "description": "the response phase is the specific time period during which time cost and token usage are measured", + "source_ids": [ + 144 + ], + "id": "Name: response phase\nType: TIME" + }, + { + "entity_name": "page numbers", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 144 + ], + "id": "Name: page numbers\nType: UNKNOWN" + }, + { + "entity_name": "evidence statements", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 144 + ], + "id": "Name: evidence statements\nType: UNKNOWN" + }, + { + "entity_name": 
"baselines", + "entity_type": "TASK_OR_PROBLEM", + "description": "baselines refer to the standard configurations used for comparison in the experiments", + "source_ids": [ + 145 + ], + "id": "Name: baselines\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "three model configurations", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "three model configurations are the specific experimental setups considered in the study", + "source_ids": [ + 145 + ], + "id": "Name: three model configurations\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "our experiments", + "entity_type": "EVENT", + "description": "our experiments refer to the specific study or set of trials being conducted to evaluate the model configurations", + "source_ids": [ + 145 + ], + "id": "Name: our experiments\nType: EVENT" + }, + { + "entity_name": "conventional rag", + "entity_type": "TASK_OR_PROBLEM", + "description": "conventional rag is described as the most common pipeline for document analysis involving text extraction and chunking", + "source_ids": [ + 146 + ], + "id": "Name: conventional rag\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "bm25", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "bm25 is identified as a strong and widely used retrieval model selected for implementation", + "source_ids": [ + 146 + ], + "id": "Name: bm25\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "vanilla rag", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "vanilla rag is identified as a strong and widely used retrieval model selected for implementation", + "source_ids": [ + 146 + ], + "id": "Name: vanilla rag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "layout vanilla", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "layout vanilla is a variant of vanilla rag that utilizes document layout analysis for semantic chunking", + "source_ids": [ + 146 + ], + "id": "Name: layout vanilla\nType: MODEL_OR_ARCHITECTURE" + }, + { + 
"entity_name": "document analysis", + "entity_type": "TASK_OR_PROBLEM", + "description": "document analysis is the general task where raw text is extracted and processed in the described pipeline", + "source_ids": [ + 146 + ], + "id": "Name: document analysis\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "raw text", + "entity_type": "MATERIAL", + "description": "raw text is the input material that is first extracted in the pipeline", + "source_ids": [ + 146 + ], + "id": "Name: raw text\nType: MATERIAL" + }, + { + "entity_name": "segments", + "entity_type": "MEASUREMENT", + "description": "segments are the chunks of specified size that the raw text is divided into", + "source_ids": [ + 146 + ], + "id": "Name: segments\nType: MEASUREMENT" + }, + { + "entity_name": "document layout analysis", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "document layout analysis is the technique used by layout vanilla for semantic chunking", + "source_ids": [ + 146 + ], + "id": "Name: document layout analysis\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "semantic chunking", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "semantic chunking is the process of dividing text into segments based on meaning utilized by layout vanilla", + "source_ids": [ + 146 + ], + "id": "Name: semantic chunking\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "raptor", + "entity_type": "TECHNOLOGY", + "description": "raptor is a specific technology selected as an example of graph based rag methods", + "source_ids": [ + 147 + ], + "id": "Name: raptor\nType: TECHNOLOGY" + }, + { + "entity_name": "graphrag", + "entity_type": "TECHNOLOGY", + "description": "graphrag is a specific technology selected as an example of graph based rag methods", + "source_ids": [ + 147 + ], + "id": "Name: graphrag\nType: TECHNOLOGY" + }, + { + "entity_name": "graphrag global", + "entity_type": "TECHNOLOGY", + "description": "graphrag global is a version of graphrag that employs global search 
methods", + "source_ids": [ + 147 + ], + "id": "Name: graphrag global\nType: TECHNOLOGY" + }, + { + "entity_name": "graphrag local", + "entity_type": "TECHNOLOGY", + "description": "graphrag local is a version of graphrag that employs local search methods", + "source_ids": [ + 147 + ], + "id": "Name: graphrag local\nType: TECHNOLOGY" + }, + { + "entity_name": "documents", + "entity_type": "PRODUCT", + "description": "documents are the textual content from which graph based rag methods extract information", + "source_ids": [ + 147 + ], + "id": "Name: documents\nType: PRODUCT" + }, + { + "entity_name": "graph data", + "entity_type": "TECHNOLOGY", + "description": "graph data is the type of data leveraged during the retrieval process in graph based rag methods", + "source_ids": [ + 147 + ], + "id": "Name: graph data\nType: TECHNOLOGY" + }, + { + "entity_name": "global search methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "global search methods are employed by the graphrag global version", + "source_ids": [ + 147 + ], + "id": "Name: global search methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "local search methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "local search methods are employed by the graphrag local version", + "source_ids": [ + 147 + ], + "id": "Name: local search methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "layoutsegmentedrag", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "layoutsegmentedrag is a category of methods that utilize layout analysis to segment document content into discrete structural units", + "source_ids": [ + 148 + ], + "id": "Name: layoutsegmentedrag\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "mm vanilla", + "entity_type": "PRODUCT", + "description": "mm vanilla is a method that utilizes multi modal embeddings for visual and textual content", + "source_ids": [ + 148 + ], + "id": "Name: mm vanilla\nType: PRODUCT" + }, + { + "entity_name": 
"pageindex", + "entity_type": "PRODUCT", + "description": "pageindex is a method or system referenced as an inspiration for a tree based method", + "source_ids": [ + 148 + ], + "id": "Name: pageindex\nType: PRODUCT" + }, + { + "entity_name": "treetraverse", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "treetraverse is a tree based method inspired by pageindex where an llm navigates the document s tree structure", + "source_ids": [ + 148 + ], + "id": "Name: treetraverse\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "graphranker", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "GraphRanker is a graph-based method extended from HippoRAG that applies personalized PageRank to rank relevant nodes and is listed in the chart legend as a ranking method utilizing graph structures.", + "source_ids": [ + 148, + 159 + ], + "id": "Name: graphranker\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "hipporag", + "entity_type": "METHOD_OR_ARCHITECTURE", + "description": "hipporag is a method or architecture from which graphranker is extended", + "source_ids": [ + 148 + ], + "id": "Name: hipporag\nType: METHOD_OR_ARCHITECTURE" + }, + { + "entity_name": "personalized pagerank", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "personalized pagerank is a technique applied by graphranker to rank relevant nodes", + "source_ids": [ + 148 + ], + "id": "Name: personalized pagerank\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "page 39", + "entity_type": "PUBLICATION_VENUE", + "description": "page 39 is a citation reference associated with the pageindex method", + "source_ids": [ + 148 + ], + "id": "Name: page 39\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "page 47", + "entity_type": "PUBLICATION_VENUE", + "description": "page 47 is a citation reference associated with the docetl system", + "source_ids": [ + 148 + ], + "id": "Name: page 47\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "page 19", + "entity_type": 
"PUBLICATION_VENUE", + "description": "page 19 is a citation reference associated with the hipporag method", + "source_ids": [ + 148 + ], + "id": "Name: page 19\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "page 20", + "entity_type": "PUBLICATION_VENUE", + "description": "page 20 is a citation reference associated with the personalized pagerank technique", + "source_ids": [ + 148 + ], + "id": "Name: page 20\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "qwen family", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "The Qwen family refers to a set of state-of-the-art backbone models used to power BookRAG and baseline methods.", + "source_ids": [ + 149, + 238 + ], + "id": "Name: qwen family\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "mineru", + "entity_type": "SOFTWARE", + "description": "Mineru is a software tool employed for robust document layout parsing.", + "source_ids": [ + 149, + 238 + ], + "id": "Name: mineru\nType: SOFTWARE" + }, + { + "entity_name": "github com sam234990 bookrag", + "entity_type": "LOCATION", + "description": "github com sam234990 bookrag is the url where source code prompts and configurations for bookrag are available", + "source_ids": [ + 149 + ], + "id": "Name: github com sam234990 bookrag\nType: LOCATION" + }, + { + "entity_name": "0 6", + "entity_type": "MEASUREMENT", + "description": "0 6 is the threshold value set for the gradient g in the implementation details", + "source_ids": [ + 149 + ], + "id": "Name: 0 6\nType: MEASUREMENT" + }, + { + "entity_name": "technical report", + "entity_type": "PUBLICATION_VENUE", + "description": "A technical report is a publication venue that serves as a document containing more details about an implementation, often referenced by a specific identifier such as 57, and it describes the type of document being referenced.", + "source_ids": [ + 194, + 149 + ], + "id": "Name: technical report\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "gradient g", + 
"entity_type": "PARAMETER_OR_VARIABLE", + "description": "gradient g is a parameter with a threshold set to 0 6 in the implementation details", + "source_ids": [ + 149 + ], + "id": "Name: gradient g\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "appendix", + "entity_type": "SECTION_TITLE", + "description": "the appendix is a section of the technical report where more details are provided", + "source_ids": [ + 149 + ], + "id": "Name: appendix\nType: SECTION_TITLE" + }, + { + "entity_name": "prompts", + "entity_type": "TASK_OR_PROBLEM", + "description": "prompts are specific instructions or inputs used in the bookrag system available on github", + "source_ids": [ + 149 + ], + "id": "Name: prompts\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "detailed configurations", + "entity_type": "TASK_OR_PROBLEM", + "description": "detailed configurations are specific settings for the bookrag system available on github", + "source_ids": [ + 149 + ], + "id": "Name: detailed configurations\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "state of theart", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "state of theart describes the quality of the backbone models used in the comparison", + "source_ids": [ + 149 + ], + "id": "Name: state of theart\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "robust document layout parsing", + "entity_type": "TASK_OR_PROBLEM", + "description": "Robust document layout parsing is the specific task performed by and utilized for Mineru.", + "source_ids": [ + 149, + 238 + ], + "id": "Name: robust document layout parsing\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "fair comparison", + "entity_type": "TASK_OR_PROBLEM", + "description": "fair comparison is the goal of the experimental setup described in the text", + "source_ids": [ + 149 + ], + "id": "Name: fair comparison\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "baseline methods", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 149 + 
], + "id": "Name: baseline methods\nType: UNKNOWN" + }, + { + "entity_name": "implementation details", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 149 + ], + "id": "Name: implementation details\nType: UNKNOWN" + }, + { + "entity_name": "6.2 overall results", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Experiments' within the BookRAG paper, this section presents the aggregate performance metrics comparing the proposed method against baseline approaches on document QA tasks.", + "source_ids": [ + 150 + ], + "id": "Name: 6.2 overall results\nType: SECTION_TITLE" + }, + { + "entity_name": "query efficiency", + "entity_type": "TASK_OR_PROBLEM", + "description": "query efficiency is a metric being analyzed to determine the system s performance", + "source_ids": [ + 151 + ], + "id": "Name: query efficiency\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "evaluation", + "entity_type": "EVENT", + "description": "evaluation is the comprehensive process of analyzing bookrag s performance described in the text", + "source_ids": [ + 151 + ], + "id": "Name: evaluation\nType: EVENT" + }, + { + "entity_name": "table 5", + "entity_type": "TABLE", + "description": "Table 5 is a performance comparison table that displays the results of different methods on document question answering tasks, specifically highlighting the comparison of QA performance between BookRAG and various baselines.", + "source_ids": [ + 152, + 153 + ], + "id": "Name: table 5\nType: TABLE" + }, + { + "entity_name": "layout vanilla", + "entity_type": "PRODUCT", + "description": "layout vanilla is a baseline method that consistently outperforms vanilla rag", + "source_ids": [ + 152 + ], + "id": "Name: layout vanilla\nType: PRODUCT" + }, + { + "entity_name": "vanilla rag", + "entity_type": "PRODUCT", + "description": "vanilla rag is a baseline method that is outperformed by layout vanilla", + "source_ids": [ + 152 + ], + "id": "Name: vanilla rag\nType: PRODUCT" 
+ }, + { + "entity_name": "tree traverse", + "entity_type": "PRODUCT", + "description": "tree traverse is a method highlighted for having suboptimal results due to limitations in hierarchical navigation", + "source_ids": [ + 152 + ], + "id": "Name: tree traverse\nType: PRODUCT" + }, + { + "entity_name": "graphranker", + "entity_type": "PRODUCT", + "description": "Graphranker is a layout-based baseline system compared against BookRag, but it is highlighted for having suboptimal results due to limitations in graph-based reasoning.", + "source_ids": [ + 152, + 157 + ], + "id": "Name: graphranker\nType: PRODUCT" + }, + { + "entity_name": "tree graph bookindex", + "entity_type": "PRODUCT", + "description": "tree graph bookindex is a component of bookrag that contributes to its superior performance", + "source_ids": [ + 152 + ], + "id": "Name: tree graph bookindex\nType: PRODUCT" + }, + { + "entity_name": "agent based planning", + "entity_type": "PRODUCT", + "description": "agent based planning is a component of bookrag that contributes to its superior performance", + "source_ids": [ + 152 + ], + "id": "Name: agent based planning\nType: PRODUCT" + }, + { + "entity_name": "18 0", + "entity_type": "PERCENTAGE", + "description": "18 0 is the margin by which bookrag outperforms the top performing baseline in exact match on m3docvqa", + "source_ids": [ + 152 + ], + "id": "Name: 18 0\nType: PERCENTAGE" + }, + { + "entity_name": "qa performance", + "entity_type": "TASK_OR_PROBLEM", + "description": "QA performance is the specific task being evaluated and compared in the text, referring to the quality of answers generated which is analyzed under different query types.", + "source_ids": [ + 152, + 179 + ], + "id": "Name: qa performance\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "hierarchical navigation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "hierarchical navigation is a method used by tree traverse that is noted for missing cross sectional context", + 
"source_ids": [ + 152 + ], + "id": "Name: hierarchical navigation\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "graph based reasoning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "graph based reasoning is a method used by graphranker that is noted for drifting into irrelevant scopes", + "source_ids": [ + 152 + ], + "id": "Name: graph based reasoning\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "context fragmentation", + "entity_type": "TASK_OR_PROBLEM", + "description": "context fragmentation is a limitation of existing baselines that bookrag overcomes", + "source_ids": [ + 152 + ], + "id": "Name: context fragmentation\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "static query workflow", + "entity_type": "TASK_OR_PROBLEM", + "description": "static query workflow is a limitation of existing baselines that bookrag overcomes", + "source_ids": [ + 152 + ], + "id": "Name: static query workflow\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "cross sectional context", + "entity_type": "CONCEPT", + "description": "cross sectional context is information often missed by methods relying solely on hierarchical navigation", + "source_ids": [ + 152 + ], + "id": "Name: cross sectional context\nType: CONCEPT" + }, + { + "entity_name": "irrelevant scopes", + "entity_type": "CONCEPT", + "description": "irrelevant scopes are areas that methods relying solely on graph based reasoning may drift into", + "source_ids": [ + 152 + ], + "id": "Name: irrelevant scopes\nType: CONCEPT" + }, + { + "entity_name": "queries", + "entity_type": "CONCEPT", + "description": "queries are inputs that bookrag effectively classifies to configure optimal workflows", + "source_ids": [ + 152 + ], + "id": "Name: queries\nType: CONCEPT" + }, + { + "entity_name": "workflows", + "entity_type": "CONCEPT", + "description": "workflows are configured by bookrag to ensure precise evidence retrieval and accurate generation", + "source_ids": [ + 152 + ], + "id": "Name: 
workflows\nType: CONCEPT" + }, + { + "entity_name": "baselines", + "entity_type": "PRODUCT", + "description": "baselines are the three categories of methods against which bookrag is compared", + "source_ids": [ + 152 + ], + "id": "Name: baselines\nType: PRODUCT" + }, + { + "entity_name": "top performing baseline", + "entity_type": "PRODUCT", + "description": "top performing baseline is the specific baseline that bookrag substantially outperforms", + "source_ids": [ + 152 + ], + "id": "Name: top performing baseline\nType: PRODUCT" + }, + { + "entity_name": "performance comparison", + "entity_type": "TASK_OR_PROBLEM", + "description": "performance comparison refers to the evaluation of different methods across various datasets", + "source_ids": [ + 153 + ], + "id": "Name: performance comparison\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "different methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "different methods are the various approaches being compared in the table for solving document qa tasks", + "source_ids": [ + 153 + ], + "id": "Name: different methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "best results", + "entity_type": "EVALUATION_METRIC", + "description": "best results refer to the top performing outcomes marked in bold in the table", + "source_ids": [ + 153 + ], + "id": "Name: best results\nType: EVALUATION_METRIC" + }, + { + "entity_name": "second best results", + "entity_type": "EVALUATION_METRIC", + "description": "second best results refer to the runner up outcomes marked in underlined in the table", + "source_ids": [ + 153 + ], + "id": "Name: second best results\nType: EVALUATION_METRIC" + }, + { + "entity_name": "bold", + "entity_type": "COLOR", + "description": "bold refers to the text formatting style used to mark the best results in the table", + "source_ids": [ + 153 + ], + "id": "Name: bold\nType: COLOR" + }, + { + "entity_name": "underlined", + "entity_type": "SHAPE", + "description": "underlined 
refers to the text formatting style used to mark the second best results in the table", + "source_ids": [ + 153 + ], + "id": "Name: underlined\nType: SHAPE" + }, + { + "entity_name": "table: cref='#/texts/156'...", + "entity_type": "TABLE", + "description": "A data table described as: cref='#/texts/156'", + "source_ids": [ + 154 + ], + "id": "Name: table: cref='#/texts/156'...\nType: TABLE" + }, + { + "entity_name": "cref", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "cref is a reference identifier or cross-reference key found in the description text, pointing to specific text locations such as '#/texts/156' and '#/texts/220'.", + "source_ids": [ + 154, + 171 + ], + "id": "Name: cref\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "table 6", + "entity_type": "TABLE", + "description": "table 6 is a table presenting a comparison of retrieval recall among layout based methods", + "source_ids": [ + 155 + ], + "id": "Name: table 6\nType: TABLE" + }, + { + "entity_name": "layout based methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "layout based methods are the techniques being evaluated for their retrieval recall performance", + "source_ids": [ + 155 + ], + "id": "Name: layout based methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "cref='#/texts/158'", + "entity_type": "TABLE", + "description": "A table entity identified by the reference string provided in the description, representing a specific text section or data block.", + "source_ids": [ + 156 + ], + "id": "Name: cref='#/texts/158'\nType: TABLE" + }, + { + "entity_name": "ift inspired selector reasoner workflow", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the ift inspired selector reasoner workflow is the process used by bookrag to classify queries and analyze information", + "source_ids": [ + 157 + ], + "id": "Name: ift inspired selector reasoner workflow\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "71 2", + "entity_type": 
"PERCENTAGE", + "description": "71 2 is the retrieval recall achieved by bookrag on the m3docvqa dataset", + "source_ids": [ + 157 + ], + "id": "Name: 71 2\nType: PERCENTAGE" + }, + { + "entity_name": "44 5", + "entity_type": "PERCENTAGE", + "description": "44 5 is the maximum retrieval recall achieved by the graphranker baseline", + "source_ids": [ + 157 + ], + "id": "Name: 44 5\nType: PERCENTAGE" + }, + { + "entity_name": "9 87", + "entity_type": "MEASUREMENT", + "description": "9 87 is the average number of retained nodes on one of the three datasets after the skyline ranker process", + "source_ids": [ + 157 + ], + "id": "Name: 9 87\nType: MEASUREMENT" + }, + { + "entity_name": "6 86", + "entity_type": "MEASUREMENT", + "description": "6 86 is the average number of retained nodes on another of the three datasets after the skyline ranker process", + "source_ids": [ + 157 + ], + "id": "Name: 6 86\nType: MEASUREMENT" + }, + { + "entity_name": "8 6", + "entity_type": "MEASUREMENT", + "description": "8 6 is the average number of retained nodes on the third dataset after the skyline ranker process", + "source_ids": [ + 157 + ], + "id": "Name: 8 6\nType: MEASUREMENT" + }, + { + "entity_name": "10", + "entity_type": "MEASUREMENT", + "description": "The value 10 serves multiple roles depending on the context: it is the standard top k setting used for comparison, a numerical measurement or count, the issue number of a publication volume, the retrieval top k value configured to maintain consistent candidate pool sizes, and the ending page number in a given example range.", + "source_ids": [ + 161, + 258, + 196, + 238, + 157 + ], + "id": "Name: 10\nType: MEASUREMENT" + }, + { + "entity_name": "retrieval performance", + "entity_type": "TASK_OR_PROBLEM", + "description": "retrieval performance is the specific metric being evaluated to validate the retrieval design of bookrag", + "source_ids": [ + 157 + ], + "id": "Name: retrieval performance\nType: TASK_OR_PROBLEM" + }, + { + 
"entity_name": "ground truth layout blocks", + "entity_type": "DATASET_OR_CORPUS", + "description": "ground truth layout blocks are the reference data used to evaluate the retrieval recall", + "source_ids": [ + 157 + ], + "id": "Name: ground truth layout blocks\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "layout based baselines", + "entity_type": "PRODUCT", + "description": "layout based baselines are the group of systems against which bookrag is compared", + "source_ids": [ + 157 + ], + "id": "Name: layout based baselines\nType: PRODUCT" + }, + { + "entity_name": "information patch", + "entity_type": "TASK_OR_PROBLEM", + "description": "the information patch is the precise data segment targeted by the selector component", + "source_ids": [ + 157 + ], + "id": "Name: information patch\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "candidate size", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "candidate size is the variable representing the number of candidates which is kept from inflating by the skyline ranker process", + "source_ids": [ + 157 + ], + "id": "Name: candidate size\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "three datasets", + "entity_type": "DATASET_OR_CORPUS", + "description": "three datasets are the collective group of data used to measure the average number of retained nodes", + "source_ids": [ + 157 + ], + "id": "Name: three datasets\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "standard top k setting", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the standard top k setting is the baseline configuration used for comparison with the skyline ranker results", + "source_ids": [ + 157 + ], + "id": "Name: standard top k setting\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "figure 5", + "entity_type": "IMAGE", + "description": "Figure 5 is a visual illustration that presents a comparison of query efficiency for various Retrieval-Augmented Generation (RAG) methods across three datasets, 
specifically evaluating BookRAG in terms of query time and token consumption.", + "source_ids": [ + 160, + 158, + 159 + ], + "id": "Name: figure 5\nType: IMAGE" + }, + { + "entity_name": "query efficiency", + "entity_type": "EVALUATION_METRIC", + "description": "query efficiency is a metric being compared in the text", + "source_ids": [ + 158 + ], + "id": "Name: query efficiency\nType: EVALUATION_METRIC" + }, + { + "entity_name": "bm25", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A classical probabilistic ranking function used for information retrieval, listed in the chart legend.", + "source_ids": [ + 159 + ], + "id": "Name: bm25\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "vanilla rag", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The baseline Retrieval-Augmented Generation model without additional enhancements, listed in the chart legend.", + "source_ids": [ + 159 + ], + "id": "Name: vanilla rag\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "layout + vanilla", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A variant of the vanilla RAG method that incorporates layout information, listed in the chart legend.", + "source_ids": [ + 159 + ], + "id": "Name: layout + vanilla\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "raptor", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Recursive Abstractive Processing for Tree-Organized Retrieval, a specific RAG approach listed in the chart legend.", + "source_ids": [ + 159 + ], + "id": "Name: raptor\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "graphrag-local", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A local graph-based retrieval method, listed in the chart legend.", + "source_ids": [ + 159 + ], + "id": "Name: graphrag-local\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "graphrag-global", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A global graph-based retrieval method, listed in the chart legend.", + 
"source_ids": [ + 159 + ], + "id": "Name: graphrag-global\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "mm-vanilla", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A multi-modal vanilla RAG baseline, listed in the chart legend.", + "source_ids": [ + 159 + ], + "id": "Name: mm-vanilla\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "tree-traverse", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A tree-traversal based retrieval or processing method, listed in the chart legend.", + "source_ids": [ + 159 + ], + "id": "Name: tree-traverse\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "query time", + "entity_type": "EVALUATION_METRIC", + "description": "A performance metric measuring the time taken to process a query, displayed on the x-axis of the left charts.", + "source_ids": [ + 159 + ], + "id": "Name: query time\nType: EVALUATION_METRIC" + }, + { + "entity_name": "token cost", + "entity_type": "EVALUATION_METRIC", + "description": "A performance metric measuring the number of tokens consumed, displayed on the x-axis of the right charts.", + "source_ids": [ + 159 + ], + "id": "Name: token cost\nType: EVALUATION_METRIC" + }, + { + "entity_name": "time (s)", + "entity_type": "MEASUREMENT", + "description": "The unit of measurement for the y-axis in the Query Time charts, representing seconds.", + "source_ids": [ + 159 + ], + "id": "Name: time (s)\nType: MEASUREMENT" + }, + { + "entity_name": "token (m)", + "entity_type": "MEASUREMENT", + "description": "The unit of measurement for the y-axis in the Token cost charts, representing millions of tokens.", + "source_ids": [ + 159 + ], + "id": "Name: token (m)\nType: MEASUREMENT" + }, + { + "entity_name": "image cref='#/texts/161'", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 159 + ], + "id": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "entity_name": "graph based rag methods", + "entity_type": "TECHNOLOGY", + "description": "graph based rag 
methods are existing methods used as a baseline for comparing bookrag s efficiency", + "source_ids": [ + 160 + ], + "id": "Name: graph based rag methods\nType: TECHNOLOGY" + }, + { + "entity_name": "text based rag approaches", + "entity_type": "TECHNOLOGY", + "description": "text based rag approaches are methods that generally exhibit lower latency and token usage due to the absence of vlm processing", + "source_ids": [ + 160 + ], + "id": "Name: text based rag approaches\nType: TECHNOLOGY" + }, + { + "entity_name": "vlm", + "entity_type": "TECHNOLOGY", + "description": "vlm refers to vision language models the processing component absent in purely text based rag approaches", + "source_ids": [ + 160 + ], + "id": "Name: vlm\nType: TECHNOLOGY" + }, + { + "entity_name": "docetl", + "entity_type": "PRODUCT", + "description": "docetl is a baseline method against which bookrag s token consumption and query latency are compared", + "source_ids": [ + 160 + ], + "id": "Name: docetl\nType: PRODUCT" + }, + { + "entity_name": "53 million tokens", + "entity_type": "MEASUREMENT", + "description": "53 million tokens is the amount of token consumption recorded for docetl on the mmlongbench dataset", + "source_ids": [ + 160 + ], + "id": "Name: 53 million tokens\nType: MEASUREMENT" + }, + { + "entity_name": "5 million", + "entity_type": "MEASUREMENT", + "description": "5 million is the upper limit of token consumption required by bookrag on the mmlongbench dataset", + "source_ids": [ + 160 + ], + "id": "Name: 5 million\nType: MEASUREMENT" + }, + { + "entity_name": "order of magnitude", + "entity_type": "MEASUREMENT", + "description": "order of magnitude describes the scale of reduction in token consumption by bookrag compared to docetl", + "source_ids": [ + 160 + ], + "id": "Name: order of magnitude\nType: MEASUREMENT" + }, + { + "entity_name": "6.3 detailed analysis", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Experiments' within the BookRAG paper, this 
section provides an in-depth comparative analysis of the proposed method against strong baseline methods, specifically focusing on efficiency and accuracy metrics for document QA tasks.", + "source_ids": [ + 162 + ], + "id": "Name: 6.3 detailed analysis\nType: SECTION_TITLE" + }, + { + "entity_name": "ablation study", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "ablation study is a method used to validate the contribution of each component of bookrag", + "source_ids": [ + 163 + ], + "id": "Name: ablation study\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "gradient based er", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "gradient based er is a method used in experiments to analyze its impact on qa performance", + "source_ids": [ + 163 + ], + "id": "Name: gradient based er\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "qa performance", + "entity_type": "EVALUATION_METRIC", + "description": "qa performance is the metric being evaluated in the experiments across different query types", + "source_ids": [ + 163 + ], + "id": "Name: qa performance\nType: EVALUATION_METRIC" + }, + { + "entity_name": "entity resolution method", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "entity resolution method is a technique compared for effectiveness in the text", + "source_ids": [ + 163 + ], + "id": "Name: entity resolution method\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "case study", + "entity_type": "TASK_OR_PROBLEM", + "description": "A case study is a specific analysis presented in the text, serving as the context or type of analysis being presented.", + "source_ids": [ + 186, + 163 + ], + "id": "Name: case study\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "query types", + "entity_type": "TASK_OR_PROBLEM", + "description": "Query types are the different categories of questions used to evaluate the performance of systems and the quality of responses in experiments and case studies. 
These categories include single hop, multi hop, and global queries, which are specifically analyzed to assess QA capabilities.", + "source_ids": [ + 177, + 163, + 179, + 181 + ], + "id": "Name: query types\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "error analysis", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "error analysis is a comprehensive method performed to examine the results of the study", + "source_ids": [ + 163 + ], + "id": "Name: error analysis\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "ablation study", + "entity_type": "TASK_OR_PROBLEM", + "description": "ablation study is a task designed to evaluate the contribution of core components in bookrag", + "source_ids": [ + 164 + ], + "id": "Name: ablation study\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "gradient er", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Gradient ER is a gradient-based entity resolution method mentioned in the text, and it serves as a specific component whose removal in the WO variant highlights the role of the knowledge graph.", + "source_ids": [ + 172, + 165 + ], + "id": "Name: gradient er\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "basic er", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "basic er is a method used to merge same name entities replacing gradient er in the described scenario", + "source_ids": [ + 165 + ], + "id": "Name: basic er\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "w o gradient er", + "entity_type": "TASK_OR_PROBLEM", + "description": "w o gradient er is a scenario or condition described where the gradient based entity resolution is replaced", + "source_ids": [ + 165 + ], + "id": "Name: w o gradient er\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "same name entities", + "entity_type": "TASK_OR_PROBLEM", + "description": "same name entities are the specific entities targeted for merging in the basic er process", + "source_ids": [ + 165 + ], + "id": "Name: same name 
entities\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "static standard workflow", + "entity_type": "TASK_OR_PROBLEM", + "description": "static standard workflow is the default process used for all queries when agent based planning is removed", + "source_ids": [ + 166 + ], + "id": "Name: static standard workflow\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "reasoners", + "entity_type": "TECHNOLOGY", + "description": "reasoners are systems or components that score candidate nodes affected by the removal of selector operators", + "source_ids": [ + 167 + ], + "id": "Name: reasoners\nType: TECHNOLOGY" + }, + { + "entity_name": "candidate nodes", + "entity_type": "TASK_OR_PROBLEM", + "description": "candidate nodes are the items being scored by reasoners when the selector operators are removed", + "source_ids": [ + 167 + ], + "id": "Name: candidate nodes\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "selector operators", + "entity_type": "TECHNOLOGY", + "description": "selector operators are specific components that can be removed to alter the behavior of reasoners", + "source_ids": [ + 167 + ], + "id": "Name: selector operators\nType: TECHNOLOGY" + }, + { + "entity_name": "graph reasoning", + "entity_type": "TECHNOLOGY", + "description": "graph reasoning is an operator that when removed disables the skyline ranker", + "source_ids": [ + 168 + ], + "id": "Name: graph reasoning\nType: TECHNOLOGY" + }, + { + "entity_name": "text reasoning", + "entity_type": "TASK_OR_PROBLEM", + "description": "text reasoning is an operator that is removed in the described scenario", + "source_ids": [ + 169 + ], + "id": "Name: text reasoning\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "table 7", + "entity_type": "TABLE", + "description": "Table 7 is a reference in the text that compares the QA performance of different variants of BookRAG, illustrating performance degradation across these variants.", + "source_ids": [ + 170, + 172 + ], + "id": "Name: table 7\nType: TABLE" 
+ }, + { + "entity_name": "table: cref='#/texts/220'...", + "entity_type": "TABLE", + "description": "A data table described as: cref='#/texts/220'", + "source_ids": [ + 171 + ], + "id": "Name: table: cref='#/texts/220'...\nType: TABLE" + }, + { + "entity_name": "kg", + "entity_type": "DATASET_OR_CORPUS", + "description": "kg refers to a knowledge graph used to support effective reasoning in the bookrag system", + "source_ids": [ + 172 + ], + "id": "Name: kg\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "ift inspired selection mechanism", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "ift inspired selection mechanism is a strategy evaluated for its role in the system s efficiency", + "source_ids": [ + 172 + ], + "id": "Name: ift inspired selection mechanism\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "multi dimensional reasoning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "multi dimensional reasoning is a strategy validated for its effectiveness in the system", + "source_ids": [ + 172 + ], + "id": "Name: multi dimensional reasoning\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "dynamic skyline filtering strategy", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "dynamic skyline filtering strategy is a method validated for its effectiveness in the system", + "source_ids": [ + 172 + ], + "id": "Name: dynamic skyline filtering strategy\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "w o gradient er variant", + "entity_type": "TASK_OR_PROBLEM", + "description": "the w o gradient er variant is a specific configuration used to test the role of the knowledge graph", + "source_ids": [ + 172 + ], + "id": "Name: w o gradient er variant\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "planning mechanism", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the planning mechanism is a component whose removal causes significant performance loss", + "source_ids": [ + 172 + ], + "id": "Name: planning 
mechanism\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "w o selector variant", + "entity_type": "TASK_OR_PROBLEM", + "description": "the w o selector variant is a configuration used to validate the efficiency of the selection strategy", + "source_ids": [ + 172 + ], + "id": "Name: w o selector variant\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "tokens", + "entity_type": "MEASUREMENT", + "description": "tokens are the unit of measurement used to quantify computational cost", + "source_ids": [ + 172 + ], + "id": "Name: tokens\nType: MEASUREMENT" + }, + { + "entity_name": "selector", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the selector is a component whose removal in the w o variant validates the efficiency of the selection strategy", + "source_ids": [ + 172 + ], + "id": "Name: selector\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "narrow then reason strategy", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the narrow then reason strategy is the specific approach inspired by ift that is being validated for efficiency", + "source_ids": [ + 172 + ], + "id": "Name: narrow then reason strategy\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "static workflow", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "a static workflow is described as insufficient for handling diverse types of queries contrasting with the dynamic approach", + "source_ids": [ + 172 + ], + "id": "Name: static workflow\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "retrieval performance", + "entity_type": "EVALUATION_METRIC", + "description": "retrieval performance is the metric used to evaluate the impact of kg quality", + "source_ids": [ + 172 + ], + "id": "Name: retrieval performance\nType: EVALUATION_METRIC" + }, + { + "entity_name": "computational cost", + "entity_type": "MEASUREMENT", + "description": "computational cost is a metric measured in tokens to evaluate the efficiency of the variants", + "source_ids": [ + 172 + 
], + "id": "Name: computational cost\nType: MEASUREMENT" + }, + { + "entity_name": "performance degradation", + "entity_type": "EVALUATION_METRIC", + "description": "performance degradation is the observed outcome across all variants confirming the essential role of each module", + "source_ids": [ + 172 + ], + "id": "Name: performance degradation\nType: EVALUATION_METRIC" + }, + { + "entity_name": "performance loss", + "entity_type": "EVALUATION_METRIC", + "description": "performance loss is the significant drop observed when the planning mechanism is removed", + "source_ids": [ + 172 + ], + "id": "Name: performance loss\nType: EVALUATION_METRIC" + }, + { + "entity_name": "11", + "entity_type": "NUMBER", + "description": "11 is a number mentioned in the text though its specific context or role is not defined", + "source_ids": [ + 173 + ], + "id": "Name: 11\nType: NUMBER" + }, + { + "entity_name": "figure 6", + "entity_type": "IMAGE", + "description": "Figure 6 is an image that presents the comparative results of an evaluation between two methods, specifically comparing graph statistics with values normalized to a basic setting.", + "source_ids": [ + 176, + 174 + ], + "id": "Name: figure 6\nType: IMAGE" + }, + { + "entity_name": "basic setting", + "entity_type": "TASK_OR_PROBLEM", + "description": "the basic setting serves as the baseline 1 0 for normalizing graph statistics values", + "source_ids": [ + 174 + ], + "id": "Name: basic setting\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "3 6e 3", + "entity_type": "MEASUREMENT", + "description": "3 6e 3 is an abbreviated density value representing 3 6 10 3", + "source_ids": [ + 174 + ], + "id": "Name: 3 6e 3\nType: MEASUREMENT" + }, + { + "entity_name": "graph statistics", + "entity_type": "TASK_OR_PROBLEM", + "description": "graph statistics are the subject of comparison in figure 6", + "source_ids": [ + 174 + ], + "id": "Name: graph statistics\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "absolute values", 
+ "entity_type": "MEASUREMENT", + "description": "absolute values for the basic setting are annotated in the text", + "source_ids": [ + 174 + ], + "id": "Name: absolute values\nType: MEASUREMENT" + }, + { + "entity_name": "density values", + "entity_type": "MEASUREMENT", + "description": "density values are a specific type of metric mentioned that are abbreviated in the text", + "source_ids": [ + 174 + ], + "id": "Name: density values\nType: MEASUREMENT" + }, + { + "entity_name": "cref='#/texts/224'", + "entity_type": "IMAGE", + "description": "A figure containing two bar charts comparing 'Basic' and 'Gradient-based ER' performance metrics across '# Entity', 'Density', 'Diameter', and '# CC' for MMLongBench and Qasper datasets.", + "source_ids": [ + 175 + ], + "id": "Name: cref='#/texts/224'\nType: IMAGE" + }, + { + "entity_name": "basic", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "The baseline method represented by blue bars in the legend, used as a comparison point against the Gradient-based ER approach.", + "source_ids": [ + 175 + ], + "id": "Name: basic\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "gradient-based er", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "The proposed or specific method represented by red bars in the legend, evaluated on various metrics against the Basic model.", + "source_ids": [ + 175 + ], + "id": "Name: gradient-based er\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "ratio", + "entity_type": "EVALUATION_METRIC", + "description": "The Y-axis label indicating the metric being measured, representing the ratio of performance between the compared methods.", + "source_ids": [ + 175 + ], + "id": "Name: ratio\nType: EVALUATION_METRIC" + }, + { + "entity_name": "# entity", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "A metric measuring the number of entities, shown as the first category of bars in both charts.", + "source_ids": [ + 175 + ], + "id": "Name: # entity\nType: 
PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "density", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "A metric measuring graph density, showing significant variation between the Basic and Gradient-based ER methods.", + "source_ids": [ + 175 + ], + "id": "Name: density\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "diameter", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "A metric measuring the longest shortest path in the graph, presented as the third category of bars.", + "source_ids": [ + 175 + ], + "id": "Name: diameter\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "# cc", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "A metric likely representing the number of Connected Components, shown as the fourth category of bars.", + "source_ids": [ + 175 + ], + "id": "Name: # cc\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "figure (a)", + "entity_type": "SECTION_TITLE", + "description": "The label identifying the left-hand chart which displays results for the MMLongBench dataset.", + "source_ids": [ + 175 + ], + "id": "Name: figure (a)\nType: SECTION_TITLE" + }, + { + "entity_name": "figure (b)", + "entity_type": "SECTION_TITLE", + "description": "The label identifying the right-hand chart which displays results for the Qasper dataset.", + "source_ids": [ + 175 + ], + "id": "Name: figure (b)\nType: SECTION_TITLE" + }, + { + "entity_name": "1327", + "entity_type": "MEASUREMENT", + "description": "A numerical value annotation above the '# Entity' bar for the Basic method in chart (a).", + "source_ids": [ + 175 + ], + "id": "Name: 1327\nType: MEASUREMENT" + }, + { + "entity_name": "3.6e-3", + "entity_type": "MEASUREMENT", + "description": "A numerical value annotation above the 'Density' bar for the Basic method in chart (a).", + "source_ids": [ + 175 + ], + "id": "Name: 3.6e-3\nType: MEASUREMENT" + }, + { + "entity_name": "14.8", + "entity_type": "MEASUREMENT", + "description": "A numerical value 
annotation above the 'Diameter' bar for the Basic method in chart (a).", + "source_ids": [ + 175 + ], + "id": "Name: 14.8\nType: MEASUREMENT" + }, + { + "entity_name": "169", + "entity_type": "MEASUREMENT", + "description": "A numerical value annotation above the '# CC' bar for the Basic method in chart (a).", + "source_ids": [ + 175 + ], + "id": "Name: 169\nType: MEASUREMENT" + }, + { + "entity_name": "531", + "entity_type": "MEASUREMENT", + "description": "A numerical value annotation above the '# Entity' bar for the Basic method in chart (b).", + "source_ids": [ + 175 + ], + "id": "Name: 531\nType: MEASUREMENT" + }, + { + "entity_name": "5.4e-3", + "entity_type": "MEASUREMENT", + "description": "A numerical value annotation above the 'Density' bar for the Basic method in chart (b).", + "source_ids": [ + 175 + ], + "id": "Name: 5.4e-3\nType: MEASUREMENT" + }, + { + "entity_name": "15.0", + "entity_type": "MEASUREMENT", + "description": "A numerical value annotation above the 'Diameter' bar for the Basic method in chart (b).", + "source_ids": [ + 175 + ], + "id": "Name: 15.0\nType: MEASUREMENT" + }, + { + "entity_name": "106", + "entity_type": "MEASUREMENT", + "description": "A numerical value annotation above the '# CC' bar for the Basic method in chart (b).", + "source_ids": [ + 175 + ], + "id": "Name: 106\nType: MEASUREMENT" + }, + { + "entity_name": "gradient based entity resolution", + "entity_type": "TASK_OR_PROBLEM", + "description": "gradient based entity resolution is a method used to evaluate the quality of a constructed knowledge graph kg", + "source_ids": [ + 176 + ], + "id": "Name: gradient based entity resolution\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "basic kg construction", + "entity_type": "TASK_OR_PROBLEM", + "description": "basic kg construction is a standard practice using simple exact name matching for entity merging", + "source_ids": [ + 176 + ], + "id": "Name: basic kg construction\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": 
"entity count", + "entity_type": "EVALUATION_METRIC", + "description": "entity count is a metric used to measure the number of entities in the graph", + "source_ids": [ + 176 + ], + "id": "Name: entity count\nType: EVALUATION_METRIC" + }, + { + "entity_name": "density", + "entity_type": "EVALUATION_METRIC", + "description": "density is a metric used to measure the connectivity of the graph", + "source_ids": [ + 176 + ], + "id": "Name: density\nType: EVALUATION_METRIC" + }, + { + "entity_name": "diameter of the largest connected component", + "entity_type": "EVALUATION_METRIC", + "description": "diameter of the largest connected component is a metric measuring the longest shortest path in the largest connected part of the graph", + "source_ids": [ + 176 + ], + "id": "Name: diameter of the largest connected component\nType: EVALUATION_METRIC" + }, + { + "entity_name": "number of connected components", + "entity_type": "EVALUATION_METRIC", + "description": "number of connected components is a metric counting the separate parts of the graph", + "source_ids": [ + 176 + ], + "id": "Name: number of connected components\nType: EVALUATION_METRIC" + }, + { + "entity_name": "basic baseline", + "entity_type": "BENCHMARK", + "description": "the basic baseline serves as the standard for comparison in the evaluation of the gradient based er method", + "source_ids": [ + 176 + ], + "id": "Name: basic baseline\nType: BENCHMARK" + }, + { + "entity_name": "er module", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the er module is the component responsible for identifying conceptual entities with different names", + "source_ids": [ + 176 + ], + "id": "Name: er module\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "12", + "entity_type": "PERCENTAGE", + "description": "12 is the percentage reduction in the number of entities achieved by the gradient based er method", + "source_ids": [ + 176 + ], + "id": "Name: 12\nType: PERCENTAGE" + }, + { + "entity_name": "many graph 
based methods", + "entity_type": "ORGANIZATION", + "description": "many graph based methods are a group of techniques that employ simple exact name matching for entity merging", + "source_ids": [ + 176 + ], + "id": "Name: many graph based methods\nType: ORGANIZATION" + }, + { + "entity_name": "graph reasoning", + "entity_type": "TASK_OR_PROBLEM", + "description": "graph reasoning is a task facilitated by the improved connectivity of the resulting graphs", + "source_ids": [ + 176 + ], + "id": "Name: graph reasoning\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "figure 7", + "entity_type": "IMAGE", + "description": "Figure 7 is an image presenting a performance breakdown of QA by different query types and serves as a visual representation that breaks down the performance of BookRag.", + "source_ids": [ + 177, + 179 + ], + "id": "Name: figure 7\nType: IMAGE" + }, + { + "entity_name": "multi hop", + "entity_type": "TASK_OR_PROBLEM", + "description": "Multi hop is a type of query used in the QA performance breakdown and a query case handled by BookRag's answering workflow. It is a task that requires decomposition into multiple simple sub-questions.", + "source_ids": [ + 177, + 186, + 247 + ], + "id": "Name: multi hop\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "global", + "entity_type": "TASK_OR_PROBLEM", + "description": "Global is a type of query used in QA performance breakdown and one of the three categories used to classify user questions. 
It represents a configuration category on the X-axis that denotes a global or holistic setting, specifically referring to questions that require an aggregation operation over a set of items identified by a structural filter.", + "source_ids": [ + 177, + 178, + 250, + 241 + ], + "id": "Name: global\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "blue bars", + "entity_type": "IMAGE", + "description": "blue bars represent the visual elements in the figure corresponding to exact match and accuracy metrics", + "source_ids": [ + 177 + ], + "id": "Name: blue bars\nType: IMAGE" + }, + { + "entity_name": "red bars", + "entity_type": "IMAGE", + "description": "red bars represent the visual elements in the figure corresponding to the f1 score metric", + "source_ids": [ + 177 + ], + "id": "Name: red bars\nType: IMAGE" + }, + { + "entity_name": "cref='#/texts/259'", + "entity_type": "IMAGE", + "description": "A figure containing two bar charts comparing EM/Accuracy and F1-score across Single, Multi, and Global configurations for MMLongBench and Qasper datasets.", + "source_ids": [ + 178 + ], + "id": "Name: cref='#/texts/259'\nType: IMAGE" + }, + { + "entity_name": "em / accuracy", + "entity_type": "EVALUATION_METRIC", + "description": "Evaluation metric represented by blue bars in the chart, standing for Exact Match or Accuracy.", + "source_ids": [ + 178 + ], + "id": "Name: em / accuracy\nType: EVALUATION_METRIC" + }, + { + "entity_name": "f1-score", + "entity_type": "EVALUATION_METRIC", + "description": "Evaluation metric represented by red bars in the chart, representing the harmonic mean of precision and recall.", + "source_ids": [ + 178 + ], + "id": "Name: f1-score\nType: EVALUATION_METRIC" + }, + { + "entity_name": "single", + "entity_type": "TASK_OR_PROBLEM", + "description": "A configuration category on the X-axis representing a single-task or single-passage setting.", + "source_ids": [ + 178 + ], + "id": "Name: single\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": 
"multi", + "entity_type": "TASK_OR_PROBLEM", + "description": "A configuration category on the X-axis representing a multi-task or multi-passage setting.", + "source_ids": [ + 178 + ], + "id": "Name: multi\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "(a) mmlongbench", + "entity_type": "DATASET_OR_CORPUS", + "description": "MMLongBench is the first dataset evaluated in the left chart, focusing on long-context benchmarks, and the left diagram illustrates the breakdown of query processing results for this dataset.", + "source_ids": [ + 184, + 178 + ], + "id": "Name: (a) mmlongbench\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "(b) qasper", + "entity_type": "DATASET_OR_CORPUS", + "description": "The Qasper dataset, formally known as Question Answering in Scientific Papers with Reasoning, is the second dataset evaluated in the right chart and is illustrated in the right diagram, which shows the breakdown of query processing results for this dataset.", + "source_ids": [ + 184, + 178 + ], + "id": "Name: (b) qasper\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "multihop", + "entity_type": "TASK_OR_PROBLEM", + "description": "multihop is a type of query that presents a greater challenge compared to single hop queries", + "source_ids": [ + 179 + ], + "id": "Name: multihop\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "agent based planning strategy", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the agent based planning strategy is a method used to handle different query types separately", + "source_ids": [ + 179 + ], + "id": "Name: agent based planning strategy\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "disjoint pieces of evidence", + "entity_type": "DATASET_OR_CORPUS", + "description": "disjoint pieces of evidence are the fragmented information sources that make reasoning difficult", + "source_ids": [ + 179 + ], + "id": "Name: disjoint pieces of evidence\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "retrieving", + 
"entity_type": "METHOD_OR_TECHNIQUE", + "description": "retrieving is the process of finding information identified as a challenge in the text", + "source_ids": [ + 179 + ], + "id": "Name: retrieving\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "figure 9", + "entity_type": "IMAGE", + "description": "Figure 9 is an image presenting an error analysis on sampled queries, visually representing the error propagation traced during the analysis.", + "source_ids": [ + 180, + 183 + ], + "id": "Name: figure 9\nType: IMAGE" + }, + { + "entity_name": "error response analysis", + "entity_type": "TASK_OR_PROBLEM", + "description": "error response analysis is the specific task conducted to diagnose performance bottlenecks", + "source_ids": [ + 180 + ], + "id": "Name: error response analysis\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "200 sampled queries", + "entity_type": "MEASUREMENT", + "description": "200 sampled queries refers to the quantity of queries from each dataset used for the analysis", + "source_ids": [ + 180 + ], + "id": "Name: 200 sampled queries\nType: MEASUREMENT" + }, + { + "entity_name": "four types", + "entity_type": "MEASUREMENT", + "description": "four types refers to the number of categories into which failures are classified", + "source_ids": [ + 180 + ], + "id": "Name: four types\nType: MEASUREMENT" + }, + { + "entity_name": "figure 8", + "entity_type": "IMAGE", + "description": "Figure 8 is an image presenting a case study of responses across different query types, illustrating BookRags' answering workflow for those various query types.", + "source_ids": [ + 186, + 181 + ], + "id": "Name: figure 8\nType: IMAGE" + }, + { + "entity_name": "cyan text", + "entity_type": "COLOR", + "description": "cyan text refers to the color used to highlight correct content generated by bookrag in the figure", + "source_ids": [ + 181 + ], + "id": "Name: cyan text\nType: COLOR" + }, + { + "entity_name": "gray text", + "entity_type": "COLOR", + "description": 
"gray text refers to the color used to describe the internal process in the figure", + "source_ids": [ + 181 + ], + "id": "Name: gray text\nType: COLOR" + }, + { + "entity_name": "case study", + "entity_type": "EVENT", + "description": "case study is the specific analysis of responses across different query types presented in the text", + "source_ids": [ + 181 + ], + "id": "Name: case study\nType: EVENT" + }, + { + "entity_name": "internal process", + "entity_type": "TASK_OR_PROBLEM", + "description": "internal process refers to the underlying mechanisms described in the gray text", + "source_ids": [ + 181 + ], + "id": "Name: internal process\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "bookrag response of different query types", + "entity_type": "IMAGE", + "description": "A document illustrating BookRAG's responses to three distinct query types: Single-hop, Multi-hop, and Global Aggregation cases.", + "source_ids": [ + 182 + ], + "id": "Name: bookrag response of different query types\nType: IMAGE" + }, + { + "entity_name": "single-hop case from qasper", + "entity_type": "SECTION_TITLE", + "description": "The title of the first section detailing a single-hop query example involving a reward model for reinforcement learning.", + "source_ids": [ + 182 + ], + "id": "Name: single-hop case from qasper\nType: SECTION_TITLE" + }, + { + "entity_name": "select_by_entity operator", + "entity_type": "SOFTWARE", + "description": "An operator that identifies relevant sub-trees (e.g., Introduction, Related work) to prune the reasoning space.", + "source_ids": [ + 182 + ], + "id": "Name: select_by_entity operator\nType: SOFTWARE" + }, + { + "entity_name": "graph_reasoning", + "entity_type": "TASK_OR_PROBLEM", + "description": "A reasoning step performed after the Select_by_Entity operator focuses on a specific scope.", + "source_ids": [ + 182 + ], + "id": "Name: graph_reasoning\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "text_reasoning", + "entity_type": 
"TASK_OR_PROBLEM", + "description": "A reasoning step involved in retrieving nodes for the final response.", + "source_ids": [ + 182 + ], + "id": "Name: text_reasoning\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "skyline_ranker", + "entity_type": "SOFTWARE", + "description": "An operator used to retrieve 8 nodes for the final response based on focused scope.", + "source_ids": [ + 182 + ], + "id": "Name: skyline_ranker\nType: SOFTWARE" + }, + { + "entity_name": "binary reward system", + "entity_type": "TECHNOLOGY", + "description": "A system that evaluates the success or failure of dialog interactions with a discount factor.", + "source_ids": [ + 182 + ], + "id": "Name: binary reward system\nType: TECHNOLOGY" + }, + { + "entity_name": "discount factor", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "A variable used in the reward model calculation, specifically noted as 0.95 in the text.", + "source_ids": [ + 182 + ], + "id": "Name: discount factor\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "multi-hop case from qasper", + "entity_type": "SECTION_TITLE", + "description": "The title of the second section detailing a multi-hop query comparing interpretable systems and LSTM models.", + "source_ids": [ + 182 + ], + "id": "Name: multi-hop case from qasper\nType: SECTION_TITLE" + }, + { + "entity_name": "interpretable system", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "A system type compared against LSTM-ELMo, utilizing vectors and cosine distance.", + "source_ids": [ + 182 + ], + "id": "Name: interpretable system\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "lstm with elmo system", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "A machine learning model mentioned in the comparison, achieving an accuracy of 0.6818.", + "source_ids": [ + 182 + ], + "id": "Name: lstm with elmo system\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "lstm-elmo net", + "entity_type": "MODEL_OR_ARCHITECTURE", + 
"description": "Another reference to the Long Short-Term Memory network combined with ELMo embeddings.", + "source_ids": [ + 182 + ], + "id": "Name: lstm-elmo net\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "diacritic swapping", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A method mentioned as showing remarkably poor performance in the context of the experiment.", + "source_ids": [ + 182 + ], + "id": "Name: diacritic swapping\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "cross-entropy", + "entity_type": "EVALUATION_METRIC", + "description": "The loss measure used for the test results in the multi-hop query analysis.", + "source_ids": [ + 182 + ], + "id": "Name: cross-entropy\nType: EVALUATION_METRIC" + }, + { + "entity_name": "decompose operator", + "entity_type": "SOFTWARE", + "description": "An operator used in Agent-based Planning for multi-hop queries to break down the question.", + "source_ids": [ + 182 + ], + "id": "Name: decompose operator\nType: SOFTWARE" + }, + { + "entity_name": "global aggregation case from mmlongbench", + "entity_type": "SECTION_TITLE", + "description": "The title of the third section detailing a global query about counting charts in a document.", + "source_ids": [ + 182 + ], + "id": "Name: global aggregation case from mmlongbench\nType: SECTION_TITLE" + }, + { + "entity_name": "filter operators", + "entity_type": "SOFTWARE", + "description": "Operators applied to filter data based on specific criteria like page range or modality.", + "source_ids": [ + 182 + ], + "id": "Name: filter operators\nType: SOFTWARE" + }, + { + "entity_name": "filter_range", + "entity_type": "SOFTWARE", + "description": "A filter operator specifying a range of pages (e.g., '1-10') to search within.", + "source_ids": [ + 182 + ], + "id": "Name: filter_range\nType: SOFTWARE" + }, + { + "entity_name": "filter_modal", + "entity_type": "SOFTWARE", + "description": "A filter operator specifying the modality of content, such as 
'image'.", + "source_ids": [ + 182 + ], + "id": "Name: filter_modal\nType: SOFTWARE" + }, + { + "entity_name": "reduce", + "entity_type": "SOFTWARE", + "description": "A process step that synthesizes the final output after analyzing images.", + "source_ids": [ + 182 + ], + "id": "Name: reduce\nType: SOFTWARE" + }, + { + "entity_name": "image cref='#/texts/282'", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 182 + ], + "id": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "entity_name": "200", + "entity_type": "MEASUREMENT", + "description": "200 is the number of sampled queries used in the error analysis", + "source_ids": [ + 183 + ], + "id": "Name: 200\nType: MEASUREMENT" + }, + { + "entity_name": "error analysis", + "entity_type": "TASK_OR_PROBLEM", + "description": "error analysis is the task being performed on the sampled queries from the datasets", + "source_ids": [ + 183 + ], + "id": "Name: error analysis\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "cref='#/texts/348'", + "entity_type": "IMAGE", + "description": "A figure containing two funnel diagrams comparing error analysis for the MMLongBench and Qasper datasets.", + "source_ids": [ + 184 + ], + "id": "Name: cref='#/texts/348'\nType: IMAGE" + }, + { + "entity_name": "all queries (200)", + "entity_type": "MEASUREMENT", + "description": "The initial total number of queries processed in both the MMLongBench and Qasper experiments.", + "source_ids": [ + 184 + ], + "id": "Name: all queries (200)\nType: MEASUREMENT" + }, + { + "entity_name": "successful parsing (194)", + "entity_type": "MEASUREMENT", + "description": "The count of queries that were successfully parsed within the MMLongBench experiment.", + "source_ids": [ + 184 + ], + "id": "Name: successful parsing (194)\nType: MEASUREMENT" + }, + { + "entity_name": "retrieval error (52)", + "entity_type": "TASK_OR_PROBLEM", + "description": "The count of errors attributed to retrieval failures in the MMLongBench 
experiment.", + "source_ids": [ + 184 + ], + "id": "Name: retrieval error (52)\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "generation error (36)", + "entity_type": "TASK_OR_PROBLEM", + "description": "The count of errors attributed to generation failures in the MMLongBench experiment.", + "source_ids": [ + 184 + ], + "id": "Name: generation error (36)\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "plan error (27)", + "entity_type": "TASK_OR_PROBLEM", + "description": "The count of errors attributed to planning failures in the MMLongBench experiment.", + "source_ids": [ + 184 + ], + "id": "Name: plan error (27)\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "parsing error (6)", + "entity_type": "TASK_OR_PROBLEM", + "description": "The count of errors attributed to parsing failures in the MMLongBench experiment.", + "source_ids": [ + 184 + ], + "id": "Name: parsing error (6)\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "correct (79)", + "entity_type": "EVALUATION_METRIC", + "description": "The final count of correctly answered queries in the MMLongBench experiment.", + "source_ids": [ + 184 + ], + "id": "Name: correct (79)\nType: EVALUATION_METRIC" + }, + { + "entity_name": "successful parsing (193)", + "entity_type": "MEASUREMENT", + "description": "The count of queries that were successfully parsed within the Qasper experiment.", + "source_ids": [ + 184 + ], + "id": "Name: successful parsing (193)\nType: MEASUREMENT" + }, + { + "entity_name": "generation error (30)", + "entity_type": "TASK_OR_PROBLEM", + "description": "The count of errors attributed to generation failures in the Qasper experiment.", + "source_ids": [ + 184 + ], + "id": "Name: generation error (30)\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "retrieval error (26)", + "entity_type": "TASK_OR_PROBLEM", + "description": "The count of errors attributed to retrieval failures in the Qasper experiment.", + "source_ids": [ + 184 + ], + "id": "Name: retrieval error (26)\nType: 
TASK_OR_PROBLEM" + }, + { + "entity_name": "plan error (20)", + "entity_type": "TASK_OR_PROBLEM", + "description": "The count of errors attributed to planning failures in the Qasper experiment.", + "source_ids": [ + 184 + ], + "id": "Name: plan error (20)\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "parsing error (7)", + "entity_type": "TASK_OR_PROBLEM", + "description": "The count of errors attributed to parsing failures in the Qasper experiment.", + "source_ids": [ + 184 + ], + "id": "Name: parsing error (7)\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "correct (117)", + "entity_type": "EVALUATION_METRIC", + "description": "The final count of correctly answered queries in the Qasper experiment.", + "source_ids": [ + 184 + ], + "id": "Name: correct (117)\nType: EVALUATION_METRIC" + }, + { + "entity_name": "pdf parsing", + "entity_type": "TASK_OR_PROBLEM", + "description": "pdf parsing is identified as a task or problem area within the context of the study", + "source_ids": [ + 185 + ], + "id": "Name: pdf parsing\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "retrieval error", + "entity_type": "TASK_OR_PROBLEM", + "description": "retrieval error is the dominant failure mode identified in the results", + "source_ids": [ + 185 + ], + "id": "Name: retrieval error\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "generation error", + "entity_type": "TASK_OR_PROBLEM", + "description": "generation error is the second most common failure mode identified in the results", + "source_ids": [ + 185 + ], + "id": "Name: generation error\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "plan error", + "entity_type": "TASK_OR_PROBLEM", + "description": "plan error is a specific failure pattern where the planner over decomposes queries", + "source_ids": [ + 185 + ], + "id": "Name: plan error\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "multimodal evidence", + "entity_type": "TASK_OR_PROBLEM", + "description": "multimodal evidence is the type of information 
that is challenging to locate and synthesize", + "source_ids": [ + 185 + ], + "id": "Name: multimodal evidence\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "single hop queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "single hop queries are detailed queries that are incorrectly decomposed by the planner", + "source_ids": [ + 185 + ], + "id": "Name: single hop queries\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "multi hop sub tasks", + "entity_type": "TASK_OR_PROBLEM", + "description": "multi hop sub tasks are unnecessary tasks created by the over decomposition of single hop queries", + "source_ids": [ + 185 + ], + "id": "Name: multi hop sub tasks\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "disjointed retrieval paths", + "entity_type": "TASK_OR_PROBLEM", + "description": "disjointed retrieval paths are the result of fragmentation preventing cohesive synthesis", + "source_ids": [ + 185 + ], + "id": "Name: disjointed retrieval paths\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "cohesive final answer", + "entity_type": "TASK_OR_PROBLEM", + "description": "cohesive final answer is the desired outcome that is prevented by disjointed retrieval paths", + "source_ids": [ + 185 + ], + "id": "Name: cohesive final answer\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "model", + "entity_type": "TASK_OR_PROBLEM", + "description": "the model is the entity attempting to synthesize answers from sub responses", + "source_ids": [ + 185 + ], + "id": "Name: model\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "planner", + "entity_type": "TASK_OR_PROBLEM", + "description": "the planner is the component that tends to over decompose queries", + "source_ids": [ + 185 + ], + "id": "Name: planner\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "qualitative analysis", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "qualitative analysis is the method used to reveal specific failure patterns", + "source_ids": [ + 185 + ], + "id": "Name: 
qualitative analysis\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "results", + "entity_type": "TASK_OR_PROBLEM", + "description": "the results are the findings that identify retrieval error as the dominant failure mode", + "source_ids": [ + 185 + ], + "id": "Name: results\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "fragmentation", + "entity_type": "TASK_OR_PROBLEM", + "description": "fragmentation is the process leading to disjointed retrieval paths", + "source_ids": [ + 185 + ], + "id": "Name: fragmentation\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "scattered sub responses", + "entity_type": "TASK_OR_PROBLEM", + "description": "scattered sub responses are the outputs that fail to form a cohesive answer", + "source_ids": [ + 185 + ], + "id": "Name: scattered sub responses\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "global queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "global queries are a type of query case processed by bookrag s answering workflow", + "source_ids": [ + 186 + ], + "id": "Name: global queries\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "select", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "select is a specific operator leveraged by bookrag to prune search spaces", + "source_ids": [ + 186 + ], + "id": "Name: select\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "filter", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "filter is a specific operator leveraged by bookrag to prune search spaces", + "source_ids": [ + 186 + ], + "id": "Name: filter\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "134", + "entity_type": "MEASUREMENT", + "description": "134 represents the initial number of nodes in the reasoning space for the single hop case", + "source_ids": [ + 186 + ], + "id": "Name: 134\nType: MEASUREMENT" + }, + { + "entity_name": "24", + "entity_type": "MEASUREMENT", + "description": "24 represents the reduced number of nodes in the reasoning space for the single 
hop case", + "source_ids": [ + 186 + ], + "id": "Name: 24\nType: MEASUREMENT" + }, + { + "entity_name": "answering workflow", + "entity_type": "TASK_OR_PROBLEM", + "description": "answering workflow is the process illustrated by figure 8 that bookrag uses to handle queries", + "source_ids": [ + 186 + ], + "id": "Name: answering workflow\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "search spaces", + "entity_type": "TASK_OR_PROBLEM", + "description": "search spaces are the areas that bookrag prunes using specific operators to improve efficiency", + "source_ids": [ + 186 + ], + "id": "Name: search spaces\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "relevant evidence", + "entity_type": "TASK_OR_PROBLEM", + "description": "relevant evidence is the specific information that bookrag isolates from noise", + "source_ids": [ + 186 + ], + "id": "Name: relevant evidence\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "noise", + "entity_type": "TASK_OR_PROBLEM", + "description": "noise refers to irrelevant data from which bookrag isolates relevant evidence", + "source_ids": [ + 186 + ], + "id": "Name: noise\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "precise answer generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "precise answer generation is the outcome ensured by bookrag s ability to isolate relevant evidence", + "source_ids": [ + 186 + ], + "id": "Name: precise answer generation\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "7 conclusion", + "entity_type": "SECTION_TITLE", + "description": "As the final substantive section of the paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section summarizes the key contributions, specifically the BookRAG framework and BookIndex structure, and highlights the state-of-the-art performance achieved in retrieval recall and QA accuracy.", + "source_ids": [ + 187 + ], + "id": "Name: 7 conclusion\nType: SECTION_TITLE" + 
}, + { + "entity_name": "book index", + "entity_type": "PRODUCT", + "description": "book index is a document native structured tree graph index designed to capture intricate relations of structural documents", + "source_ids": [ + 188 + ], + "id": "Name: book index\nType: PRODUCT" + }, + { + "entity_name": "agent based method", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "an agent based method is employed to dynamically configure retrieval and reasoning operators", + "source_ids": [ + 188 + ], + "id": "Name: agent based method\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "retrieval precision", + "entity_type": "EVALUATION_METRIC", + "description": "retrieval precision is a metric where the proposed approach demonstrates significant superiority over existing baselines", + "source_ids": [ + 188 + ], + "id": "Name: retrieval precision\nType: EVALUATION_METRIC" + }, + { + "entity_name": "answer accuracy", + "entity_type": "EVALUATION_METRIC", + "description": "answer accuracy is a metric where the proposed approach demonstrates significant superiority over existing baselines", + "source_ids": [ + 188 + ], + "id": "Name: answer accuracy\nType: EVALUATION_METRIC" + }, + { + "entity_name": "benchmarks", + "entity_type": "BENCHMARK", + "description": "benchmarks are multiple tests on which the approach achieves state of the art performance", + "source_ids": [ + 188 + ], + "id": "Name: benchmarks\nType: BENCHMARK" + }, + { + "entity_name": "document native database system", + "entity_type": "PRODUCT", + "description": "a document native database system is a future exploration goal that supports data formatting knowledge extraction and intelligent querying", + "source_ids": [ + 188 + ], + "id": "Name: document native database system\nType: PRODUCT" + }, + { + "entity_name": "paper", + "entity_type": "PUBLICATION_VENUE", + "description": "the paper is the document in which the bookrag method is proposed", + "source_ids": [ + 188 + ], + "id": "Name: 
paper\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "tree graph index", + "entity_type": "TECHNOLOGY", + "description": "the tree graph index is the specific structure of the book index document native system", + "source_ids": [ + 188 + ], + "id": "Name: tree graph index\nType: TECHNOLOGY" + }, + { + "entity_name": "retrieval operators", + "entity_type": "SOFTWARE", + "description": "retrieval operators are components dynamically configured by the agent based method", + "source_ids": [ + 188 + ], + "id": "Name: retrieval operators\nType: SOFTWARE" + }, + { + "entity_name": "reasoning operators", + "entity_type": "SOFTWARE", + "description": "reasoning operators are components dynamically configured by the agent based method", + "source_ids": [ + 188 + ], + "id": "Name: reasoning operators\nType: SOFTWARE" + }, + { + "entity_name": "data formatting", + "entity_type": "TASK_OR_PROBLEM", + "description": "data formatting is a capability supported by the future document native database system", + "source_ids": [ + 188 + ], + "id": "Name: data formatting\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "knowledge extraction", + "entity_type": "TASK_OR_PROBLEM", + "description": "knowledge extraction is a capability supported by the future document native database system", + "source_ids": [ + 188 + ], + "id": "Name: knowledge extraction\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "intelligent querying", + "entity_type": "TASK_OR_PROBLEM", + "description": "intelligent querying is a capability supported by the future document native database system", + "source_ids": [ + 188 + ], + "id": "Name: intelligent querying\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "12", + "entity_type": "MEASUREMENT", + "description": "12 is a numerical value mentioned in the text potentially representing a count or measurement", + "source_ids": [ + 189 + ], + "id": "Name: 12\nType: MEASUREMENT" + }, + { + "entity_name": "references", + "entity_type": "SECTION_TITLE", + 
"description": "The references section, appearing as a top-level component following the main title \"BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents,\" functions as the bibliography for the paper by listing all cited works and sources that support the research presented. Additionally, it is recognized as a structural part of a document within the definition of section filters.", + "source_ids": [ + 258, + 190 + ], + "id": "Name: references\nType: SECTION_TITLE" + }, + { + "entity_name": "simran arora", + "entity_type": "PERSON", + "description": "simran arora is listed as one of the authors of the paper", + "source_ids": [ + 191 + ], + "id": "Name: simran arora\nType: PERSON" + }, + { + "entity_name": "brandon yang", + "entity_type": "PERSON", + "description": "brandon yang is listed as one of the authors of the paper", + "source_ids": [ + 191 + ], + "id": "Name: brandon yang\nType: PERSON" + }, + { + "entity_name": "sabri eyuboglu", + "entity_type": "PERSON", + "description": "sabri eyuboglu is listed as one of the authors of the paper", + "source_ids": [ + 191 + ], + "id": "Name: sabri eyuboglu\nType: PERSON" + }, + { + "entity_name": "avanika narayan", + "entity_type": "PERSON", + "description": "avanika narayan is listed as one of the authors of the paper", + "source_ids": [ + 191 + ], + "id": "Name: avanika narayan\nType: PERSON" + }, + { + "entity_name": "andrew hojel", + "entity_type": "PERSON", + "description": "andrew hojel is listed as one of the authors of the paper", + "source_ids": [ + 191 + ], + "id": "Name: andrew hojel\nType: PERSON" + }, + { + "entity_name": "immanuel trummer", + "entity_type": "PERSON", + "description": "immanuel trummer is listed as one of the authors of the paper", + "source_ids": [ + 191 + ], + "id": "Name: immanuel trummer\nType: PERSON" + }, + { + "entity_name": "christopher r", + "entity_type": "PERSON", + "description": "christopher r is listed as one of 
the authors of the paper", + "source_ids": [ + 191 + ], + "id": "Name: christopher r\nType: PERSON" + }, + { + "entity_name": "language models", + "entity_type": "TECHNOLOGY", + "description": "language models are the technology enabling the simple systems described in the paper", + "source_ids": [ + 191 + ], + "id": "Name: language models\nType: TECHNOLOGY" + }, + { + "entity_name": "heterogeneous data lakes", + "entity_type": "DATASET_OR_CORPUS", + "description": "heterogeneous data lakes are the type of data being structured by the systems in the paper", + "source_ids": [ + 191 + ], + "id": "Name: heterogeneous data lakes\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "17", + "entity_type": "MEASUREMENT", + "description": "17 refers to the volume number of the publication", + "source_ids": [ + 191 + ], + "id": "Name: 17\nType: MEASUREMENT" + }, + { + "entity_name": "simple systems", + "entity_type": "PRODUCT", + "description": "simple systems are the systems generated by language models as described in the paper title", + "source_ids": [ + 191 + ], + "id": "Name: simple systems\nType: PRODUCT" + }, + { + "entity_name": "structured views", + "entity_type": "PRODUCT", + "description": "structured views are the output generated for heterogeneous data lakes in the paper", + "source_ids": [ + 191 + ], + "id": "Name: structured views\nType: PRODUCT" + }, + { + "entity_name": "2023", + "entity_type": "DATE", + "description": "2023 is the year associated with the publication of the paper Self-RAG, the survey, and the corresponding arXiv preprint.", + "source_ids": [ + 193, + 200, + 205, + 207, + 191 + ], + "id": "Name: 2023\nType: DATE" + }, + { + "entity_name": "92 105", + "entity_type": "MEASUREMENT", + "description": "92 105 represents the page range of the article", + "source_ids": [ + 191 + ], + "id": "Name: 92 105\nType: MEASUREMENT" + }, + { + "entity_name": "akari asai", + "entity_type": "PERSON", + "description": "Akari Asai is an author of the 2023 paper 
titled \"Self-RAG.\"", + "source_ids": [ + 192, + 193 + ], + "id": "Name: akari asai\nType: PERSON" + }, + { + "entity_name": "zeqiu wu", + "entity_type": "PERSON", + "description": "Zeqiu Wu is an author of the 2023 paper titled \"Self-RAG\".", + "source_ids": [ + 192, + 193 + ], + "id": "Name: zeqiu wu\nType: PERSON" + }, + { + "entity_name": "yizhong wang", + "entity_type": "PERSON", + "description": "Yizhong Wang is an author of the 2023 paper titled \"Self-RAG\".", + "source_ids": [ + 192, + 193 + ], + "id": "Name: yizhong wang\nType: PERSON" + }, + { + "entity_name": "self rag", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "Self RAG is a method designed to learn retrieval, generation, and critique through the process of self-reflection.", + "source_ids": [ + 192, + 193 + ], + "id": "Name: self rag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "international conference on learning representations", + "entity_type": "PUBLICATION_VENUE", + "description": "international conference on learning representations iclr is the venue where the paper was published", + "source_ids": [ + 192 + ], + "id": "Name: international conference on learning representations\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "2024", + "entity_type": "DATE", + "description": "2024 is the year associated with the publication of the paper, the m3docrag preprint, the arxiv preprint, and the survey paper by the listed authors, serving as the date linked to the arxiv identifier and the authors' work.", + "source_ids": [ + 192, + 199, + 201, + 206, + 208, + 209, + 211, + 212, + 213, + 216 + ], + "id": "Name: 2024\nType: DATE" + }, + { + "entity_name": "et al", + "entity_type": "PERSON", + "description": "\"et al\" refers to additional authors of a paper who are not explicitly named or listed in the text.", + "source_ids": [ + 192, + 194, + 203, + 213 + ], + "id": "Name: et al\nType: PERSON" + }, + { + "entity_name": "iclr", + "entity_type": "PUBLICATION_VENUE", + 
"description": "iclr is the abbreviation for the international conference on learning representations where the paper was published", + "source_ids": [ + 192 + ], + "id": "Name: iclr\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "avirup sil", + "entity_type": "PERSON", + "description": "avirup sil is one of the authors of the 2023 paper titled self rag", + "source_ids": [ + 193 + ], + "id": "Name: avirup sil\nType: PERSON" + }, + { + "entity_name": "hannaneh hajishirzi", + "entity_type": "PERSON", + "description": "hannaneh hajishirzi is one of the authors of the 2023 paper titled self rag", + "source_ids": [ + 193 + ], + "id": "Name: hannaneh hajishirzi\nType: PERSON" + }, + { + "entity_name": "arxiv preprint arxiv 2310 11511", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv preprint arxiv 2310 11511 is the specific identifier and venue for the publication of the paper", + "source_ids": [ + 193 + ], + "id": "Name: arxiv preprint arxiv 2310 11511\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "learning to retrieve generate and critique through self reflection", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "this is the specific technique described in the text that the self rag model learns to perform", + "source_ids": [ + 193 + ], + "id": "Name: learning to retrieve generate and critique through self reflection\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "arxiv", + "entity_type": "ORGANIZATION", + "description": "ArXiv is an organization and platform that hosts preprints, including the specific preprint arxiv 2302 09051.", + "source_ids": [ + 193, + 205, + 207 + ], + "id": "Name: arxiv\nType: ORGANIZATION" + }, + { + "entity_name": "shuai bai", + "entity_type": "PERSON", + "description": "shuai bai is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "id": "Name: shuai bai\nType: PERSON" + }, + { + "entity_name": "keqin chen", + "entity_type": "PERSON", + "description": 
"keqin chen is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "id": "Name: keqin chen\nType: PERSON" + }, + { + "entity_name": "xuejing liu", + "entity_type": "PERSON", + "description": "xuejing liu is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "id": "Name: xuejing liu\nType: PERSON" + }, + { + "entity_name": "jialin wang", + "entity_type": "PERSON", + "description": "jialin wang is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "id": "Name: jialin wang\nType: PERSON" + }, + { + "entity_name": "wenbin ge", + "entity_type": "PERSON", + "description": "wenbin ge is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "id": "Name: wenbin ge\nType: PERSON" + }, + { + "entity_name": "sibo song", + "entity_type": "PERSON", + "description": "sibo song is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "id": "Name: sibo song\nType: PERSON" + }, + { + "entity_name": "kai dang", + "entity_type": "PERSON", + "description": "kai dang is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "id": "Name: kai dang\nType: PERSON" + }, + { + "entity_name": "peng wang", + "entity_type": "PERSON", + "description": "peng wang is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "id": "Name: peng wang\nType: PERSON" + }, + { + "entity_name": "shijie wang", + "entity_type": "PERSON", + "description": "shijie wang is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "id": "Name: shijie wang\nType: PERSON" + }, + { + "entity_name": "jun tang", + "entity_type": "PERSON", + "description": "jun tang is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "id": "Name: jun 
tang\nType: PERSON" + }, + { + "entity_name": "qwen2 5 vl technical report", + "entity_type": "PUBLICATION_VENUE", + "description": "qwen2 5 vl technical report is the title of the document authored by the listed individuals", + "source_ids": [ + 194 + ], + "id": "Name: qwen2 5 vl technical report\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "arxiv", + "entity_type": "PUBLICATION_VENUE", + "description": "ArXiv is a preprint server and platform where numerous papers and technical reports are published and hosted, including the Qwen2.5 VL technical report, a survey paper, the M3DocRAG preprint, and specific preprints such as arXiv:2404.16130 and arXiv:2403.14403.", + "source_ids": [ + 194, + 195, + 201, + 203, + 206, + 209, + 211, + 212, + 213 + ], + "id": "Name: arxiv\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "arxiv 2502 13923", + "entity_type": "FILE_TYPE", + "description": "arxiv 2502 13923 is the specific identifier for the preprint document", + "source_ids": [ + 194 + ], + "id": "Name: arxiv 2502 13923\nType: FILE_TYPE" + }, + { + "entity_name": "qwen2 5 vl", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "qwen2 5 vl is the specific model or architecture discussed in the technical report", + "source_ids": [ + 194 + ], + "id": "Name: qwen2 5 vl\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "preprint", + "entity_type": "FILE_TYPE", + "description": "A preprint indicates that the document is a preliminary version of a research paper.", + "source_ids": [ + 194, + 195 + ], + "id": "Name: preprint\nType: FILE_TYPE" + }, + { + "entity_name": "camille barboule", + "entity_type": "PERSON", + "description": "camille barboule is one of the authors of the 2025 survey on question answering over visually rich documents", + "source_ids": [ + 195 + ], + "id": "Name: camille barboule\nType: PERSON" + }, + { + "entity_name": "benjamin piwowarski", + "entity_type": "PERSON", + "description": "benjamin piwowarski is one of the authors of the 
2025 survey on question answering over visually rich documents", + "source_ids": [ + 195 + ], + "id": "Name: benjamin piwowarski\nType: PERSON" + }, + { + "entity_name": "yoan chabot", + "entity_type": "PERSON", + "description": "yoan chabot is one of the authors of the 2025 survey on question answering over visually rich documents", + "source_ids": [ + 195 + ], + "id": "Name: yoan chabot\nType: PERSON" + }, + { + "entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "entity_type": "BOOK", + "description": "this is the title of the survey paper published in 2025", + "source_ids": [ + 195 + ], + "id": "Name: survey on question answering over visually rich documents methods challenges and trends\nType: BOOK" + }, + { + "entity_name": "arxiv 2501 02235", + "entity_type": "FILE_TYPE", + "description": "arxiv 2501 02235 is the specific identifier for the preprint version of the survey", + "source_ids": [ + 195 + ], + "id": "Name: arxiv 2501 02235\nType: FILE_TYPE" + }, + { + "entity_name": "visually rich documents", + "entity_type": "DATASET_OR_CORPUS", + "description": "visually rich documents are the type of documents analyzed in the survey", + "source_ids": [ + 195 + ], + "id": "Name: visually rich documents\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "methods refers to the techniques discussed in the survey for handling visually rich documents", + "source_ids": [ + 195 + ], + "id": "Name: methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "challenges", + "entity_type": "TASK_OR_PROBLEM", + "description": "challenges refers to the difficulties identified in the field of question answering over visually rich documents", + "source_ids": [ + 195 + ], + "id": "Name: challenges\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "trends", + "entity_type": "RESEARCH_FIELD", + "description": "trends refers to the current directions and 
future outlooks in the research area", + "source_ids": [ + 195 + ], + "id": "Name: trends\nType: RESEARCH_FIELD" + }, + { + "entity_name": "yukun cao", + "entity_type": "PERSON", + "description": "yukun cao is listed as one of the authors of the paper titled lego graphrag", + "source_ids": [ + 196 + ], + "id": "Name: yukun cao\nType: PERSON" + }, + { + "entity_name": "zengyi gao", + "entity_type": "PERSON", + "description": "zengyi gao is listed as one of the authors of the paper titled lego graphrag", + "source_ids": [ + 196 + ], + "id": "Name: zengyi gao\nType: PERSON" + }, + { + "entity_name": "zhiyang li", + "entity_type": "PERSON", + "description": "zhiyang li is listed as one of the authors of the paper titled lego graphrag", + "source_ids": [ + 196 + ], + "id": "Name: zhiyang li\nType: PERSON" + }, + { + "entity_name": "xike xie", + "entity_type": "PERSON", + "description": "xike xie is listed as one of the authors of the paper titled lego graphrag", + "source_ids": [ + 196 + ], + "id": "Name: xike xie\nType: PERSON" + }, + { + "entity_name": "s kevin zhou", + "entity_type": "PERSON", + "description": "s kevin zhou is listed as one of the authors of the paper titled lego graphrag", + "source_ids": [ + 196 + ], + "id": "Name: s kevin zhou\nType: PERSON" + }, + { + "entity_name": "jianliang xu", + "entity_type": "PERSON", + "description": "jianliang xu is listed as one of the authors of the paper titled lego graphrag", + "source_ids": [ + 196 + ], + "id": "Name: jianliang xu\nType: PERSON" + }, + { + "entity_name": "lego graphrag", + "entity_type": "PRODUCT", + "description": "lego graphrag is a modularized graph based retrieval augmented generation system designed for design space exploration", + "source_ids": [ + 196 + ], + "id": "Name: lego graphrag\nType: PRODUCT" + }, + { + "entity_name": "proc vldb endow", + "entity_type": "PUBLICATION_VENUE", + "description": "proc vldb endow is the publication venue where the paper was published", + "source_ids": [ + 
196 + ], + "id": "Name: proc vldb endow\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "june 2025", + "entity_type": "DATE", + "description": "june 2025 is the specific date of publication for the paper", + "source_ids": [ + 196 + ], + "id": "Name: june 2025\nType: DATE" + }, + { + "entity_name": "3269 3283", + "entity_type": "MEASUREMENT", + "description": "3269 3283 represents the page range of the article in the publication", + "source_ids": [ + 196 + ], + "id": "Name: 3269 3283\nType: MEASUREMENT" + }, + { + "entity_name": "18", + "entity_type": "MEASUREMENT", + "description": "18 is the volume number of the publication and the proceedings of the VLDB Endowment where the paper was published.", + "source_ids": [ + 196, + 197 + ], + "id": "Name: 18\nType: MEASUREMENT" + }, + { + "entity_name": "design space exploration", + "entity_type": "TASK_OR_PROBLEM", + "description": "design space exploration is the specific problem domain that the lego graphrag system is designed to address", + "source_ids": [ + 196 + ], + "id": "Name: design space exploration\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "graph based retrieval augmented generation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "graph based retrieval augmented generation is the underlying technique being modularized in the paper", + "source_ids": [ + 196 + ], + "id": "Name: graph based retrieval augmented generation\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "modularizing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "modularizing is the specific method or approach applied to the graph based retrieval augmented generation system", + "source_ids": [ + 196 + ], + "id": "Name: modularizing\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "https doi org 10 14778 3748191 3748194", + "entity_type": "URL", + "description": "https doi org 10 14778 3748191 3748194 is the digital object identifier link for the paper", + "source_ids": [ + 196 + ], + "id": "Name: https 
doi org 10 14778 3748191 3748194\nType: URL" + }, + { + "entity_name": "chengliang chai", + "entity_type": "PERSON", + "description": "Chengliang Chai is an author who has contributed to the paper titled \"Doctopus: Budget Aware Structural Table Extraction from Unstructured Documents\" and is also listed as one of the authors of the paper titled \"Haipipe.\"", + "source_ids": [ + 200, + 197 + ], + "id": "Name: chengliang chai\nType: PERSON" + }, + { + "entity_name": "jiajun li", + "entity_type": "PERSON", + "description": "jiajun li is an author of the paper titled doctopus budget aware structural table extraction from unstructured documents", + "source_ids": [ + 197 + ], + "id": "Name: jiajun li\nType: PERSON" + }, + { + "entity_name": "yuhao deng", + "entity_type": "PERSON", + "description": "yuhao deng is an author of the paper titled doctopus budget aware structural table extraction from unstructured documents", + "source_ids": [ + 197 + ], + "id": "Name: yuhao deng\nType: PERSON" + }, + { + "entity_name": "yuanhao zhong", + "entity_type": "PERSON", + "description": "yuanhao zhong is an author of the paper titled doctopus budget aware structural table extraction from unstructured documents", + "source_ids": [ + 197 + ], + "id": "Name: yuanhao zhong\nType: PERSON" + }, + { + "entity_name": "ye yuan", + "entity_type": "PERSON", + "description": "ye yuan is an author of the paper titled doctopus budget aware structural table extraction from unstructured documents", + "source_ids": [ + 197 + ], + "id": "Name: ye yuan\nType: PERSON" + }, + { + "entity_name": "guoren wang", + "entity_type": "PERSON", + "description": "guoren wang is an author of the paper titled doctopus budget aware structural table extraction from unstructured documents", + "source_ids": [ + 197 + ], + "id": "Name: guoren wang\nType: PERSON" + }, + { + "entity_name": "lei cao", + "entity_type": "PERSON", + "description": "lei cao is an author of the paper titled doctopus budget aware structural 
table extraction from unstructured documents", + "source_ids": [ + 197 + ], + "id": "Name: lei cao\nType: PERSON" + }, + { + "entity_name": "doctopus", + "entity_type": "PRODUCT", + "description": "doctopus is a system or method for budget aware structural table extraction from unstructured documents", + "source_ids": [ + 197 + ], + "id": "Name: doctopus\nType: PRODUCT" + }, + { + "entity_name": "budget aware structural table extraction", + "entity_type": "TASK_OR_PROBLEM", + "description": "budget aware structural table extraction is the specific task addressed by the doctopus system described in the text", + "source_ids": [ + 197 + ], + "id": "Name: budget aware structural table extraction\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "unstructured documents", + "entity_type": "DATASET_OR_CORPUS", + "description": "unstructured documents are the source material from which structural tables are extracted in the described work", + "source_ids": [ + 197 + ], + "id": "Name: unstructured documents\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "11", + "entity_type": "MEASUREMENT", + "description": "11 is the issue number of the proceedings of the vldb endowment where the paper was published", + "source_ids": [ + 197 + ], + "id": "Name: 11\nType: MEASUREMENT" + }, + { + "entity_name": "3695 3707", + "entity_type": "MEASUREMENT", + "description": "3695 3707 represents the page range of the paper within the publication", + "source_ids": [ + 197 + ], + "id": "Name: 3695 3707\nType: MEASUREMENT" + }, + { + "entity_name": "ilias chalkidis", + "entity_type": "PERSON", + "description": "ilias chalkidis is one of the authors of the 2020 arxiv preprint titled legal bert", + "source_ids": [ + 198 + ], + "id": "Name: ilias chalkidis\nType: PERSON" + }, + { + "entity_name": "manos fergadiotis", + "entity_type": "PERSON", + "description": "manos fergadiotis is one of the authors of the 2020 arxiv preprint titled legal bert", + "source_ids": [ + 198 + ], + "id": "Name: 
manos fergadiotis\nType: PERSON" + }, + { + "entity_name": "prodromos malakasiotis", + "entity_type": "PERSON", + "description": "prodromos malakasiotis is one of the authors of the 2020 arxiv preprint titled legal bert", + "source_ids": [ + 198 + ], + "id": "Name: prodromos malakasiotis\nType: PERSON" + }, + { + "entity_name": "nikolaos aletras", + "entity_type": "PERSON", + "description": "nikolaos aletras is one of the authors of the 2020 arxiv preprint titled legal bert", + "source_ids": [ + 198 + ], + "id": "Name: nikolaos aletras\nType: PERSON" + }, + { + "entity_name": "ion androutsopoulos", + "entity_type": "PERSON", + "description": "ion androutsopoulos is one of the authors of the 2020 arxiv preprint titled legal bert", + "source_ids": [ + 198 + ], + "id": "Name: ion androutsopoulos\nType: PERSON" + }, + { + "entity_name": "legal bert", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "legal bert is a model described as the muppets straight out of law school in the text", + "source_ids": [ + 198 + ], + "id": "Name: legal bert\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "arxiv preprint arxiv 2010 02559", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv preprint arxiv 2010 02559 is the specific publication venue and identifier for the paper", + "source_ids": [ + 198 + ], + "id": "Name: arxiv preprint arxiv 2010 02559\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "2020", + "entity_type": "DATE", + "description": "2020 is the year the paper was published.", + "source_ids": [ + 202, + 198 + ], + "id": "Name: 2020\nType: DATE" + }, + { + "entity_name": "muppets", + "entity_type": "PRODUCT", + "description": "muppets is a metaphorical term used in the text to describe the legal bert model", + "source_ids": [ + 198 + ], + "id": "Name: muppets\nType: PRODUCT" + }, + { + "entity_name": "law school", + "entity_type": "LOCATION", + "description": "law school is a location mentioned metaphorically to indicate the origin or 
training context of the legal bert model", + "source_ids": [ + 198 + ], + "id": "Name: law school\nType: LOCATION" + }, + { + "entity_name": "sibei chen", + "entity_type": "PERSON", + "description": "Sibei Chen is an author of the paper titled \"Auto Formula\" and is also listed as one of the authors of the paper titled \"Haipipe.\"", + "source_ids": [ + 200, + 199 + ], + "id": "Name: sibei chen\nType: PERSON" + }, + { + "entity_name": "yeye he", + "entity_type": "PERSON", + "description": "yeye he is listed as an author of the paper titled auto formula", + "source_ids": [ + 199 + ], + "id": "Name: yeye he\nType: PERSON" + }, + { + "entity_name": "weiwei cui", + "entity_type": "PERSON", + "description": "weiwei cui is listed as an author of the paper titled auto formula", + "source_ids": [ + 199 + ], + "id": "Name: weiwei cui\nType: PERSON" + }, + { + "entity_name": "ju fan", + "entity_type": "PERSON", + "description": "Ju Fan is listed as an author of the paper titled Auto Formula and is also one of the authors of the paper titled Haipipe.", + "source_ids": [ + 200, + 199 + ], + "id": "Name: ju fan\nType: PERSON" + }, + { + "entity_name": "song ge", + "entity_type": "PERSON", + "description": "song ge is listed as an author of the paper titled auto formula", + "source_ids": [ + 199 + ], + "id": "Name: song ge\nType: PERSON" + }, + { + "entity_name": "haidong zhang", + "entity_type": "PERSON", + "description": "haidong zhang is listed as an author of the paper titled auto formula", + "source_ids": [ + 199 + ], + "id": "Name: haidong zhang\nType: PERSON" + }, + { + "entity_name": "dongmei zhang", + "entity_type": "PERSON", + "description": "dongmei zhang is listed as an author of the paper titled auto formula", + "source_ids": [ + 199 + ], + "id": "Name: dongmei zhang\nType: PERSON" + }, + { + "entity_name": "surajit chaudhuri", + "entity_type": "PERSON", + "description": "surajit chaudhuri is listed as an author of the paper titled auto formula", + "source_ids": [ 
+ 199 + ], + "id": "Name: surajit chaudhuri\nType: PERSON" + }, + { + "entity_name": "auto formula", + "entity_type": "PRODUCT", + "description": "auto formula is a system or method recommended in the paper for recommending formulas in spreadsheets using contrastive learning", + "source_ids": [ + 199 + ], + "id": "Name: auto formula\nType: PRODUCT" + }, + { + "entity_name": "proceedings of the acm on management of data", + "entity_type": "PUBLICATION_VENUE", + "description": "Proceedings of the ACM on Management of Data is a publication venue where papers, including those published in 2024, are released.", + "source_ids": [ + 200, + 199 + ], + "id": "Name: proceedings of the acm on management of data\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "1 27", + "entity_type": "MEASUREMENT", + "description": "1 27 represents the page range of the article in the publication", + "source_ids": [ + 199 + ], + "id": "Name: 1 27\nType: MEASUREMENT" + }, + { + "entity_name": "table representations", + "entity_type": "DATASET_OR_CORPUS", + "description": "table representations is the subject of the contrastive learning method used in the paper", + "source_ids": [ + 199 + ], + "id": "Name: table representations\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "contrastive learning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "contrastive learning is the technique used to recommend formulas in spreadsheets", + "source_ids": [ + 199 + ], + "id": "Name: contrastive learning\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "spreadsheets", + "entity_type": "PRODUCT", + "description": "spreadsheets are the application domain where the auto formula system recommends formulas", + "source_ids": [ + 199 + ], + "id": "Name: spreadsheets\nType: PRODUCT" + }, + { + "entity_name": "formulas", + "entity_type": "PRODUCT", + "description": "formulas are the specific items being recommended by the auto formula system", + "source_ids": [ + 199 + ], + "id": "Name: 
formulas\nType: PRODUCT" + }, + { + "entity_name": "nan tang", + "entity_type": "PERSON", + "description": "nan tang is listed as one of the authors of the paper titled haipipe", + "source_ids": [ + 200 + ], + "id": "Name: nan tang\nType: PERSON" + }, + { + "entity_name": "xuemi yan", + "entity_type": "PERSON", + "description": "xuemi yan is listed as one of the authors of the paper titled haipipe", + "source_ids": [ + 200 + ], + "id": "Name: xuemi yan\nType: PERSON" + }, + { + "entity_name": "guoliang li", + "entity_type": "PERSON", + "description": "guoliang li is listed as one of the authors of the paper titled haipipe", + "source_ids": [ + 200 + ], + "id": "Name: guoliang li\nType: PERSON" + }, + { + "entity_name": "xiaoyong du", + "entity_type": "PERSON", + "description": "xiaoyong du is listed as one of the authors of the paper titled haipipe", + "source_ids": [ + 200 + ], + "id": "Name: xiaoyong du\nType: PERSON" + }, + { + "entity_name": "haipipe", + "entity_type": "PRODUCT", + "description": "haipipe is a system or method described in the paper that combines human generated and machine generated pipelines for data preparation", + "source_ids": [ + 200 + ], + "id": "Name: haipipe\nType: PRODUCT" + }, + { + "entity_name": "1 26", + "entity_type": "MEASUREMENT", + "description": "1 26 refers to the page range of the paper in the publication", + "source_ids": [ + 200 + ], + "id": "Name: 1 26\nType: MEASUREMENT" + }, + { + "entity_name": "acm", + "entity_type": "ORGANIZATION", + "description": "acm is the organization associated with the publication venue mentioned in the text", + "source_ids": [ + 200 + ], + "id": "Name: acm\nType: ORGANIZATION" + }, + { + "entity_name": "data preparation", + "entity_type": "TASK_OR_PROBLEM", + "description": "data preparation is the specific task addressed by the haipipe system described in the text", + "source_ids": [ + 200 + ], + "id": "Name: data preparation\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "human 
generated pipelines", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "human generated pipelines are a type of pipeline combined with machine generated ones in the haipipe system", + "source_ids": [ + 200 + ], + "id": "Name: human generated pipelines\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "machine generated pipelines", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "machine generated pipelines are a type of pipeline combined with human generated ones in the haipipe system", + "source_ids": [ + 200 + ], + "id": "Name: machine generated pipelines\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "jaemin cho", + "entity_type": "PERSON", + "description": "jaemin cho is an author of the 2024 arxiv preprint titled m3docrag", + "source_ids": [ + 201 + ], + "id": "Name: jaemin cho\nType: PERSON" + }, + { + "entity_name": "debanjan mahata", + "entity_type": "PERSON", + "description": "debanjan mahata is an author of the 2024 arxiv preprint titled m3docrag", + "source_ids": [ + 201 + ], + "id": "Name: debanjan mahata\nType: PERSON" + }, + { + "entity_name": "ozan irsoy", + "entity_type": "PERSON", + "description": "ozan irsoy is an author of the 2024 arxiv preprint titled m3docrag", + "source_ids": [ + 201 + ], + "id": "Name: ozan irsoy\nType: PERSON" + }, + { + "entity_name": "yujie he", + "entity_type": "PERSON", + "description": "yujie he is an author of the 2024 arxiv preprint titled m3docrag", + "source_ids": [ + 201 + ], + "id": "Name: yujie he\nType: PERSON" + }, + { + "entity_name": "mohit bansal", + "entity_type": "PERSON", + "description": "mohit bansal is an author of the 2024 arxiv preprint titled m3docrag", + "source_ids": [ + 201 + ], + "id": "Name: mohit bansal\nType: PERSON" + }, + { + "entity_name": "m3docrag", + "entity_type": "PRODUCT", + "description": "m3docrag is a multi modal retrieval system designed for multi page multidocument understanding", + "source_ids": [ + 201 + ], + "id": "Name: m3docrag\nType: PRODUCT" + 
}, + { + "entity_name": "arxiv 2411 04952", + "entity_type": "FILE_TYPE", + "description": "arxiv 2411 04952 is the specific identifier for the m3docrag preprint", + "source_ids": [ + 201 + ], + "id": "Name: arxiv 2411 04952\nType: FILE_TYPE" + }, + { + "entity_name": "multi modal retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "multi modal retrieval is the technique described as what is needed for multi page multidocument understanding in the m3docrag paper", + "source_ids": [ + 201 + ], + "id": "Name: multi modal retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "multi page multidocument understanding", + "entity_type": "TASK_OR_PROBLEM", + "description": "multi page multidocument understanding is the specific task or problem that the m3docrag system addresses", + "source_ids": [ + 201 + ], + "id": "Name: multi page multidocument understanding\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "arxiv preprint", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv preprint is the type of publication venue where the m3docrag paper was released", + "source_ids": [ + 201 + ], + "id": "Name: arxiv preprint\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "vassilis christophides", + "entity_type": "PERSON", + "description": "vassilis christophides is an author of the 2020 paper on end to end entity resolution for big data", + "source_ids": [ + 202 + ], + "id": "Name: vassilis christophides\nType: PERSON" + }, + { + "entity_name": "vasilis efthymiou", + "entity_type": "PERSON", + "description": "vasilis efthymiou is an author of the 2020 paper on end to end entity resolution for big data", + "source_ids": [ + 202 + ], + "id": "Name: vasilis efthymiou\nType: PERSON" + }, + { + "entity_name": "themis palpanas", + "entity_type": "PERSON", + "description": "themis palpanas is an author of the 2020 paper on end to end entity resolution for big data", + "source_ids": [ + 202 + ], + "id": "Name: themis palpanas\nType: PERSON" + }, 
+ { + "entity_name": "george papadakis", + "entity_type": "PERSON", + "description": "george papadakis is an author of the 2020 paper on end to end entity resolution for big data", + "source_ids": [ + 202 + ], + "id": "Name: george papadakis\nType: PERSON" + }, + { + "entity_name": "kostas stefanidis", + "entity_type": "PERSON", + "description": "kostas stefanidis is an author of the 2020 paper on end to end entity resolution for big data", + "source_ids": [ + 202 + ], + "id": "Name: kostas stefanidis\nType: PERSON" + }, + { + "entity_name": "acm computing surveys", + "entity_type": "PUBLICATION_VENUE", + "description": "acm computing surveys is the journal where the paper was published", + "source_ids": [ + 202 + ], + "id": "Name: acm computing surveys\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "an overview of end to end entity resolution for big data", + "entity_type": "BOOK", + "description": "an overview of end to end entity resolution for big data is the title of the paper discussed in the text", + "source_ids": [ + 202 + ], + "id": "Name: an overview of end to end entity resolution for big data\nType: BOOK" + }, + { + "entity_name": "csur", + "entity_type": "PUBLICATION_VENUE", + "description": "csur is the abbreviation for acm computing surveys the journal where the paper was published", + "source_ids": [ + 202 + ], + "id": "Name: csur\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "53", + "entity_type": "MEASUREMENT", + "description": "53 is the volume number of the journal acm computing surveys where the paper was published", + "source_ids": [ + 202 + ], + "id": "Name: 53\nType: MEASUREMENT" + }, + { + "entity_name": "1 42", + "entity_type": "MEASUREMENT", + "description": "1 42 represents the page range of the paper within the journal", + "source_ids": [ + 202 + ], + "id": "Name: 1 42\nType: MEASUREMENT" + }, + { + "entity_name": "end to end entity resolution", + "entity_type": "TASK_OR_PROBLEM", + "description": "end to end entity 
resolution is the specific technical problem addressed in the paper", + "source_ids": [ + 202 + ], + "id": "Name: end to end entity resolution\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "big data", + "entity_type": "DATASET_OR_CORPUS", + "description": "big data is the domain or subject matter discussed in the paper", + "source_ids": [ + 202 + ], + "id": "Name: big data\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "gheorghe comanici", + "entity_type": "PERSON", + "description": "gheorghe comanici is listed as one of the authors of the paper", + "source_ids": [ + 203 + ], + "id": "Name: gheorghe comanici\nType: PERSON" + }, + { + "entity_name": "eric bieber", + "entity_type": "PERSON", + "description": "eric bieber is listed as one of the authors of the paper", + "source_ids": [ + 203 + ], + "id": "Name: eric bieber\nType: PERSON" + }, + { + "entity_name": "mike schaekermann", + "entity_type": "PERSON", + "description": "mike schaekermann is listed as one of the authors of the paper", + "source_ids": [ + 203 + ], + "id": "Name: mike schaekermann\nType: PERSON" + }, + { + "entity_name": "ice pasupat", + "entity_type": "PERSON", + "description": "ice pasupat is listed as one of the authors of the paper", + "source_ids": [ + 203 + ], + "id": "Name: ice pasupat\nType: PERSON" + }, + { + "entity_name": "noveen sachdeva", + "entity_type": "PERSON", + "description": "noveen sachdeva is listed as one of the authors of the paper", + "source_ids": [ + 203 + ], + "id": "Name: noveen sachdeva\nType: PERSON" + }, + { + "entity_name": "inderjit dhillon", + "entity_type": "PERSON", + "description": "inderjit dhillon is listed as one of the authors of the paper", + "source_ids": [ + 203 + ], + "id": "Name: inderjit dhillon\nType: PERSON" + }, + { + "entity_name": "marcel blistein", + "entity_type": "PERSON", + "description": "marcel blistein is listed as one of the authors of the paper", + "source_ids": [ + 203 + ], + "id": "Name: marcel blistein\nType: PERSON" + }, + 
{ + "entity_name": "ori ram", + "entity_type": "PERSON", + "description": "ori ram is listed as one of the authors of the paper", + "source_ids": [ + 203 + ], + "id": "Name: ori ram\nType: PERSON" + }, + { + "entity_name": "dan zhang", + "entity_type": "PERSON", + "description": "dan zhang is listed as one of the authors of the paper", + "source_ids": [ + 203 + ], + "id": "Name: dan zhang\nType: PERSON" + }, + { + "entity_name": "evan rosen", + "entity_type": "PERSON", + "description": "evan rosen is listed as one of the authors of the paper", + "source_ids": [ + 203 + ], + "id": "Name: evan rosen\nType: PERSON" + }, + { + "entity_name": "arxiv 2507 06261", + "entity_type": "FILE_TYPE", + "description": "arxiv 2507 06261 is the specific identifier for the preprint document", + "source_ids": [ + 203 + ], + "id": "Name: arxiv 2507 06261\nType: FILE_TYPE" + }, + { + "entity_name": "advanced reasoning", + "entity_type": "TASK_OR_PROBLEM", + "description": "advanced reasoning is a capability of gemini 2 5 mentioned in the text", + "source_ids": [ + 203 + ], + "id": "Name: advanced reasoning\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "multimodality", + "entity_type": "TASK_OR_PROBLEM", + "description": "multimodality is a capability of gemini 2 5 mentioned in the text", + "source_ids": [ + 203 + ], + "id": "Name: multimodality\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "long context", + "entity_type": "TASK_OR_PROBLEM", + "description": "long context is a capability of gemini 2 5 mentioned in the text", + "source_ids": [ + 203 + ], + "id": "Name: long context\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "next generation agentic capabilities", + "entity_type": "TASK_OR_PROBLEM", + "description": "next generation agentic capabilities are capabilities of gemini 2 5 mentioned in the text", + "source_ids": [ + 203 + ], + "id": "Name: next generation agentic capabilities\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "gemini 2 5 pushing the frontier with 
advanced reasoning multimodality long context and next generation agentic capabilities", + "entity_type": "BOOK", + "description": "gemini 2 5 pushing the frontier with advanced reasoning multimodality long context and next generation agentic capabilities is the title of the paper", + "source_ids": [ + 203 + ], + "id": "Name: gemini 2 5 pushing the frontier with advanced reasoning multimodality long context and next generation agentic capabilities\nType: BOOK" + }, + { + "entity_name": "arxiv preprint", + "entity_type": "FILE_TYPE", + "description": "An arXiv preprint is a type of document in which research work is published, describing the format of the document released prior to formal peer-reviewed publication.", + "source_ids": [ + 203, + 211 + ], + "id": "Name: arxiv preprint\nType: FILE_TYPE" + }, + { + "entity_name": "pradeep dasigi", + "entity_type": "PERSON", + "description": "pradeep dasigi is one of the authors of the 2021 arxiv preprint", + "source_ids": [ + 204 + ], + "id": "Name: pradeep dasigi\nType: PERSON" + }, + { + "entity_name": "kyle lo", + "entity_type": "PERSON", + "description": "kyle lo is one of the authors of the 2021 arxiv preprint", + "source_ids": [ + 204 + ], + "id": "Name: kyle lo\nType: PERSON" + }, + { + "entity_name": "iz beltagy", + "entity_type": "PERSON", + "description": "iz beltagy is one of the authors of the 2021 arxiv preprint", + "source_ids": [ + 204 + ], + "id": "Name: iz beltagy\nType: PERSON" + }, + { + "entity_name": "arman cohan", + "entity_type": "PERSON", + "description": "arman cohan is one of the authors of the 2021 arxiv preprint", + "source_ids": [ + 204 + ], + "id": "Name: arman cohan\nType: PERSON" + }, + { + "entity_name": "noah a smith", + "entity_type": "PERSON", + "description": "noah a smith is one of the authors of the 2021 arxiv preprint", + "source_ids": [ + 204 + ], + "id": "Name: noah a smith\nType: PERSON" + }, + { + "entity_name": "matt gardner", + "entity_type": "PERSON", + "description": "matt 
gardner is one of the authors of the 2021 arxiv preprint", + "source_ids": [ + 204 + ], + "id": "Name: matt gardner\nType: PERSON" + }, + { + "entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "entity_type": "PRODUCT", + "description": "a dataset of information seeking questions and answers anchored in research papers is the title of the work described in the text", + "source_ids": [ + 204 + ], + "id": "Name: a dataset of information seeking questions and answers anchored in research papers\nType: PRODUCT" + }, + { + "entity_name": "arxiv preprint arxiv 2105 03011", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv preprint arxiv 2105 03011 is the specific publication venue and identifier for the work", + "source_ids": [ + 204 + ], + "id": "Name: arxiv preprint arxiv 2105 03011\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "2021", + "entity_type": "DATE", + "description": "2021 is the year the preprint was published", + "source_ids": [ + 204 + ], + "id": "Name: 2021\nType: DATE" + }, + { + "entity_name": "research papers", + "entity_type": "DATASET_OR_CORPUS", + "description": "research papers are the source material from which the information seeking questions and answers are anchored", + "source_ids": [ + 204 + ], + "id": "Name: research papers\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "information seeking questions", + "entity_type": "TASK_OR_PROBLEM", + "description": "information seeking questions are the specific type of queries included in the dataset", + "source_ids": [ + 204 + ], + "id": "Name: information seeking questions\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "answers", + "entity_type": "TASK_OR_PROBLEM", + "description": "answers are the responses paired with the questions in the dataset", + "source_ids": [ + 204 + ], + "id": "Name: answers\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "xavier daull", + "entity_type": "PERSON", + "description": "xavier daull 
is one of the authors of the 2023 survey on complex qa and language models hybrid architectures", + "source_ids": [ + 205 + ], + "id": "Name: xavier daull\nType: PERSON" + }, + { + "entity_name": "patrice bellot", + "entity_type": "PERSON", + "description": "patrice bellot is one of the authors of the 2023 survey on complex qa and language models hybrid architectures", + "source_ids": [ + 205 + ], + "id": "Name: patrice bellot\nType: PERSON" + }, + { + "entity_name": "emmanuel bruno", + "entity_type": "PERSON", + "description": "emmanuel bruno is one of the authors of the 2023 survey on complex qa and language models hybrid architectures", + "source_ids": [ + 205 + ], + "id": "Name: emmanuel bruno\nType: PERSON" + }, + { + "entity_name": "vincent martin", + "entity_type": "PERSON", + "description": "vincent martin is one of the authors of the 2023 survey on complex qa and language models hybrid architectures", + "source_ids": [ + 205 + ], + "id": "Name: vincent martin\nType: PERSON" + }, + { + "entity_name": "elisabeth murisasco", + "entity_type": "PERSON", + "description": "elisabeth murisasco is one of the authors of the 2023 survey on complex qa and language models hybrid architectures", + "source_ids": [ + 205 + ], + "id": "Name: elisabeth murisasco\nType: PERSON" + }, + { + "entity_name": "arxiv preprint arxiv 2302 09051", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv preprint arxiv 2302 09051 is the specific identifier for the preprint where the survey was published", + "source_ids": [ + 205 + ], + "id": "Name: arxiv preprint arxiv 2302 09051\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "complex qa and language models hybrid architectures survey", + "entity_type": "BOOK", + "description": "complex qa and language models hybrid architectures survey is the title of the work authored by the listed individuals", + "source_ids": [ + 205 + ], + "id": "Name: complex qa and language models hybrid architectures survey\nType: BOOK" + }, + { + 
"entity_name": "2302 09051", + "entity_type": "FILE_TYPE", + "description": "2302 09051 is the unique identifier code for the specific preprint document", + "source_ids": [ + 205 + ], + "id": "Name: 2302 09051\nType: FILE_TYPE" + }, + { + "entity_name": "darren edge", + "entity_type": "PERSON", + "description": "darren edge is listed as one of the authors of the 2024 arxiv preprint", + "source_ids": [ + 206 + ], + "id": "Name: darren edge\nType: PERSON" + }, + { + "entity_name": "ha trinh", + "entity_type": "PERSON", + "description": "ha trinh is listed as one of the authors of the 2024 arxiv preprint", + "source_ids": [ + 206 + ], + "id": "Name: ha trinh\nType: PERSON" + }, + { + "entity_name": "newman cheng", + "entity_type": "PERSON", + "description": "newman cheng is listed as one of the authors of the 2024 arxiv preprint", + "source_ids": [ + 206 + ], + "id": "Name: newman cheng\nType: PERSON" + }, + { + "entity_name": "joshua bradley", + "entity_type": "PERSON", + "description": "joshua bradley is listed as one of the authors of the 2024 arxiv preprint", + "source_ids": [ + 206 + ], + "id": "Name: joshua bradley\nType: PERSON" + }, + { + "entity_name": "alex chao", + "entity_type": "PERSON", + "description": "alex chao is listed as one of the authors of the 2024 arxiv preprint", + "source_ids": [ + 206 + ], + "id": "Name: alex chao\nType: PERSON" + }, + { + "entity_name": "apurva mody", + "entity_type": "PERSON", + "description": "apurva mody is listed as one of the authors of the 2024 arxiv preprint", + "source_ids": [ + 206 + ], + "id": "Name: apurva mody\nType: PERSON" + }, + { + "entity_name": "steven truitt", + "entity_type": "PERSON", + "description": "steven truitt is listed as one of the authors of the 2024 arxiv preprint", + "source_ids": [ + 206 + ], + "id": "Name: steven truitt\nType: PERSON" + }, + { + "entity_name": "jonathan larson", + "entity_type": "PERSON", + "description": "jonathan larson is listed as one of the authors of the 2024 arxiv 
preprint", + "source_ids": [ + 206 + ], + "id": "Name: jonathan larson\nType: PERSON" + }, + { + "entity_name": "from local to global a graph rag approach to query focused summarization", + "entity_type": "BOOK", + "description": "from local to global a graph rag approach to query focused summarization is the title of the arxiv preprint", + "source_ids": [ + 206 + ], + "id": "Name: from local to global a graph rag approach to query focused summarization\nType: BOOK" + }, + { + "entity_name": "arxiv 2404 16130", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv 2404 16130 is the specific identifier for the preprint document", + "source_ids": [ + 206 + ], + "id": "Name: arxiv 2404 16130\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "graph rag", + "entity_type": "TECHNOLOGY", + "description": "graph rag is a technology approach mentioned in the title of the paper as a method for query focused summarization", + "source_ids": [ + 206 + ], + "id": "Name: graph rag\nType: TECHNOLOGY" + }, + { + "entity_name": "query focused summarization", + "entity_type": "TASK_OR_PROBLEM", + "description": "query focused summarization is the specific task or problem addressed by the graph rag approach in the paper", + "source_ids": [ + 206 + ], + "id": "Name: query focused summarization\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "local", + "entity_type": "CONCEPT", + "description": "local refers to a scope or scale mentioned in the paper s title contrasting with global", + "source_ids": [ + 206 + ], + "id": "Name: local\nType: CONCEPT" + }, + { + "entity_name": "global", + "entity_type": "CONCEPT", + "description": "Global refers to a scope or scale mentioned in the paper's title, contrasting with local, and also denotes a process that filters for all items of a specific type, such as a table.", + "source_ids": [ + 251, + 206 + ], + "id": "Name: global\nType: CONCEPT" + }, + { + "entity_name": "yunfan gao", + "entity_type": "PERSON", + "description": "yunfan gao 
is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ], + "id": "Name: yunfan gao\nType: PERSON" + }, + { + "entity_name": "yun xiong", + "entity_type": "PERSON", + "description": "yun xiong is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ], + "id": "Name: yun xiong\nType: PERSON" + }, + { + "entity_name": "xinyu gao", + "entity_type": "PERSON", + "description": "xinyu gao is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ], + "id": "Name: xinyu gao\nType: PERSON" + }, + { + "entity_name": "kangxiang jia", + "entity_type": "PERSON", + "description": "kangxiang jia is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ], + "id": "Name: kangxiang jia\nType: PERSON" + }, + { + "entity_name": "jinliu pan", + "entity_type": "PERSON", + "description": "jinliu pan is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ], + "id": "Name: jinliu pan\nType: PERSON" + }, + { + "entity_name": "yuxi bi", + "entity_type": "PERSON", + "description": "yuxi bi is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ], + "id": "Name: yuxi bi\nType: PERSON" + }, + { + "entity_name": "yi dai", + "entity_type": "PERSON", + "description": "yi dai is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ], + "id": "Name: yi dai\nType: PERSON" + }, + { + "entity_name": "jiawei sun", + "entity_type": "PERSON", + "description": "jiawei sun is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ], + "id": "Name: jiawei sun\nType: PERSON" + }, + { + "entity_name": "haofen wang", + "entity_type": "PERSON", + "description": "haofen wang is one of the authors of the 2023 survey on retrieval augmented 
generation", + "source_ids": [ + 207 + ], + "id": "Name: haofen wang\nType: PERSON" + }, + { + "entity_name": "retrieval augmented generation for large language models a survey", + "entity_type": "BOOK", + "description": "retrieval augmented generation for large language models a survey is the title of the document authored by the listed individuals", + "source_ids": [ + 207 + ], + "id": "Name: retrieval augmented generation for large language models a survey\nType: BOOK" + }, + { + "entity_name": "arxiv preprint arxiv 2312 10997", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv preprint arxiv 2312 10997 is the specific identifier and venue where the survey was published", + "source_ids": [ + 207 + ], + "id": "Name: arxiv preprint arxiv 2312 10997\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "2312 10997", + "entity_type": "FILE_TYPE", + "description": "2312 10997 is the unique identifier code for the preprint document", + "source_ids": [ + 207 + ], + "id": "Name: 2312 10997\nType: FILE_TYPE" + }, + { + "entity_name": "zirui guo", + "entity_type": "PERSON", + "description": "zirui guo is listed as an author of the paper titled lightrag", + "source_ids": [ + 208 + ], + "id": "Name: zirui guo\nType: PERSON" + }, + { + "entity_name": "lianghao xia", + "entity_type": "PERSON", + "description": "lianghao xia is listed as an author of the paper titled lightrag", + "source_ids": [ + 208 + ], + "id": "Name: lianghao xia\nType: PERSON" + }, + { + "entity_name": "yanhua yu", + "entity_type": "PERSON", + "description": "yanhua yu is listed as an author of the paper titled lightrag", + "source_ids": [ + 208 + ], + "id": "Name: yanhua yu\nType: PERSON" + }, + { + "entity_name": "tu ao", + "entity_type": "PERSON", + "description": "tu ao is listed as an author of the paper titled lightrag", + "source_ids": [ + 208 + ], + "id": "Name: tu ao\nType: PERSON" + }, + { + "entity_name": "chao huang", + "entity_type": "PERSON", + "description": "chao huang is listed 
as an author of the paper titled lightrag", + "source_ids": [ + 208 + ], + "id": "Name: chao huang\nType: PERSON" + }, + { + "entity_name": "lightrag", + "entity_type": "PRODUCT", + "description": "lightrag is a retrieval augmented generation system described as simple and fast", + "source_ids": [ + 208 + ], + "id": "Name: lightrag\nType: PRODUCT" + }, + { + "entity_name": "arxiv e prints", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv e prints is the publication venue where the paper was released in 2024", + "source_ids": [ + 208 + ], + "id": "Name: arxiv e prints\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "arxiv2410", + "entity_type": "FILE_TYPE", + "description": "arxiv2410 is the specific identifier for the paper on arxiv", + "source_ids": [ + 208 + ], + "id": "Name: arxiv2410\nType: FILE_TYPE" + }, + { + "entity_name": "simple", + "entity_type": "CONCEPT", + "description": "simple is an attribute used to describe the lightrag system", + "source_ids": [ + 208 + ], + "id": "Name: simple\nType: CONCEPT" + }, + { + "entity_name": "fast", + "entity_type": "CONCEPT", + "description": "fast is an attribute used to describe the lightrag system", + "source_ids": [ + 208 + ], + "id": "Name: fast\nType: CONCEPT" + }, + { + "entity_name": "bernal jim nez guti rrez", + "entity_type": "PERSON", + "description": "bernal jim nez guti rrez is one of the authors of the paper titled hipporag", + "source_ids": [ + 209 + ], + "id": "Name: bernal jim nez guti rrez\nType: PERSON" + }, + { + "entity_name": "yiheng shu", + "entity_type": "PERSON", + "description": "yiheng shu is one of the authors of the paper titled hipporag", + "source_ids": [ + 209 + ], + "id": "Name: yiheng shu\nType: PERSON" + }, + { + "entity_name": "yu gu", + "entity_type": "PERSON", + "description": "yu gu is one of the authors of the paper titled hipporag", + "source_ids": [ + 209 + ], + "id": "Name: yu gu\nType: PERSON" + }, + { + "entity_name": "michihiro yasunaga", + "entity_type": 
"PERSON", + "description": "michihiro yasunaga is one of the authors of the paper titled hipporag", + "source_ids": [ + 209 + ], + "id": "Name: michihiro yasunaga\nType: PERSON" + }, + { + "entity_name": "yu su", + "entity_type": "PERSON", + "description": "yu su is one of the authors of the paper titled hipporag", + "source_ids": [ + 209 + ], + "id": "Name: yu su\nType: PERSON" + }, + { + "entity_name": "hipporag", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "hipporag is a neurobiologically inspired long term memory system designed for large language models", + "source_ids": [ + 209 + ], + "id": "Name: hipporag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "arxiv 2405 14831", + "entity_type": "FILE_TYPE", + "description": "arxiv 2405 14831 is the specific identifier for the preprint document", + "source_ids": [ + 209 + ], + "id": "Name: arxiv 2405 14831\nType: FILE_TYPE" + }, + { + "entity_name": "large language models", + "entity_type": "PRODUCT", + "description": "large language models are the target systems for which hipporag is designed as a memory solution", + "source_ids": [ + 209 + ], + "id": "Name: large language models\nType: PRODUCT" + }, + { + "entity_name": "neurobiologically inspired long term memory", + "entity_type": "TASK_OR_PROBLEM", + "description": "neurobiologically inspired long term memory is the specific problem domain or concept that hipporag addresses", + "source_ids": [ + 209 + ], + "id": "Name: neurobiologically inspired long term memory\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "taher h haveliwala", + "entity_type": "PERSON", + "description": "taher h haveliwala is the author of the paper titled topic sensitive pagerank", + "source_ids": [ + 210 + ], + "id": "Name: taher h haveliwala\nType: PERSON" + }, + { + "entity_name": "2002", + "entity_type": "DATE", + "description": "2002 is the year the paper topic sensitive pagerank was published", + "source_ids": [ + 210 + ], + "id": "Name: 2002\nType: DATE" + 
}, + { + "entity_name": "topic sensitive pagerank", + "entity_type": "TECHNOLOGY", + "description": "topic sensitive pagerank is the title of a paper presented at a conference", + "source_ids": [ + 210 + ], + "id": "Name: topic sensitive pagerank\nType: TECHNOLOGY" + }, + { + "entity_name": "11th international conference on world wide web", + "entity_type": "EVENT", + "description": "the 11th international conference on world wide web is the venue where the paper was presented", + "source_ids": [ + 210 + ], + "id": "Name: 11th international conference on world wide web\nType: EVENT" + }, + { + "entity_name": "world wide web", + "entity_type": "TECHNOLOGY", + "description": "world wide web is the technology platform associated with the conference where the paper was presented", + "source_ids": [ + 210 + ], + "id": "Name: world wide web\nType: TECHNOLOGY" + }, + { + "entity_name": "517 526", + "entity_type": "MEASUREMENT", + "description": "517 526 represents the page range of the paper in the conference proceedings", + "source_ids": [ + 210 + ], + "id": "Name: 517 526\nType: MEASUREMENT" + }, + { + "entity_name": "xiaoxin he", + "entity_type": "PERSON", + "description": "xiaoxin he is listed as one of the authors of the paper", + "source_ids": [ + 211 + ], + "id": "Name: xiaoxin he\nType: PERSON" + }, + { + "entity_name": "yijun tian", + "entity_type": "PERSON", + "description": "yijun tian is listed as one of the authors of the paper", + "source_ids": [ + 211 + ], + "id": "Name: yijun tian\nType: PERSON" + }, + { + "entity_name": "yifei sun", + "entity_type": "PERSON", + "description": "yifei sun is listed as one of the authors of the paper", + "source_ids": [ + 211 + ], + "id": "Name: yifei sun\nType: PERSON" + }, + { + "entity_name": "nitesh v chawla", + "entity_type": "PERSON", + "description": "nitesh v chawla is listed as one of the authors of the paper", + "source_ids": [ + 211 + ], + "id": "Name: nitesh v chawla\nType: PERSON" + }, + { + "entity_name": 
"thomas laurent", + "entity_type": "PERSON", + "description": "thomas laurent is listed as one of the authors of the paper", + "source_ids": [ + 211 + ], + "id": "Name: thomas laurent\nType: PERSON" + }, + { + "entity_name": "yann lecun", + "entity_type": "PERSON", + "description": "yann lecun is listed as one of the authors of the paper", + "source_ids": [ + 211 + ], + "id": "Name: yann lecun\nType: PERSON" + }, + { + "entity_name": "xavier bresson", + "entity_type": "PERSON", + "description": "xavier bresson is listed as one of the authors of the paper", + "source_ids": [ + 211 + ], + "id": "Name: xavier bresson\nType: PERSON" + }, + { + "entity_name": "bryan hooi", + "entity_type": "PERSON", + "description": "bryan hooi is listed as one of the authors of the paper", + "source_ids": [ + 211 + ], + "id": "Name: bryan hooi\nType: PERSON" + }, + { + "entity_name": "g retriever", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "g retriever is a retrieval augmented generation model for textual graph understanding and question answering", + "source_ids": [ + 211 + ], + "id": "Name: g retriever\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "arxiv 2402 07630", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv 2402 07630 is the identifier for the preprint publication", + "source_ids": [ + 211 + ], + "id": "Name: arxiv 2402 07630\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "textual graph understanding", + "entity_type": "TASK_OR_PROBLEM", + "description": "textual graph understanding is a specific task addressed by the g retriever model", + "source_ids": [ + 211 + ], + "id": "Name: textual graph understanding\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "yucheng hu", + "entity_type": "PERSON", + "description": "yucheng hu is one of the authors of the 2024 survey on retrieval augmented language models", + "source_ids": [ + 212 + ], + "id": "Name: yucheng hu\nType: PERSON" + }, + { + "entity_name": "yuxing lu", + "entity_type": 
"PERSON", + "description": "yuxing lu is one of the authors of the 2024 survey on retrieval augmented language models", + "source_ids": [ + 212 + ], + "id": "Name: yuxing lu\nType: PERSON" + }, + { + "entity_name": "rag and rau a survey on retrieval augmented language model in natural language processing", + "entity_type": "BOOK", + "description": "rag and rau is the title of a survey paper published in 2024", + "source_ids": [ + 212 + ], + "id": "Name: rag and rau a survey on retrieval augmented language model in natural language processing\nType: BOOK" + }, + { + "entity_name": "natural language processing", + "entity_type": "RESEARCH_FIELD", + "description": "natural language processing is the field of study addressed by the survey paper", + "source_ids": [ + 212 + ], + "id": "Name: natural language processing\nType: RESEARCH_FIELD" + }, + { + "entity_name": "arxiv 2404 19543", + "entity_type": "PRODUCT", + "description": "arxiv 2404 19543 is the specific identifier for the preprint paper mentioned in the text", + "source_ids": [ + 212 + ], + "id": "Name: arxiv 2404 19543\nType: PRODUCT" + }, + { + "entity_name": "retrieval augmented language model", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "retrieval augmented language model is the specific technology subject of the survey", + "source_ids": [ + 212 + ], + "id": "Name: retrieval augmented language model\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "soyeong jeong", + "entity_type": "PERSON", + "description": "soyeong jeong is an author of the 2024 arxiv preprint titled adaptive rag", + "source_ids": [ + 213 + ], + "id": "Name: soyeong jeong\nType: PERSON" + }, + { + "entity_name": "jinheon baek", + "entity_type": "PERSON", + "description": "jinheon baek is an author of the 2024 arxiv preprint titled adaptive rag", + "source_ids": [ + 213 + ], + "id": "Name: jinheon baek\nType: PERSON" + }, + { + "entity_name": "adaptive rag", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": 
"adaptive rag is a model described as learning to adapt retrieval augmented large language models through question complexity", + "source_ids": [ + 213 + ], + "id": "Name: adaptive rag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "retrieval augmented large language models", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "retrieval augmented large language models are the subject of adaptation in the adaptive rag study", + "source_ids": [ + 213 + ], + "id": "Name: retrieval augmented large language models\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "arxiv 2403 14403", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv 2403 14403 is the specific identifier for the preprint document", + "source_ids": [ + 213 + ], + "id": "Name: arxiv 2403 14403\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "question complexity", + "entity_type": "TASK_OR_PROBLEM", + "description": "question complexity is the factor through which adaptive rag learns to adapt models", + "source_ids": [ + 213 + ], + "id": "Name: question complexity\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "learning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "learning is the process by which adaptive rag adapts to question complexity", + "source_ids": [ + 213 + ], + "id": "Name: learning\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "13", + "entity_type": "NUMBER", + "description": "13 is a number mentioned in the text though its specific context or role is not defined", + "source_ids": [ + 214 + ], + "id": "Name: 13\nType: NUMBER" + }, + { + "entity_name": "table: node 215...", + "entity_type": "TABLE", + "description": "A table with no available description.", + "source_ids": [ + 215 + ], + "id": "Name: table: node 215...\nType: TABLE" + }, + { + "entity_name": "timo schick", + "entity_type": "PERSON", + "description": "timo schick is listed as one of the authors of the document", + "source_ids": [ + 216 + ], + "id": "Name: timo 
schick\nType: PERSON" + }, + { + "entity_name": "jane dwivedi yu", + "entity_type": "PERSON", + "description": "jane dwivedi yu is listed as one of the authors of the document", + "source_ids": [ + 216 + ], + "id": "Name: jane dwivedi yu\nType: PERSON" + }, + { + "entity_name": "roberto dess", + "entity_type": "PERSON", + "description": "roberto dess is listed as one of the authors of the document", + "source_ids": [ + 216 + ], + "id": "Name: roberto dess\nType: PERSON" + }, + { + "entity_name": "roberta raileanu", + "entity_type": "PERSON", + "description": "roberta raileanu is listed as one of the authors of the document", + "source_ids": [ + 216 + ], + "id": "Name: roberta raileanu\nType: PERSON" + }, + { + "entity_name": "maria lomeli", + "entity_type": "PERSON", + "description": "maria lomeli is listed as one of the authors of the document", + "source_ids": [ + 216 + ], + "id": "Name: maria lomeli\nType: PERSON" + }, + { + "entity_name": "eric hambro", + "entity_type": "PERSON", + "description": "eric hambro is listed as one of the authors of the document", + "source_ids": [ + 216 + ], + "id": "Name: eric hambro\nType: PERSON" + }, + { + "entity_name": "luke zettlemoyer", + "entity_type": "PERSON", + "description": "luke zettlemoyer is listed as one of the authors of the document", + "source_ids": [ + 216 + ], + "id": "Name: luke zettlemoyer\nType: PERSON" + }, + { + "entity_name": "nicola cancedda", + "entity_type": "PERSON", + "description": "nicola cancedda is listed as one of the authors of the document", + "source_ids": [ + 216 + ], + "id": "Name: nicola cancedda\nType: PERSON" + }, + { + "entity_name": "thomas scialom", + "entity_type": "PERSON", + "description": "thomas scialom is listed as one of the authors of the document", + "source_ids": [ + 216 + ], + "id": "Name: thomas scialom\nType: PERSON" + }, + { + "entity_name": "table: node 217...", + "entity_type": "TABLE", + "description": "A table with no available description.", + "source_ids": [ + 217 
+ ], + "id": "Name: table: node 217...\nType: TABLE" + }, + { + "entity_name": "table: node 218...", + "entity_type": "TABLE", + "description": "A table with no available description.", + "source_ids": [ + 218 + ], + "id": "Name: table: node 218...\nType: TABLE" + }, + { + "entity_name": "a experimental details", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of the main paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section provides the specific configuration, setup, and parameters used to conduct the experiments described in the study.", + "source_ids": [ + 220 + ], + "id": "Name: a experimental details\nType: SECTION_TITLE" + }, + { + "entity_name": "a.1 evaluation metrics", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Experimental Details' within the paper 'BookRAG', this section defines the specific quantitative measures used to assess the performance of the retrieval-augmented generation system.", + "source_ids": [ + 221 + ], + "id": "Name: a.1 evaluation metrics\nType: SECTION_TITLE" + }, + { + "entity_name": "main experiments", + "entity_type": "EVENT", + "description": "main experiments are the primary experiments for which metrics are defined and calculated in the text", + "source_ids": [ + 222 + ], + "id": "Name: main experiments\nType: EVENT" + }, + { + "entity_name": "metrics", + "entity_type": "EVALUATION_METRIC", + "description": "metrics are the specific measures defined and calculated in the text for the main experiments", + "source_ids": [ + 222 + ], + "id": "Name: metrics\nType: EVALUATION_METRIC" + }, + { + "entity_name": "definitions", + "entity_type": "CONCEPT", + "description": "definitions are the detailed descriptions provided for the metrics in the text", + "source_ids": [ + 222 + ], + "id": "Name: definitions\nType: CONCEPT" + }, + { + "entity_name": "calculation procedures", + "entity_type": 
"METHOD_OR_TECHNIQUE", + "description": "calculation procedures are the step by step methods described for computing the metrics", + "source_ids": [ + 222 + ], + "id": "Name: calculation procedures\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "standard rag models", + "entity_type": "TECHNOLOGY", + "description": "standard rag models are described as systems that generate free form natural language responses", + "source_ids": [ + 223 + ], + "id": "Name: standard rag models\nType: TECHNOLOGY" + }, + { + "entity_name": "natural language responses", + "entity_type": "PRODUCT", + "description": "natural language responses are the output generated by standard rag models often containing extraneous conversational text", + "source_ids": [ + 223 + ], + "id": "Name: natural language responses\nType: PRODUCT" + }, + { + "entity_name": "ground truth labels", + "entity_type": "PRODUCT", + "description": "ground truth labels are concise reference answers e g option a or 12 5 used for comparison against model outputs", + "source_ids": [ + 223 + ], + "id": "Name: ground truth labels\nType: PRODUCT" + }, + { + "entity_name": "a 1 1 answer extraction and normalization", + "entity_type": "SECTION_TITLE", + "description": "a 1 1 answer extraction and normalization is the title of the section discussing the process of extracting and normalizing answers", + "source_ids": [ + 223 + ], + "id": "Name: a 1 1 answer extraction and normalization\nType: SECTION_TITLE" + }, + { + "entity_name": "option a", + "entity_type": "PRODUCT", + "description": "option a is an example of a concise ground truth label mentioned in the text", + "source_ids": [ + 223 + ], + "id": "Name: option a\nType: PRODUCT" + }, + { + "entity_name": "12 5", + "entity_type": "MEASUREMENT", + "description": "12 5 is an example of a concise ground truth label mentioned in the text", + "source_ids": [ + 223 + ], + "id": "Name: 12 5\nType: MEASUREMENT" + }, + { + "entity_name": "the answer is", + "entity_type": 
"PRODUCT", + "description": "the answer is is an example of extraneous conversational text that may appear in raw model outputs", + "source_ids": [ + 223 + ], + "id": "Name: the answer is\nType: PRODUCT" + }, + { + "entity_name": "llm based extraction step", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "llm based extraction step is a method used to align model output with the ground truth format before calculation", + "source_ids": [ + 224 + ], + "id": "Name: llm based extraction step\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "rag system", + "entity_type": "SYSTEM", + "description": "rag system is the system that generates the raw response denoted as y raw", + "source_ids": [ + 224 + ], + "id": "Name: rag system\nType: SYSTEM" + }, + { + "entity_name": "llmextract", + "entity_type": "SOFTWARE", + "description": "llmextract is a component or function that extracts key information from the raw response", + "source_ids": [ + 224 + ], + "id": "Name: llmextract\nType: SOFTWARE" + }, + { + "entity_name": "y raw", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "y raw denotes the raw response generated by the rag system", + "source_ids": [ + 224 + ], + "id": "Name: y raw\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "y gold", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "y gold denotes the ground truth", + "source_ids": [ + 224 + ], + "id": "Name: y gold\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "y hat", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "y hat denotes the extracted answer", + "source_ids": [ + 224 + ], + "id": "Name: y hat\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "n", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "n is a standard normalization function applied to y hat and y gold", + "source_ids": [ + 224 + ], + "id": "Name: n\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "equation 16", + "entity_type": "EQUATION_OR_FORMULA", + 
"description": "equation 16 defines the relationship between the extracted answer the raw response and the instruction", + "source_ids": [ + 224 + ], + "id": "Name: equation 16\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "official evaluation protocols", + "entity_type": "TASK_OR_PROBLEM", + "description": "official evaluation protocols are the standards followed to ensure the extraction step aligns with the ground truth format", + "source_ids": [ + 224 + ], + "id": "Name: official evaluation protocols\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "key information", + "entity_type": "CONCEPT", + "description": "key information refers to the essential data such as key entities for span extraction that llmextract retrieves", + "source_ids": [ + 224 + ], + "id": "Name: key information\nType: CONCEPT" + }, + { + "entity_name": "key entity", + "entity_type": "CONCEPT", + "description": "key entity is an example of the key information extracted for span extraction", + "source_ids": [ + 224 + ], + "id": "Name: key entity\nType: CONCEPT" + }, + { + "entity_name": "span extraction", + "entity_type": "TASK_OR_PROBLEM", + "description": "span extraction is a specific task mentioned as an example of where key entities are extracted", + "source_ids": [ + 224 + ], + "id": "Name: span extraction\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "lowercasing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "lowercasing is a standard normalization technique applied to the text", + "source_ids": [ + 224 + ], + "id": "Name: lowercasing\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "removing punctuation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "removing punctuation is a standard normalization technique applied to the text", + "source_ids": [ + 224 + ], + "id": "Name: removing punctuation\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "instruction", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "instruction is a 
parameter provided to the llmextract function to guide the extraction process", + "source_ids": [ + 224 + ], + "id": "Name: instruction\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "formula (16)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the predicted output y_hat as a function of raw input and instruction. LaTeX: ˆ 𝑦 = LLMextract ( 𝑦 𝑟𝑎𝑤 , Instruction ) (16)", + "source_ids": [ + 225 + ], + "id": "Name: formula (16)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "a.1.2 qa performance metrics", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Experimental Details' within the BookRAG paper, this section defines the specific metrics used to evaluate Question Answering performance, detailing the calculation of Accuracy based on substring inclusion between ground truth and model responses.", + "source_ids": [ + 226 + ], + "id": "Name: a.1.2 qa performance metrics\nType: SECTION_TITLE" + }, + { + "entity_name": "qa performance metrics", + "entity_type": "EVALUATION_METRIC", + "description": "Refers to the set of quantitative measures defined in section A.1.2 for assessing the quality of answers generated by the model.", + "source_ids": [ + 226 + ], + "id": "Name: qa performance metrics\nType: EVALUATION_METRIC" + }, + { + "entity_name": "ground truth (y_gold)", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "The reference answer or expected output used as the baseline for calculating accuracy in section A.1.2.", + "source_ids": [ + 226 + ], + "id": "Name: ground truth (y_gold)\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "model response (y_raw)", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "The raw output generated by the model, which is compared against the ground truth in section A.1.2.", + "source_ids": [ + 226 + ], + "id": "Name: model response (y_raw)\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "substring inclusion relation", + 
"entity_type": "METHOD_OR_TECHNIQUE", + "description": "The logical operation (denoted by ⊆) used in section A.1.2 to determine if one text sequence is contained within another for the purpose of evaluation.", + "source_ids": [ + 226 + ], + "id": "Name: substring inclusion relation\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "accuracy inclusion based", + "entity_type": "EVALUATION_METRIC", + "description": "accuracy inclusion based is a soft match metric used to evaluate model predictions by checking if the normalized gold answer is included in the generated response", + "source_ids": [ + 227 + ], + "id": "Name: accuracy inclusion based\nType: EVALUATION_METRIC" + }, + { + "entity_name": "prior works", + "entity_type": "PUBLICATION_VENUE", + "description": "prior works refer to previous research studies cited in the text as a basis for the methodology", + "source_ids": [ + 227 + ], + "id": "Name: prior works\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "3", + "entity_type": "PUBLICATION_VENUE", + "description": "3 is a citation number referring to a specific prior work mentioned in the text", + "source_ids": [ + 227 + ], + "id": "Name: 3\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "34", + "entity_type": "PUBLICATION_VENUE", + "description": "34 is a citation number referring to a specific prior work mentioned in the text", + "source_ids": [ + 227 + ], + "id": "Name: 34\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "46", + "entity_type": "PUBLICATION_VENUE", + "description": "46 is a citation number referring to a specific prior work mentioned in the text", + "source_ids": [ + 227 + ], + "id": "Name: 46\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "soft match metric", + "entity_type": "EVALUATION_METRIC", + "description": "soft match metric is a category of evaluation methods described as being used in the text", + "source_ids": [ + 227 + ], + "id": "Name: soft match metric\nType: EVALUATION_METRIC" + }, + { + "entity_name": 
"normalized gold answer", + "entity_type": "DATASET_OR_CORPUS", + "description": "normalized gold answer is the reference data used to determine if a prediction is correct", + "source_ids": [ + 227 + ], + "id": "Name: normalized gold answer\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "model s generated response", + "entity_type": "PRODUCT", + "description": "model s generated response is the output produced by the model being evaluated", + "source_ids": [ + 227 + ], + "id": "Name: model s generated response\nType: PRODUCT" + }, + { + "entity_name": "strict exact match", + "entity_type": "EVALUATION_METRIC", + "description": "strict exact match is a comparison method explicitly contrasted with the soft match metric in the text", + "source_ids": [ + 227 + ], + "id": "Name: strict exact match\nType: EVALUATION_METRIC" + }, + { + "entity_name": "formula (17)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the Accuracy metric as the average of an indicator function comparing neighborhood sets. LaTeX: Accuracy = 1 𝑁 𝑁 ∑︁ 𝑖 = 1 I (N( 𝑦 𝑔𝑜𝑙𝑑,𝑖 ) ⊆ N( 𝑦 𝑟𝑎𝑤,𝑖 )) (17)", + "source_ids": [ + 228 + ], + "id": "Name: formula (17)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "formula (18)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the Error Metric (EM) as the average of indicator functions comparing predicted and ground truth labels. 
LaTeX: EM = 1 𝑁 𝑁 ∑︁ 𝑖 = 1 I (N( ˆ 𝑦 𝑖 ) = N( 𝑦 𝑔𝑜𝑙𝑑,𝑖 )) (18)", + "source_ids": [ + 230 + ], + "id": "Name: formula (18)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "token level f1 score", + "entity_type": "EVALUATION_METRIC", + "description": "token level f1 score is a specific type of f1 score used for questions requiring text span answers", + "source_ids": [ + 231 + ], + "id": "Name: token level f1 score\nType: EVALUATION_METRIC" + }, + { + "entity_name": "r", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "r represents recall calculated as the intersection of extracted and ground truth tokens divided by the ground truth tokens", + "source_ids": [ + 231 + ], + "id": "Name: r\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "f1", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "f1 is the harmonic mean of precision p and recall r calculated using the formula 2 p r p r", + "source_ids": [ + 231 + ], + "id": "Name: f1\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "equation 19", + "entity_type": "EQUATION_OR_FORMULA", + "description": "equation 19 defines the calculation for the f1 score based on precision and recall", + "source_ids": [ + 231 + ], + "id": "Name: equation 19\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "formula (19)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining Precision, Recall, and F1 score metrics using set intersections. 
LaTeX: 𝑃 = | 𝑇 ˆ 𝑦 ∩ 𝑇 𝑔𝑜𝑙𝑑 | | 𝑇 ˆ 𝑦 | , 𝑅 = | 𝑇 ˆ 𝑦 ∩ 𝑇 𝑔𝑜𝑙𝑑 | | 𝑇 𝑔𝑜𝑙𝑑 | , F1 = 2 · 𝑃 · 𝑅 𝑃 + 𝑅 (19)", + "source_ids": [ + 232 + ], + "id": "Name: formula (19)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "15", + "entity_type": "MEASUREMENT", + "description": "15 is a numerical value mentioned in the text potentially representing a measurement or count", + "source_ids": [ + 233 + ], + "id": "Name: 15\nType: MEASUREMENT" + }, + { + "entity_name": "a.1.3 retrieval recall", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Experimental Details' within the 'BookRAG' paper, this section defines the Retrieval Recall metric used to evaluate retrieval quality based on parsed PDF block granularity (paragraphs, tables, images).", + "source_ids": [ + 234 + ], + "id": "Name: a.1.3 retrieval recall\nType: SECTION_TITLE" + }, + { + "entity_name": "retrieval quality", + "entity_type": "EVALUATION_METRIC", + "description": "The specific aspect of system performance being measured in this section, assessed via the granularity of retrieved blocks.", + "source_ids": [ + 234 + ], + "id": "Name: retrieval quality\nType: EVALUATION_METRIC" + }, + { + "entity_name": "pdf blocks", + "entity_type": "DATASET_OR_CORPUS", + "description": "The fundamental units of data (paragraphs, tables, images) from which ground-truth and retrieved sets are constructed for evaluation.", + "source_ids": [ + 234 + ], + "id": "Name: pdf blocks\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "query q", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "The input query variable used to define the set of required ground-truth blocks.", + "source_ids": [ + 234 + ], + "id": "Name: query q\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "b_gold", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "The set of manually labeled ground-truth blocks required to answer a given query.", + "source_ids": [ + 234 + ], + "id": "Name: b_gold\nType: 
PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "b_ret", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "The set of unique blocks retrieved by the system for a given query.", + "source_ids": [ + 234 + ], + "id": "Name: b_ret\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "recall_ret", + "entity_type": "EVALUATION_METRIC", + "description": "The specific mathematical formula defined in this section to calculate retrieval recall, handling parsing errors.", + "source_ids": [ + 234 + ], + "id": "Name: recall_ret\nType: EVALUATION_METRIC" + }, + { + "entity_name": "formula (20)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the recall metric r_et as a conditional value based on parsing errors and set intersections. LaTeX: Recall 𝑟𝑒𝑡 = ( 0 if parsing error occurs on B 𝑔𝑜𝑙𝑑 | B 𝑟𝑒𝑡 ∩B 𝑔𝑜𝑙𝑑 | | B 𝑔𝑜𝑙𝑑 | otherwise (20)", + "source_ids": [ + 235 + ], + "id": "Name: formula (20)\nType: EQUATION_OR_FORMULA" + }, + { + "entity_name": "ground truth block", + "entity_type": "TASK_OR_PROBLEM", + "description": "a ground truth block is a specific unit of data that may be lost during parsing", + "source_ids": [ + 236 + ], + "id": "Name: ground truth block\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "candidate pool", + "entity_type": "DATASET_OR_CORPUS", + "description": "the candidate pool is a collection of items from which blocks are retrieved", + "source_ids": [ + 236 + ], + "id": "Name: candidate pool\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "recall", + "entity_type": "EVALUATION_METRIC", + "description": "recall is an evaluation metric used to measure the contribution of retrieved blocks", + "source_ids": [ + 236 + ], + "id": "Name: recall\nType: EVALUATION_METRIC" + }, + { + "entity_name": "0", + "entity_type": "NUMBER", + "description": "0 is the specific numerical value representing the recall contribution when a block is lost", + "source_ids": [ + 236 + ], + "id": "Name: 0\nType: NUMBER" + }, + { + 
"entity_name": "a.2 implementation details", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Experimental Details' and following 'Evaluation Metrics', this section provides the specific technical configurations, software environments, and parameter settings used to realize the BookRAG system.", + "source_ids": [ + 237 + ], + "id": "Name: a.2 implementation details\nType: SECTION_TITLE" + }, + { + "entity_name": "python", + "entity_type": "PROGRAMMING_LANGUAGE", + "description": "python is the programming language used to implement bookrag", + "source_ids": [ + 238 + ], + "id": "Name: python\nType: PROGRAMMING_LANGUAGE" + }, + { + "entity_name": "vlm", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "vlm stands for vision language model a type of model within the qwen family used in the experiments", + "source_ids": [ + 238 + ], + "id": "Name: vlm\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "embedding models", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "embedding models are a type of model within the qwen family used for text and multi modal embedding", + "source_ids": [ + 238 + ], + "id": "Name: embedding models\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "qwen3 8b", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "qwen3 8b is the default llm used in the experiments", + "source_ids": [ + 238 + ], + "id": "Name: qwen3 8b\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "qwen2 5vl 30b", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "qwen2 5vl 30b is the vision language model vlm used in the experiments", + "source_ids": [ + 238 + ], + "id": "Name: qwen2 5vl 30b\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "qwen3 embedding 0 6b", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "qwen3 embedding 0 6b is the model used for text embedding", + "source_ids": [ + 238 + ], + "id": "Name: qwen3 embedding 0 6b\nType: MODEL_OR_ARCHITECTURE" + }, + { + 
"entity_name": "gme qwen2 vl 2b instruct", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "gme qwen2 vl 2b instruct is the model used for multi modal embedding", + "source_ids": [ + 238 + ], + "id": "Name: gme qwen2 vl 2b instruct\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "qwen3 reranker 4b", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "qwen3 reranker 4b is the model used for reranking", + "source_ids": [ + 238 + ], + "id": "Name: qwen3 reranker 4b\nType: MODEL_OR_ARCHITECTURE" + }, + { + "entity_name": "linux", + "entity_type": "SOFTWARE", + "description": "linux is the operating system on which the experiments were conducted", + "source_ids": [ + 238 + ], + "id": "Name: linux\nType: SOFTWARE" + }, + { + "entity_name": "intel xeon 2 0ghz cpu", + "entity_type": "HARDWARE", + "description": "intel xeon 2 0ghz cpu is the processor used in the high performance server", + "source_ids": [ + 238 + ], + "id": "Name: intel xeon 2 0ghz cpu\nType: HARDWARE" + }, + { + "entity_name": "nvidia geforce rtx a5000", + "entity_type": "HARDWARE", + "description": "nvidia geforce rtx a5000 is the gpu model used in the high performance server", + "source_ids": [ + 238 + ], + "id": "Name: nvidia geforce rtx a5000\nType: HARDWARE" + }, + { + "entity_name": "1024gb", + "entity_type": "MEASUREMENT", + "description": "1024gb refers to the amount of memory in the server", + "source_ids": [ + 238 + ], + "id": "Name: 1024gb\nType: MEASUREMENT" + }, + { + "entity_name": "24 gb", + "entity_type": "MEASUREMENT", + "description": "24 gb refers to the vram capacity of each gpu", + "source_ids": [ + 238 + ], + "id": "Name: 24 gb\nType: MEASUREMENT" + }, + { + "entity_name": "500 tokens", + "entity_type": "MEASUREMENT", + "description": "500 tokens is the standardized chunk size used for document chunking", + "source_ids": [ + 238 + ], + "id": "Name: 500 tokens\nType: MEASUREMENT" + }, + { + "entity_name": "10b parameter scale", + "entity_type": 
"MEASUREMENT", + "description": "10b parameter scale is the size range of models primarily selected to balance efficiency and effectiveness", + "source_ids": [ + 238 + ], + "id": "Name: 10b parameter scale\nType: MEASUREMENT" + }, + { + "entity_name": "30b version", + "entity_type": "MEASUREMENT", + "description": "the 30b version refers to the specific size of the vlm adopted due to performance deficits in the 8b counterpart", + "source_ids": [ + 238 + ], + "id": "Name: 30b version\nType: MEASUREMENT" + }, + { + "entity_name": "8b counterpart", + "entity_type": "MEASUREMENT", + "description": "the 8b counterpart refers to the smaller version of the vlm that exhibited significant performance deficits", + "source_ids": [ + 238 + ], + "id": "Name: 8b counterpart\nType: MEASUREMENT" + }, + { + "entity_name": "github repository", + "entity_type": "LOCATION", + "description": "the github repository is the location where source code and implementation configurations are publicly available", + "source_ids": [ + 238 + ], + "id": "Name: github repository\nType: LOCATION" + }, + { + "entity_name": "https github com sam234990 bookrag", + "entity_type": "LOCATION", + "description": "https github com sam234990 bookrag is the specific url of the repository", + "source_ids": [ + 238 + ], + "id": "Name: https github com sam234990 bookrag\nType: LOCATION" + }, + { + "entity_name": "baseline methods", + "entity_type": "TASK_OR_PROBLEM", + "description": "baseline methods refer to the existing methods used for fair comparison against bookrag", + "source_ids": [ + 238 + ], + "id": "Name: baseline methods\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "ground truth images", + "entity_type": "IMAGE", + "description": "ground truth images are the correct reference images provided to the models during evaluation", + "source_ids": [ + 238 + ], + "id": "Name: ground truth images\nType: IMAGE" + }, + { + "entity_name": "document chunking", + "entity_type": "METHOD_OR_TECHNIQUE", + 
"description": "document chunking is a technique used to split documents into smaller parts for processing", + "source_ids": [ + 238 + ], + "id": "Name: document chunking\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "retrieval ranking", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "retrieval ranking is a technique used to order retrieved candidates based on relevance", + "source_ids": [ + 238 + ], + "id": "Name: retrieval ranking\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "sequential processing mode", + "entity_type": "TASK_OR_PROBLEM", + "description": "sequential processing mode is the execution mode used to ensure fair comparison of efficiency", + "source_ids": [ + 238 + ], + "id": "Name: sequential processing mode\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "candidate pool", + "entity_type": "TASK_OR_PROBLEM", + "description": "the candidate pool refers to the set of items retrieved for ranking standardized across baselines", + "source_ids": [ + 238 + ], + "id": "Name: candidate pool\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "efficiency", + "entity_type": "CONCEPT", + "description": "efficiency is a key metric balanced against effectiveness in model selection and execution", + "source_ids": [ + 238 + ], + "id": "Name: efficiency\nType: CONCEPT" + }, + { + "entity_name": "effectiveness", + "entity_type": "CONCEPT", + "description": "effectiveness is a key metric balanced against efficiency in model selection and execution", + "source_ids": [ + 238 + ], + "id": "Name: effectiveness\nType: CONCEPT" + }, + { + "entity_name": "performance deficits", + "entity_type": "CONCEPT", + "description": "performance deficits describe the failure of the 8b vlm counterpart to answer correctly", + "source_ids": [ + 238 + ], + "id": "Name: performance deficits\nType: CONCEPT" + }, + { + "entity_name": "reproducibility", + "entity_type": "CONCEPT", + "description": "reproducibility is the goal achieved by making source code and 
configurations publicly available", + "source_ids": [ + 238 + ], + "id": "Name: reproducibility\nType: CONCEPT" + }, + { + "entity_name": "fair comparison", + "entity_type": "CONCEPT", + "description": "fair comparison is the objective driving the use of unified models and standardized parameters", + "source_ids": [ + 238 + ], + "id": "Name: fair comparison\nType: CONCEPT" + }, + { + "entity_name": "text embedding", + "entity_type": "TASK_OR_PROBLEM", + "description": "text embedding is the task performed by the qwen3 embedding 0 6b model", + "source_ids": [ + 238 + ], + "id": "Name: text embedding\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "multi modal embedding", + "entity_type": "TASK_OR_PROBLEM", + "description": "multi modal embedding is the task performed by the gme qwen2 vl 2b instruct model", + "source_ids": [ + 238 + ], + "id": "Name: multi modal embedding\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "reranking", + "entity_type": "TASK_OR_PROBLEM", + "description": "reranking is the task performed by the qwen3 reranker 4b model", + "source_ids": [ + 238 + ], + "id": "Name: reranking\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "high performance server", + "entity_type": "LOCATION", + "description": "the high performance server is the physical location where all experiments were conducted", + "source_ids": [ + 238 + ], + "id": "Name: high performance server\nType: LOCATION" + }, + { + "entity_name": "implementation configurations", + "entity_type": "PRODUCT", + "description": "implementation configurations refer to the detailed settings used to run the experiments", + "source_ids": [ + 238 + ], + "id": "Name: implementation configurations\nType: PRODUCT" + }, + { + "entity_name": "reference 52", + "entity_type": "PUBLICATION_VENUE", + "description": "reference 52 is the citation for the mineru tool", + "source_ids": [ + 238 + ], + "id": "Name: reference 52\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "reference 4", + "entity_type": 
"PUBLICATION_VENUE", + "description": "reference 4 is the citation for the qwen2 5vl 30b model", + "source_ids": [ + 238 + ], + "id": "Name: reference 4\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "reference 60", + "entity_type": "PUBLICATION_VENUE", + "description": "reference 60 is the citation for the qwen3 8b model", + "source_ids": [ + 238 + ], + "id": "Name: reference 60\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "reference 63", + "entity_type": "PUBLICATION_VENUE", + "description": "reference 63 is the citation for the gme qwen2 vl 2b instruct model", + "source_ids": [ + 238 + ], + "id": "Name: reference 63\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "reference 64", + "entity_type": "PUBLICATION_VENUE", + "description": "reference 64 is the citation for the qwen3 embedding 0 6b and qwen3 reranker 4b models", + "source_ids": [ + 238 + ], + "id": "Name: reference 64\nType: PUBLICATION_VENUE" + }, + { + "entity_name": "a.3 prompts", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Experimental Details' within the BookRAG paper, this section details the specific text prompts engineered and utilized to guide the Retrieval-Augmented Generation (RAG) system in processing complex documents.", + "source_ids": [ + 239 + ], + "id": "Name: a.3 prompts\nType: SECTION_TITLE" + }, + { + "entity_name": "prompts", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Refers to the structured input instructions provided to the language model to elicit specific behaviors or outputs, as defined in section A.3.", + "source_ids": [ + 239 + ], + "id": "Name: prompts\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "agent based query classification", + "entity_type": "TASK_OR_PROBLEM", + "description": "agent based query classification is a task for which prompts are designed as illustrated in figure 10", + "source_ids": [ + 240 + ], + "id": "Name: agent based query classification\nType: TASK_OR_PROBLEM" + }, + { + 
"entity_name": "question decomposition", + "entity_type": "TASK_OR_PROBLEM", + "description": "question decomposition is a task for which prompts are designed as illustrated in figure 11", + "source_ids": [ + 240 + ], + "id": "Name: question decomposition\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "filter operator generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "Filter operator generation is a task for which prompts are designed, as illustrated in figure 12.", + "source_ids": [ + 240, + 259 + ], + "id": "Name: filter operator generation\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "entity resolution judgment", + "entity_type": "TASK_OR_PROBLEM", + "description": "entity resolution judgment is a task for which a prompt is employed during the graph construction phase as illustrated in figure 13", + "source_ids": [ + 240 + ], + "id": "Name: entity resolution judgment\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "graph construction phase", + "entity_type": "TASK_OR_PROBLEM", + "description": "graph construction phase is the specific phase during which entity resolution judgment is performed", + "source_ids": [ + 240 + ], + "id": "Name: graph construction phase\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "prompts", + "entity_type": "PRODUCT", + "description": "prompts are the specific designed items mentioned in the text for various tasks", + "source_ids": [ + 240 + ], + "id": "Name: prompts\nType: PRODUCT" + }, + { + "entity_name": "figure 10", + "entity_type": "IMAGE", + "description": "Figure 10 is a visual element illustrating prompts for agent-based query classification, serving as an image within the text that displays a prompt for query classification.", + "source_ids": [ + 240, + 253 + ], + "id": "Name: figure 10\nType: IMAGE" + }, + { + "entity_name": "figure 11", + "entity_type": "IMAGE", + "description": "Figure 11 is a visual element in the text that displays a prompt for query decomposition, illustrating the process 
of question decomposition.", + "source_ids": [ + 240, + 256 + ], + "id": "Name: figure 11\nType: IMAGE" + }, + { + "entity_name": "figure 12", + "entity_type": "IMAGE", + "description": "Figure 12 is a visual element and image within the text that illustrates and displays the prompt for filter operator generation.", + "source_ids": [ + 240, + 259 + ], + "id": "Name: figure 12\nType: IMAGE" + }, + { + "entity_name": "figure 13", + "entity_type": "IMAGE", + "description": "Figure 13 is a visual element and an image containing a prompt for entity resolution judgment.", + "source_ids": [ + 240, + 284 + ], + "id": "Name: figure 13\nType: IMAGE" + }, + { + "entity_name": "expert query analyzer", + "entity_type": "PERSON", + "description": "an expert query analyzer is a role described as someone tasked with classifying user questions into specific categories", + "source_ids": [ + 241 + ], + "id": "Name: expert query analyzer\nType: PERSON" + }, + { + "entity_name": "simple", + "entity_type": "TASK_OR_PROBLEM", + "description": "simple is one of the three categories used to classify user questions", + "source_ids": [ + 241 + ], + "id": "Name: simple\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "user", + "entity_type": "PERSON", + "description": "The user is the entity providing the query to the AI assistant, whose questions are being classified by the expert query analyzer.", + "source_ids": [ + 241, + 258 + ], + "id": "Name: user\nType: PERSON" + }, + { + "entity_name": "json object", + "entity_type": "FILE_TYPE", + "description": "The json object is the required format for responses from the expert query analyzer, the AI assistant, and systems containing filters and operations. 
It serves as the standard output structure for various tasks, including responses that contain a single key named \"sub_questions\" with a list of objects, outputs that include the ID of a matching candidate along with an explanation, and other structured data formats required by different components of the system.", + "source_ids": [ + 241, + 258, + 262, + 255 + ], + "id": "Name: json object\nType: FILE_TYPE" + }, + { + "entity_name": "category definitions", + "entity_type": "SECTION_TITLE", + "description": "category definitions is the title of the section containing definitions for entity types", + "source_ids": [ + 242 + ], + "id": "Name: category definitions\nType: SECTION_TITLE" + }, + { + "entity_name": "information", + "entity_type": "CONCEPT", + "description": "information is the data retrieved to answer the question", + "source_ids": [ + 243 + ], + "id": "Name: information\nType: CONCEPT" + }, + { + "entity_name": "document", + "entity_type": "CONCEPT", + "description": "document is the source material containing the information", + "source_ids": [ + 243 + ], + "id": "Name: document\nType: CONCEPT" + }, + { + "entity_name": "paragraph", + "entity_type": "SECTION_TITLE", + "description": "paragraph is an example of a contiguous location within a document", + "source_ids": [ + 243 + ], + "id": "Name: paragraph\nType: SECTION_TITLE" + }, + { + "entity_name": "table", + "entity_type": "SECTION_TITLE", + "description": "table is an example of a contiguous location within a document", + "source_ids": [ + 243 + ], + "id": "Name: table\nType: SECTION_TITLE" + }, + { + "entity_name": "figure", + "entity_type": "SECTION_TITLE", + "description": "figure is an example of a contiguous location within a document", + "source_ids": [ + 243 + ], + "id": "Name: figure\nType: SECTION_TITLE" + }, + { + "entity_name": "single", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 243 + ], + "id": "Name: single\nType: UNKNOWN" + }, + { + "entity_name": 
"contiguous location", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 243 + ], + "id": "Name: contiguous location\nType: UNKNOWN" + }, + { + "entity_name": "5", + "entity_type": "PERCENTAGE", + "description": "5 represents a specific portion of the latino population mentioned in the context of economic upward mobility", + "source_ids": [ + 246 + ], + "id": "Name: 5\nType: PERCENTAGE" + }, + { + "entity_name": "latinos", + "entity_type": "NATIONALITY", + "description": "latinos are the demographic group whose views on economic upward mobility for their children are being queried", + "source_ids": [ + 246 + ], + "id": "Name: latinos\nType: NATIONALITY" + }, + { + "entity_name": "economic upward mobility", + "entity_type": "TASK_OR_PROBLEM", + "description": "economic upward mobility is the specific issue regarding the children of latinos that is the subject of the inquiry", + "source_ids": [ + 246 + ], + "id": "Name: economic upward mobility\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "children", + "entity_type": "PERSON", + "description": "children are the offspring of the latinos whose economic upward mobility is being discussed", + "source_ids": [ + 246 + ], + "id": "Name: children\nType: PERSON" + }, + { + "entity_name": "personality vector", + "entity_type": "TASK_OR_PROBLEM", + "description": "the personality vector is a concept mentioned in a question regarding its color indicating it is a complex retrieval task", + "source_ids": [ + 249 + ], + "id": "Name: personality vector\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "counting", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "counting is an example of an aggregation operation mentioned in the text", + "source_ids": [ + 250 + ], + "id": "Name: counting\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "listing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "listing is an example of an aggregation operation mentioned in the text", + "source_ids": [ + 
250 + ], + "id": "Name: listing\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "summarizing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "summarizing is an example of an aggregation operation mentioned in the text", + "source_ids": [ + 250 + ], + "id": "Name: summarizing\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "structural filter", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "structural filter is a clear filter used to identify items in the set for the global question", + "source_ids": [ + 250 + ], + "id": "Name: structural filter\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "aggregation operation", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 250 + ], + "id": "Name: aggregation operation\nType: UNKNOWN" + }, + { + "entity_name": "items", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 250 + ], + "id": "Name: items\nType: UNKNOWN" + }, + { + "entity_name": "example", + "entity_type": "TASK_OR_PROBLEM", + "description": "example is a task or problem asking how many tables are in the document", + "source_ids": [ + 251 + ], + "id": "Name: example\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "16", + "entity_type": "MEASUREMENT", + "description": "16 is a numerical value mentioned in the text potentially representing a count date or measurement", + "source_ids": [ + 254 + ], + "id": "Name: 16\nType: MEASUREMENT" + }, + { + "entity_name": "user a2gbifl43u1lkj", + "entity_type": "PERSON", + "description": "user a2gbifl43u1lkj is a specific user referenced in the example query regarding personality vectors and receptiviti scores", + "source_ids": [ + 255 + ], + "id": "Name: user a2gbifl43u1lkj\nType: PERSON" + }, + { + "entity_name": "foreign born latinos", + "entity_type": "PERSON", + "description": "foreign born latinos are a demographic group mentioned in the example query regarding population surveys", + "source_ids": [ + 255 + ], + "id": "Name: foreign born 
latinos\nType: PERSON" + }, + { + "entity_name": "latinos interviewed by cellphone", + "entity_type": "PERSON", + "description": "latinos interviewed by cellphone are a demographic group mentioned in the example query regarding population surveys", + "source_ids": [ + 255 + ], + "id": "Name: latinos interviewed by cellphone\nType: PERSON" + }, + { + "entity_name": "soft labeled personality embedding matrix", + "entity_type": "PRODUCT", + "description": "the soft labeled personality embedding matrix is a data structure containing personality vectors and their associated colors", + "source_ids": [ + 255 + ], + "id": "Name: soft labeled personality embedding matrix\nType: PRODUCT" + }, + { + "entity_name": "receptiviti score", + "entity_type": "EVALUATION_METRIC", + "description": "the receptiviti score is a metric used to evaluate personality vectors in the context of the example query", + "source_ids": [ + 255 + ], + "id": "Name: receptiviti score\nType: EVALUATION_METRIC" + }, + { + "entity_name": "population", + "entity_type": "MEASUREMENT", + "description": "population refers to the count of individuals in a specific demographic group within a survey", + "source_ids": [ + 255 + ], + "id": "Name: population\nType: MEASUREMENT" + }, + { + "entity_name": "query decomposition expert", + "entity_type": "PROFESSION", + "description": "the query decomposition expert is the role assigned to the ai to break down complex questions into atomic sub questions", + "source_ids": [ + 255 + ], + "id": "Name: query decomposition expert\nType: PROFESSION" + }, + { + "entity_name": "complex question", + "entity_type": "TASK_OR_PROBLEM", + "description": "a complex question is the input task that needs to be broken down into simple sub questions", + "source_ids": [ + 255 + ], + "id": "Name: complex question\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "simple atomic sub questions", + "entity_type": "TASK_OR_PROBLEM", + "description": "simple atomic sub questions are the output 
components of the decomposition process each being a direct information retrieval task", + "source_ids": [ + 255 + ], + "id": "Name: simple atomic sub questions\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "retrieval sub question", + "entity_type": "TASK_OR_PROBLEM", + "description": "a retrieval sub question is a specific type of sub question that requires looking up a specific fact number or value in the document", + "source_ids": [ + 255 + ], + "id": "Name: retrieval sub question\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "synthesis question", + "entity_type": "TASK_OR_PROBLEM", + "description": "a synthesis question is a specific type of sub question that requires comparing calculating or combining answers from previous retrieval questions", + "source_ids": [ + 255 + ], + "id": "Name: synthesis question\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "sub questions", + "entity_type": "SECTION_TITLE", + "description": "the sub questions key is the container within the json object that holds the list of decomposed questions", + "source_ids": [ + 255 + ], + "id": "Name: sub questions\nType: SECTION_TITLE" + }, + { + "entity_name": "question", + "entity_type": "SECTION_TITLE", + "description": "the question key within each sub question object holds the string of the actual question", + "source_ids": [ + 255 + ], + "id": "Name: question\nType: SECTION_TITLE" + }, + { + "entity_name": "type", + "entity_type": "SECTION_TITLE", + "description": "the type key within each sub question object specifies whether the question is retrieval or synthesis", + "source_ids": [ + 255 + ], + "id": "Name: type\nType: SECTION_TITLE" + }, + { + "entity_name": "example 1", + "entity_type": "EVENT", + "description": "example 1 is a demonstration of correct decomposition with independent lookups provided in the text", + "source_ids": [ + 255 + ], + "id": "Name: example 1\nType: EVENT" + }, + { + "entity_name": "example 2", + "entity_type": "EVENT", + "description": "example 
2 is a demonstration of decomposition with retrieval and synthesis steps provided in the text", + "source_ids": [ + 255 + ], + "id": "Name: example 2\nType: EVENT" + }, + { + "entity_name": "personality vector", + "entity_type": "PRODUCT", + "description": "a personality vector is a data element within the soft labeled personality embedding matrix", + "source_ids": [ + 255 + ], + "id": "Name: personality vector\nType: PRODUCT" + }, + { + "entity_name": "color", + "entity_type": "COLOR", + "description": "color is an attribute mapped to personality vectors in the soft labeled personality embedding matrix", + "source_ids": [ + 255 + ], + "id": "Name: color\nType: COLOR" + }, + { + "entity_name": "survey", + "entity_type": "EVENT", + "description": "the survey is the context in which population data for latinos is collected in example 2", + "source_ids": [ + 255 + ], + "id": "Name: survey\nType: EVENT" + }, + { + "entity_name": "report", + "entity_type": "BOOK", + "description": "The report is a book that serves as a document referenced in example 2, containing population data, and is also cited in an example query regarding chapters.", + "source_ids": [ + 258, + 255 + ], + "id": "Name: report\nType: BOOK" + }, + { + "entity_name": "query decomposition", + "entity_type": "TASK_OR_PROBLEM", + "description": "query decomposition is the task or problem for which the prompt in figure 11 is designed", + "source_ids": [ + 256 + ], + "id": "Name: query decomposition\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "17", + "entity_type": "NUMBER", + "description": "17 is a number mentioned in the text though its specific context or role is not defined", + "source_ids": [ + 257 + ], + "id": "Name: 17\nType: NUMBER" + }, + { + "entity_name": "ai assistant", + "entity_type": "PERSON", + "description": "an ai assistant described as highly specialized with the function of analyzing a global query", + "source_ids": [ + 258 + ], + "id": "Name: ai assistant\nType: PERSON" + }, + { + 
"entity_name": "global query", + "entity_type": "TASK_OR_PROBLEM", + "description": "a query that the ai assistant is designed to analyze to determine filtering steps and aggregation operations", + "source_ids": [ + 258 + ], + "id": "Name: global query\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "filters", + "entity_type": "TASK_OR_PROBLEM", + "description": "a list of filtering steps to be applied which can include sections images tables or pages", + "source_ids": [ + 258 + ], + "id": "Name: filters\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "operation", + "entity_type": "TASK_OR_PROBLEM", + "description": "the final aggregation operation to be performed such as count list summarize or analyze", + "source_ids": [ + 258 + ], + "id": "Name: operation\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "methodology", + "entity_type": "SECTION_TITLE", + "description": "a specific section title mentioned in an example query regarding data augmentation", + "source_ids": [ + 258 + ], + "id": "Name: methodology\nType: SECTION_TITLE" + }, + { + "entity_name": "paper", + "entity_type": "BOOK", + "description": "a document referenced in an example query regarding figures on specific pages", + "source_ids": [ + 258 + ], + "id": "Name: paper\nType: BOOK" + }, + { + "entity_name": "assistant", + "entity_type": "PERSON", + "description": "the assistant is the entity responding to the user with a json object", + "source_ids": [ + 258 + ], + "id": "Name: assistant\nType: PERSON" + }, + { + "entity_name": "chapter", + "entity_type": "SECTION_TITLE", + "description": "a structural part of a document mentioned in the example about counting chapters", + "source_ids": [ + 258 + ], + "id": "Name: chapter\nType: SECTION_TITLE" + }, + { + "entity_name": "appendices", + "entity_type": "SECTION_TITLE", + "description": "a structural part of a document mentioned in the definition of section filters", + "source_ids": [ + 258 + ], + "id": "Name: appendices\nType: SECTION_TITLE" + }, 
+ { + "entity_name": "data augmentation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "a specific topic discussed in the methodology section in the example query", + "source_ids": [ + 258 + ], + "id": "Name: data augmentation\nType: METHOD_OR_TECHNIQUE" + }, + { + "entity_name": "discussion", + "entity_type": "TASK_OR_PROBLEM", + "description": "the content regarding data augmentation that needs to be summarized", + "source_ids": [ + 258 + ], + "id": "Name: discussion\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "3 10", + "entity_type": "MEASUREMENT", + "description": "a specific page range mentioned as a filter value in the example query", + "source_ids": [ + 258 + ], + "id": "Name: 3 10\nType: MEASUREMENT" + }, + { + "entity_name": "count", + "entity_type": "TASK_OR_PROBLEM", + "description": "an aggregation operation used to count items", + "source_ids": [ + 258 + ], + "id": "Name: count\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "list", + "entity_type": "TASK_OR_PROBLEM", + "description": "an aggregation operation used to list items", + "source_ids": [ + 258 + ], + "id": "Name: list\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "summarize", + "entity_type": "TASK_OR_PROBLEM", + "description": "an aggregation operation used to summarize content", + "source_ids": [ + 258 + ], + "id": "Name: summarize\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "analyze", + "entity_type": "TASK_OR_PROBLEM", + "description": "an aggregation operation used to analyze content", + "source_ids": [ + 258 + ], + "id": "Name: analyze\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "page", + "entity_type": "MEASUREMENT", + "description": "a filter type used for specific page numbers", + "source_ids": [ + 258 + ], + "id": "Name: page\nType: MEASUREMENT" + }, + { + "entity_name": "null", + "entity_type": "TASK_OR_PROBLEM", + "description": "a value indicating that no specific value is provided for image or table filters", + "source_ids": [ + 258 + ], + 
"id": "Name: null\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "18", + "entity_type": "NUMBER", + "description": "18 is a number mentioned in the text though its specific context or meaning is not provided", + "source_ids": [ + 260 + ], + "id": "Name: 18\nType: NUMBER" + }, + { + "entity_name": "entity resolution adjudicator", + "entity_type": "PERSON", + "description": "entity resolution adjudicator is an expert role tasked with determining if a new entity refers to the same real world concept as candidate entities", + "source_ids": [ + 262 + ], + "id": "Name: entity resolution adjudicator\nType: PERSON" + }, + { + "entity_name": "candidate entities", + "entity_type": "TASK_OR_PROBLEM", + "description": "candidate entities are a list of semantically similar entities retrieved from an existing knowledge base for comparison", + "source_ids": [ + 262 + ], + "id": "Name: candidate entities\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "knowledge graph", + "entity_type": "TASK_OR_PROBLEM", + "description": "knowledge graph is the existing database from which candidate entities are retrieved", + "source_ids": [ + 262 + ], + "id": "Name: knowledge graph\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "knowledge base", + "entity_type": "TASK_OR_PROBLEM", + "description": "knowledge base is the source of semantically similar candidate entities", + "source_ids": [ + 262 + ], + "id": "Name: knowledge base\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "id", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "The id is a unique identifier used to reference candidate entities in the output and serves as the identifier for the candidate determined to be an exact match.", + "source_ids": [ + 276, + 262 + ], + "id": "Name: id\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "1", + "entity_type": "VALUE", + "description": "1 is a specific value indicating that no matching candidate was found for the new entity", + "source_ids": [ + 262 + ], + "id": 
"Name: 1\nType: VALUE" + }, + { + "entity_name": "text", + "entity_type": "DATASET_OR_CORPUS", + "description": "text is the source material from which the new entity is recently extracted", + "source_ids": [ + 262 + ], + "id": "Name: text\nType: DATASET_OR_CORPUS" + }, + { + "entity_name": "explanation", + "entity_type": "TASK_OR_PROBLEM", + "description": "An explanation is a brief, one-sentence string that serves as a task or problem by providing the reasoning behind a decision, specifically justifying the outcome in contexts such as entity matching.", + "source_ids": [ + 277, + 262 + ], + "id": "Name: explanation\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "field by field adjudication", + "entity_type": "TASK_OR_PROBLEM", + "description": "field by field adjudication is a task described as a method to determine a match by evaluating each field with a specific focus", + "source_ids": [ + 266 + ], + "id": "Name: field by field adjudication\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "entity name", + "entity_type": "TASK_OR_PROBLEM", + "description": "entity name is a placeholder term used to denote the name of an entity in the context of matching criteria", + "source_ids": [ + 267 + ], + "id": "Name: entity name\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "large language model", + "entity_type": "TECHNOLOGY", + "description": "large language model is the full form of the abbreviation llm used as an example of a direct abbreviation match", + "source_ids": [ + 267 + ], + "id": "Name: large language model\nType: TECHNOLOGY" + }, + { + "entity_name": "event detection", + "entity_type": "TASK_OR_PROBLEM", + "description": "event detection is a task mentioned as a distinct concept that should not be matched with named entity recognition", + "source_ids": [ + 267 + ], + "id": "Name: event detection\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "named entity recognition", + "entity_type": "TASK_OR_PROBLEM", + "description": "named entity recognition 
is a task mentioned as a distinct concept that should not be matched with event detection", + "source_ids": [ + 267 + ], + "id": "Name: named entity recognition\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "high importance", + "entity_type": "CONCEPT", + "description": "high importance is a criterion mentioned for determining the similarity of entity names", + "source_ids": [ + 267 + ], + "id": "Name: high importance\nType: CONCEPT" + }, + { + "entity_name": "distinct concepts", + "entity_type": "CONCEPT", + "description": "distinct concepts refers to parallel concepts that are explicitly excluded from being considered a match", + "source_ids": [ + 267 + ], + "id": "Name: distinct concepts\nType: CONCEPT" + }, + { + "entity_name": "entity type", + "entity_type": "TASK_OR_PROBLEM", + "description": "entity type is a task or problem described as having medium importance in the context of type compatibility", + "source_ids": [ + 268 + ], + "id": "Name: entity type\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "description", + "entity_type": "CONCEPT", + "description": "description refers to the contextual importance of text segments which may differ as they are extracted from different parts of a document", + "source_ids": [ + 269 + ], + "id": "Name: description\nType: CONCEPT" + }, + { + "entity_name": "contextual importance", + "entity_type": "CONCEPT", + "description": "contextual importance is a property of descriptions that requires looking past surface level text similarity to determine if they describe the same underlying object or concept", + "source_ids": [ + 269 + ], + "id": "Name: contextual importance\nType: CONCEPT" + }, + { + "entity_name": "be strict and conservative", + "entity_type": "TASK_OR_PROBLEM", + "description": "be strict and conservative is a guideline or instruction regarding the standard for matching emphasizing high standards to avoid corrupting the knowledge graph", + "source_ids": [ + 270 + ], + "id": "Name: be strict and 
conservative\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "apple", + "entity_type": "PRODUCT", + "description": "apple is mentioned as an example of a fruit", + "source_ids": [ + 272 + ], + "id": "Name: apple\nType: PRODUCT" + }, + { + "entity_name": "apple inc", + "entity_type": "ORGANIZATION", + "description": "apple inc is mentioned as an example of a company", + "source_ids": [ + 272 + ], + "id": "Name: apple inc\nType: ORGANIZATION" + }, + { + "entity_name": "when in doubt", + "entity_type": "TASK_OR_PROBLEM", + "description": "when in doubt is a condition mentioned in the text that triggers a specific output requirement", + "source_ids": [ + 273 + ], + "id": "Name: when in doubt\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "1", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 273 + ], + "id": "Name: 1\nType: UNKNOWN" + }, + { + "entity_name": "json", + "entity_type": "FILE_TYPE", + "description": "json is a file format mentioned as the required output format for the answer", + "source_ids": [ + 275 + ], + "id": "Name: json\nType: FILE_TYPE" + }, + { + "entity_name": "output", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 275 + ], + "id": "Name: output\nType: UNKNOWN" + }, + { + "entity_name": "select id", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "The select id is a parameter or variable defined as an integer within the provided text structure, representing the integer identifier for a candidate that has been determined to be an exact match.", + "source_ids": [ + 281, + 276 + ], + "id": "Name: select id\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "exact match", + "entity_type": "TASK_OR_PROBLEM", + "description": "exact match refers to the condition where a candidate is determined to be identical to a reference", + "source_ids": [ + 276 + ], + "id": "Name: exact match\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "1", + "entity_type": "MONEY", + "description": "1 is a 
specific integer value used to indicate that no exact match was found", + "source_ids": [ + 276 + ], + "id": "Name: 1\nType: MONEY" + }, + { + "entity_name": "candidate", + "entity_type": "TASK_OR_PROBLEM", + "description": "candidate refers to an item being evaluated to determine if it is an exact match", + "source_ids": [ + 276 + ], + "id": "Name: candidate\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "integer", + "entity_type": "MEASUREMENT", + "description": "An integer is a data type specified for the select id value and represents the specific type of output requested for the selection task.", + "source_ids": [ + 282, + 276 + ], + "id": "Name: integer\nType: MEASUREMENT" + }, + { + "entity_name": "explanation", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "explanation is a parameter or variable defined as a string in the provided text structure", + "source_ids": [ + 281 + ], + "id": "Name: explanation\nType: PARAMETER_OR_VARIABLE" + }, + { + "entity_name": "example 1", + "entity_type": "TASK_OR_PROBLEM", + "description": "example 1 is a task or problem scenario where a match was found", + "source_ids": [ + 281 + ], + "id": "Name: example 1\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "example 2", + "entity_type": "TASK_OR_PROBLEM", + "description": "example 2 is a task or problem scenario where no match was found", + "source_ids": [ + 281 + ], + "id": "Name: example 2\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "selection task", + "entity_type": "TASK_OR_PROBLEM", + "description": "the selection task is the activity described in the text that requires processing the provided data", + "source_ids": [ + 282 + ], + "id": "Name: selection task\nType: TASK_OR_PROBLEM" + }, + { + "entity_name": "examples", + "entity_type": "DATASET_OR_CORPUS", + "description": "examples are data instances that were omitted from the text due to space constraints", + "source_ids": [ + 284 + ], + "id": "Name: examples\nType: DATASET_OR_CORPUS" + }, + { + 
"entity_name": "19", + "entity_type": "NUMBER", + "description": "19 is a number mentioned in the text though its specific context or meaning is not provided", + "source_ids": [ + 285 + ], + "id": "Name: 19\nType: NUMBER" + } + ], + "links": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "bookrag: a hierarchical structure-aware index-based approach for retrieval-augmented generation on complex documents", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'BookRAG' is the primary subject defined in the main title.", + "source_ids": [ + 1 + ], + "source": "Name: bookrag: a hierarchical structure-aware index-based approach for retrieval-augmented generation on complex documents\nType: SECTION_TITLE", + "target": "Name: bookrag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "hierarchical structure-aware index-based approach", + "tgt_entity_name": "bookrag: a hierarchical structure-aware index-based approach for retrieval-augmented generation on complex documents", + "relation_name": "", + "weight": 10.0, + "description": "The methodological approach is a key component described in the main title.", + "source_ids": [ + 1 + ], + "source": "Name: bookrag: a hierarchical structure-aware index-based approach for retrieval-augmented generation on complex documents\nType: SECTION_TITLE", + "target": "Name: hierarchical structure-aware index-based approach\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "retrieval-augmented generation", + "tgt_entity_name": "bookrag: a hierarchical structure-aware index-based approach for retrieval-augmented generation on complex documents", + "relation_name": "", + "weight": 10.0, + "description": "The application domain or task is a central theme of the main title.", + "source_ids": [ + 1 + ], + "source": "Name: bookrag: a hierarchical structure-aware index-based approach for retrieval-augmented generation on complex documents\nType: SECTION_TITLE", + "target": "Name: 
retrieval-augmented generation\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "complex documents", + "tgt_entity_name": "bookrag: a hierarchical structure-aware index-based approach for retrieval-augmented generation on complex documents", + "relation_name": "", + "weight": 10.0, + "description": "The target data scope is explicitly mentioned as a focus area in the main title.", + "source_ids": [ + 1 + ], + "source": "Name: bookrag: a hierarchical structure-aware index-based approach for retrieval-augmented generation on complex documents\nType: SECTION_TITLE", + "target": "Name: complex documents\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "shu wang", + "tgt_entity_name": "the chinese university of hong kong shenzhen", + "relation_name": "", + "weight": 10.0, + "description": "shu wang is affiliated with the chinese university of hong kong shenzhen", + "source_ids": [ + 2 + ], + "source": "Name: shu wang\nType: PERSON", + "target": "Name: the chinese university of hong kong shenzhen\nType: ORGANIZATION" + }, + { + "src_entity_name": "shu wang", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "shu wang is an author of the bookrag paper", + "source_ids": [ + 5 + ], + "source": "Name: shu wang\nType: PERSON", + "target": "Name: bookrag\nType: PRODUCT" + }, + { + "src_entity_name": "shu wang", + "tgt_entity_name": "yingli zhou", + "relation_name": "", + "weight": 8.0, + "description": "shu wang and yingli zhou are co authors on the bookrag paper", + "source_ids": [ + 5 + ], + "source": "Name: shu wang\nType: PERSON", + "target": "Name: yingli zhou\nType: PERSON" + }, + { + "src_entity_name": "shu wang", + "tgt_entity_name": "yixiang fang", + "relation_name": "", + "weight": 8.0, + "description": "shu wang and yixiang fang are co authors on the bookrag paper", + "source_ids": [ + 5 + ], + "source": "Name: shu wang\nType: PERSON", + "target": "Name: yixiang fang\nType: PERSON" + }, + { + "src_entity_name": 
"yingli zhou", + "tgt_entity_name": "the chinese university of hong kong shenzhen", + "relation_name": "", + "weight": 10.0, + "description": "yingli zhou is affiliated with the chinese university of hong kong shenzhen", + "source_ids": [ + 2 + ], + "source": "Name: yingli zhou\nType: PERSON", + "target": "Name: the chinese university of hong kong shenzhen\nType: ORGANIZATION" + }, + { + "src_entity_name": "yingli zhou", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "yingli zhou is an author of the bookrag paper", + "source_ids": [ + 5 + ], + "source": "Name: yingli zhou\nType: PERSON", + "target": "Name: bookrag\nType: PRODUCT" + }, + { + "src_entity_name": "yingli zhou", + "tgt_entity_name": "yixiang fang", + "relation_name": "", + "weight": 8.0, + "description": "yingli zhou and yixiang fang are co authors on the bookrag paper", + "source_ids": [ + 5 + ], + "source": "Name: yingli zhou\nType: PERSON", + "target": "Name: yixiang fang\nType: PERSON" + }, + { + "src_entity_name": "yixiang fang", + "tgt_entity_name": "the chinese university of hong kong shenzhen", + "relation_name": "", + "weight": 10.0, + "description": "yixiang fang is affiliated with the chinese university of hong kong shenzhen", + "source_ids": [ + 2 + ], + "source": "Name: yixiang fang\nType: PERSON", + "target": "Name: the chinese university of hong kong shenzhen\nType: ORGANIZATION" + }, + { + "src_entity_name": "yixiang fang", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "yixiang fang is an author of the bookrag paper", + "source_ids": [ + 5 + ], + "source": "Name: yixiang fang\nType: PERSON", + "target": "Name: bookrag\nType: PRODUCT" + }, + { + "src_entity_name": "retrievalaugmented generation", + "tgt_entity_name": "large language models", + "relation_name": "", + "weight": 9.0, + "description": "retrievalaugmented generation is used to boost the performance of large language models", + "source_ids": 
[ + 2 + ], + "source": "Name: large language models\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: retrievalaugmented generation\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrievalaugmented generation", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is a novel approach within the category of retrievalaugmented generation", + "source_ids": [ + 2 + ], + "source": "Name: retrievalaugmented generation\nType: METHOD_OR_TECHNIQUE", + "target": "Name: bookrag\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "retrievalaugmented generation", + "tgt_entity_name": "industry", + "relation_name": "", + "weight": 7.0, + "description": "industry has attracted attention to retrievalaugmented generation", + "source_ids": [ + 2 + ], + "source": "Name: retrievalaugmented generation\nType: METHOD_OR_TECHNIQUE", + "target": "Name: industry\nType: ORGANIZATION" + }, + { + "src_entity_name": "retrievalaugmented generation", + "tgt_entity_name": "academia", + "relation_name": "", + "weight": 7.0, + "description": "academia has attracted attention to retrievalaugmented generation", + "source_ids": [ + 2 + ], + "source": "Name: retrievalaugmented generation\nType: METHOD_OR_TECHNIQUE", + "target": "Name: academia\nType: ORGANIZATION" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "question answering", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is designed to improve performance on the question answering task", + "source_ids": [ + 2 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: question answering\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "books", + "relation_name": "", + "weight": 8.0, + "description": "bookrag is specifically targeted for documents like books that have hierarchical structures", + "source_ids": [ + 2 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: 
books\nType: BOOK" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "bookrag utilizes the bookindex structure to exploit logical hierarchies and trace entity relations", + "source_ids": [ + 2 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: bookindex\nType: SOFTWARE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "information foraging theory", + "relation_name": "", + "weight": 8.0, + "description": "the agent based query method in bookrag is inspired by information foraging theory", + "source_ids": [ + 2 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: information foraging theory\nType: SCIENTIFIC_THEORY" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "three widely adopted benchmarks", + "relation_name": "", + "weight": 9.0, + "description": "bookrag was evaluated and demonstrated state of the art performance on three widely adopted benchmarks", + "source_ids": [ + 2 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: three widely adopted benchmarks\nType: BENCHMARK" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "booklets", + "relation_name": "", + "weight": 8.0, + "description": "bookrag is specifically targeted for documents like booklets that have hierarchical structures", + "source_ids": [ + 2 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: booklets\nType: BOOK" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "handbooks", + "relation_name": "", + "weight": 8.0, + "description": "bookrag is specifically targeted for documents like handbooks that have hierarchical structures", + "source_ids": [ + 2 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: handbooks\nType: BOOK" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrieval recall", + "relation_name": "", 
+ "weight": 9.0, + "description": "bookrag significantly outperforms baselines in retrieval recall", + "source_ids": [ + 2 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: retrieval recall\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "qa accuracy", + "relation_name": "", + "weight": 9.0, + "description": "bookrag significantly outperforms baselines in qa accuracy", + "source_ids": [ + 2 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: qa accuracy\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "efficiency", + "relation_name": "", + "weight": 7.0, + "description": "bookrag maintains competitive efficiency", + "source_ids": [ + 2 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: efficiency\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "baselines", + "relation_name": "", + "weight": 9.0, + "description": "bookrag significantly outperforms baselines in both retrieval recall and qa accuracy", + "source_ids": [ + 2 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: baselines\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 10.0, + "description": "bookrag constructs the bookindex by integrating other components", + "source_ids": [ + 25 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: bookindex\nType: PRODUCT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "hierarchical tree", + "relation_name": "", + "weight": 9.0, + "description": "bookrag integrates a hierarchical tree of document layout blocks", + "source_ids": [ + 25 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: hierarchical tree\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "bookrag", + 
"tgt_entity_name": "kg", + "relation_name": "", + "weight": 9.0, + "description": "bookrag integrates a kg storing fine grained entity relations", + "source_ids": [ + 25 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: kg\nType: SOFTWARE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "document layout blocks", + "relation_name": "", + "weight": 8.0, + "description": "bookrag integrates document layout blocks via a hierarchical tree", + "source_ids": [ + 25 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: document layout blocks\nType: MATERIAL" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "entity relations", + "relation_name": "", + "weight": 8.0, + "description": "bookrag utilizes a kg that stores entity relations", + "source_ids": [ + 25 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: entity relations\nType: CONCEPT" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to BookRAG", + "source_ids": [ + 159 + ], + "source": "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "books", + "relation_name": "", + "weight": 8.0, + "description": "bookindex is built by extracting a hierarchical tree from documents such as books", + "source_ids": [ + 2 + ], + "source": "Name: bookindex\nType: SOFTWARE", + "target": "Name: books\nType: BOOK" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "tree", + "relation_name": "", + "weight": 9.0, + "description": "bookindex is built by extracting a hierarchical tree from the document", + "source_ids": [ + 2 + ], + "source": "Name: bookindex\nType: SOFTWARE", + "target": "Name: tree\nType: SOFTWARE" + }, + { + "src_entity_name": "bookindex", + 
"tgt_entity_name": "graph", + "relation_name": "", + "weight": 8.0, + "description": "bookindex uses a graph to capture the intricate relationships between entities", + "source_ids": [ + 2 + ], + "source": "Name: bookindex\nType: SOFTWARE", + "target": "Name: graph\nType: SOFTWARE" + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "selector", + "relation_name": "", + "weight": 9.0, + "description": "the retrieval process using selector is grounded in information foraging theory", + "source_ids": [ + 22 + ], + "source": "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "target": "Name: selector\nType: SOFTWARE" + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 9.0, + "description": "the retrieval process using reasoner is grounded in information foraging theory", + "source_ids": [ + 22 + ], + "source": "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "target": "Name: reasoner\nType: SOFTWARE" + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "retrieval workflows", + "relation_name": "", + "weight": 8.0, + "description": "the retrieval process mimics foraging as described by information foraging theory", + "source_ids": [ + 22 + ], + "source": "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "target": "Name: retrieval workflows\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "agent based retrieval", + "relation_name": "", + "weight": 10.0, + "description": "information foraging theory serves as the inspiration for the agent based retrieval approach", + "source_ids": [ + 26 + ], + "source": "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "target": "Name: agent based retrieval\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "complex document qa", + "tgt_entity_name": "information foraging theory", + 
"relation_name": "", + "weight": 8.0, + "description": "the text states that the research problem of complex document qa is formalized alongside the introduction of information foraging theory", + "source_ids": [ + 35 + ], + "source": "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "target": "Name: complex document qa\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "ift", + "relation_name": "", + "weight": 10.0, + "description": "ift is the abbreviation used for information foraging theory in the text", + "source_ids": [ + 35 + ], + "source": "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "target": "Name: ift\nType: SCIENTIFIC_THEORY" + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "3.2 information foraging theory", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Information Foraging Theory' is the primary subject matter detailed in section 3.2.", + "source_ids": [ + 41 + ], + "source": "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "target": "Name: 3.2 information foraging theory\nType: SECTION_TITLE" + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "animal foraging", + "relation_name": "", + "weight": 10.0, + "description": "information foraging theory uses animal foraging as an analogy to explain information access", + "source_ids": [ + 42 + ], + "source": "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "target": "Name: animal foraging\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "information scent", + "relation_name": "", + "weight": 9.0, + "description": "information foraging theory suggests that users follow information scent cues to navigate content", + "source_ids": [ + 42 + ], + "source": "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "target": "Name: information 
scent\nType: CONCEPT" + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "information patches", + "relation_name": "", + "weight": 9.0, + "description": "information foraging theory describes information patches as clusters of content that users navigate between", + "source_ids": [ + 42 + ], + "source": "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "target": "Name: information patches\nType: CONCEPT" + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "reference 42", + "relation_name": "", + "weight": 8.0, + "description": "information foraging theory is cited with reference number 42 in the text", + "source_ids": [ + 42 + ], + "source": "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "target": "Name: reference 42\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "5 agent-based retrieval", + "relation_name": "", + "weight": 9.5, + "description": "'Information Foraging Theory' serves as the foundational inspiration for the methods discussed in section 5.", + "source_ids": [ + 78 + ], + "source": "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "target": "Name: 5 agent-based retrieval\nType: SECTION_TITLE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "information foraging theory", + "relation_name": "", + "weight": 10.0, + "description": "bookrag embodies the cognitive principles of information foraging theory during its execution phase", + "source_ids": [ + 124 + ], + "source": "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "target": "Name: bookrag\nType: SOFTWARE" + }, + { + "src_entity_name": "large language models", + "tgt_entity_name": "question answering", + "relation_name": "", + "weight": 9.0, + "description": "large language models have revolutionized the question answering system", + "source_ids": [ + 9 + ], + "source": "Name: question answering\nType: 
TASK_OR_PROBLEM", + "target": "Name: large language models\nType: TECHNOLOGY" + }, + { + "src_entity_name": "industry", + "tgt_entity_name": "question answering", + "relation_name": "", + "weight": 7.0, + "description": "the industry is building question answering systems to assist users and reduce manual effort", + "source_ids": [ + 9 + ], + "source": "Name: question answering\nType: TASK_OR_PROBLEM", + "target": "Name: industry\nType: ORGANIZATION" + }, + { + "src_entity_name": "question answering", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 10.0, + "description": "question answering aims to answer queries based on documents", + "source_ids": [ + 37 + ], + "source": "Name: question answering\nType: TASK_OR_PROBLEM", + "target": "Name: document\nType: PRODUCT" + }, + { + "src_entity_name": "question answering", + "tgt_entity_name": "user query", + "relation_name": "", + "weight": 10.0, + "description": "question answering processes user queries to generate answers", + "source_ids": [ + 37 + ], + "source": "Name: question answering\nType: TASK_OR_PROBLEM", + "target": "Name: user query\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "question answering", + "tgt_entity_name": "answer", + "relation_name": "", + "weight": 10.0, + "description": "the goal of question answering is to generate an accurate answer", + "source_ids": [ + 37 + ], + "source": "Name: question answering\nType: TASK_OR_PROBLEM", + "target": "Name: answer\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "question answering", + "tgt_entity_name": "references 5 11 33", + "relation_name": "", + "weight": 8.0, + "description": "the problem of question answering is associated with references 5 11 and 33", + "source_ids": [ + 37 + ], + "source": "Name: question answering\nType: TASK_OR_PROBLEM", + "target": "Name: references 5 11 33\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "survey on question answering over visually rich documents methods challenges and 
trends", + "tgt_entity_name": "question answering", + "relation_name": "", + "weight": 9.0, + "description": "the survey focuses on the task of question answering", + "source_ids": [ + 195 + ], + "source": "Name: question answering\nType: TASK_OR_PROBLEM", + "target": "Name: survey on question answering over visually rich documents methods challenges and trends\nType: BOOK" + }, + { + "src_entity_name": "question answering", + "tgt_entity_name": "visually rich documents", + "relation_name": "", + "weight": 8.0, + "description": "question answering is performed over visually rich documents in the context of the survey", + "source_ids": [ + 195 + ], + "source": "Name: question answering\nType: TASK_OR_PROBLEM", + "target": "Name: visually rich documents\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "g retriever", + "tgt_entity_name": "question answering", + "relation_name": "", + "weight": 10.0, + "description": "g retriever is designed to solve the problem of question answering", + "source_ids": [ + 211 + ], + "source": "Name: question answering\nType: TASK_OR_PROBLEM", + "target": "Name: g retriever\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "industry", + "tgt_entity_name": "large language models", + "relation_name": "", + "weight": 8.0, + "description": "the industry is adopting large language models to build question answering systems", + "source_ids": [ + 9 + ], + "source": "Name: industry\nType: ORGANIZATION", + "target": "Name: large language models\nType: TECHNOLOGY" + }, + { + "src_entity_name": "industry", + "tgt_entity_name": "qa system", + "relation_name": "", + "weight": 9.0, + "description": "the industry builds qa systems to assist users and reduce manual effort", + "source_ids": [ + 9 + ], + "source": "Name: industry\nType: ORGANIZATION", + "target": "Name: qa system\nType: PRODUCT" + }, + { + "src_entity_name": "tree", + "tgt_entity_name": "table of contents", + "relation_name": "", + "weight": 8.0, + "description": "the 
hierarchical tree serves as the role of the table of contents", + "source_ids": [ + 2 + ], + "source": "Name: tree\nType: SOFTWARE", + "target": "Name: table of contents\nType: SOFTWARE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 10.0, + "description": "bookrag s retrieval recall is the specific metric measured to demonstrate its performance", + "source_ids": [ + 157 + ], + "source": "Name: retrieval recall\nType: EVALUATION_METRIC", + "target": "Name: bookrag\nType: PRODUCT" + }, + { + "src_entity_name": "kg", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 7.0, + "description": "the high quality kg is a feature that contributes to the performance in retrieval recall", + "source_ids": [ + 23 + ], + "source": "Name: retrieval recall\nType: EVALUATION_METRIC", + "target": "Name: kg\nType: PRODUCT" + }, + { + "src_entity_name": "three widely adopted datasets", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 8.0, + "description": "the three widely adopted datasets are used to measure the retrieval recall performance of the system", + "source_ids": [ + 23 + ], + "source": "Name: retrieval recall\nType: EVALUATION_METRIC", + "target": "Name: three widely adopted datasets\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "state of the art baselines", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 7.0, + "description": "state of the art baselines are evaluated on retrieval recall to compare against bookrag", + "source_ids": [ + 23 + ], + "source": "Name: retrieval recall\nType: EVALUATION_METRIC", + "target": "Name: state of the art baselines\nType: PRODUCT" + }, + { + "src_entity_name": "pdf parsing", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 10.0, + "description": "retrieval recall is the specific metric used to evaluate the pdf parsing method", + "source_ids": [ + 144 + ], 
+ "source": "Name: retrieval recall\nType: EVALUATION_METRIC", + "target": "Name: pdf parsing\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "ground truth", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 9.0, + "description": "retrieval recall is measured against the ground truth", + "source_ids": [ + 144 + ], + "source": "Name: retrieval recall\nType: EVALUATION_METRIC", + "target": "Name: ground truth\nType: CONCEPT" + }, + { + "src_entity_name": "query", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 9.0, + "description": "retrieval recall is recorded for a specific query when a pdf parsing error occurs", + "source_ids": [ + 144 + ], + "source": "Name: retrieval recall\nType: EVALUATION_METRIC", + "target": "Name: query\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "table 6", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 10.0, + "description": "table 6 displays the comparison results for the retrieval recall metric", + "source_ids": [ + 155 + ], + "source": "Name: retrieval recall\nType: EVALUATION_METRIC", + "target": "Name: table 6\nType: TABLE" + }, + { + "src_entity_name": "retrieval recall", + "tgt_entity_name": "layout based methods", + "relation_name": "", + "weight": 9.0, + "description": "retrieval recall is the specific metric used to evaluate the layout based methods", + "source_ids": [ + 155 + ], + "source": "Name: retrieval recall\nType: EVALUATION_METRIC", + "target": "Name: layout based methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "qa accuracy", + "relation_name": "", + "weight": 10.0, + "description": "bookrag achieves superior performance in qa accuracy as demonstrated by experimental results", + "source_ids": [ + 23 + ], + "source": "Name: qa accuracy\nType: EVALUATION_METRIC", + "target": "Name: bookrag\nType: PRODUCT" + }, + { + "src_entity_name": "agent based retrieval mechanism", + 
"tgt_entity_name": "qa accuracy", + "relation_name": "", + "weight": 7.0, + "description": "the agent based retrieval mechanism is a feature that contributes to the performance in qa accuracy", + "source_ids": [ + 23 + ], + "source": "Name: qa accuracy\nType: EVALUATION_METRIC", + "target": "Name: agent based retrieval mechanism\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "three widely adopted datasets", + "tgt_entity_name": "qa accuracy", + "relation_name": "", + "weight": 8.0, + "description": "the three widely adopted datasets are used to measure the qa accuracy performance of the system", + "source_ids": [ + 23 + ], + "source": "Name: qa accuracy\nType: EVALUATION_METRIC", + "target": "Name: three widely adopted datasets\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "state of the art baselines", + "tgt_entity_name": "qa accuracy", + "relation_name": "", + "weight": 7.0, + "description": "state of the art baselines are evaluated on qa accuracy to compare against bookrag", + "source_ids": [ + 23 + ], + "source": "Name: qa accuracy\nType: EVALUATION_METRIC", + "target": "Name: state of the art baselines\nType: PRODUCT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "efficiency", + "relation_name": "", + "weight": 8.0, + "description": "the efficiency of bookrag is evaluated and compared in the experiments", + "source_ids": [ + 137 + ], + "source": "Name: efficiency\nType: EVALUATION_METRIC", + "target": "Name: bookrag\nType: PRODUCT" + }, + { + "src_entity_name": "baseline methods", + "tgt_entity_name": "efficiency", + "relation_name": "", + "weight": 8.0, + "description": "the efficiency of baseline methods is evaluated and compared in the experiments", + "source_ids": [ + 137 + ], + "source": "Name: efficiency\nType: EVALUATION_METRIC", + "target": "Name: baseline methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "document qa tasks", + "tgt_entity_name": "efficiency", + "relation_name": "", + "weight": 7.0, 
+ "description": "efficiency is measured specifically on document qa tasks", + "source_ids": [ + 137 + ], + "source": "Name: efficiency\nType: EVALUATION_METRIC", + "target": "Name: document qa tasks\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "pvldb", + "tgt_entity_name": "reference format", + "relation_name": "", + "weight": 9.0, + "description": "pvldb is associated with a specific reference format mentioned in the text", + "source_ids": [ + 4 + ], + "source": "Name: pvldb\nType: PUBLICATION_VENUE", + "target": "Name: reference format\nType: SECTION_TITLE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "pvldb", + "relation_name": "", + "weight": 10.0, + "description": "bookrag was published in the pvldb journal", + "source_ids": [ + 5 + ], + "source": "Name: pvldb\nType: PUBLICATION_VENUE", + "target": "Name: bookrag\nType: PRODUCT" + }, + { + "src_entity_name": "pvldb", + "tgt_entity_name": "2025", + "relation_name": "", + "weight": 9.0, + "description": "pvldb published the bookrag paper in the year 2025", + "source_ids": [ + 5 + ], + "source": "Name: pvldb\nType: PUBLICATION_VENUE", + "target": "Name: 2025\nType: DATE" + }, + { + "src_entity_name": "pvldb", + "tgt_entity_name": "19", + "relation_name": "", + "weight": 8.0, + "description": "pvldb volume 19 contains the paper", + "source_ids": [ + 5 + ], + "source": "Name: pvldb\nType: PUBLICATION_VENUE", + "target": "Name: 19\nType: MEASUREMENT" + }, + { + "src_entity_name": "pvldb", + "tgt_entity_name": "1", + "relation_name": "", + "weight": 8.0, + "description": "pvldb issue 1 contains the paper", + "source_ids": [ + 5 + ], + "source": "Name: pvldb\nType: PUBLICATION_VENUE", + "target": "Name: 1\nType: MEASUREMENT" + }, + { + "src_entity_name": "pvldb", + "tgt_entity_name": "xxx xxx", + "relation_name": "", + "weight": 8.0, + "description": "the paper appears on pages xxx xxx in pvldb", + "source_ids": [ + 5 + ], + "source": "Name: pvldb\nType: PUBLICATION_VENUE", + "target": 
"Name: xxx xxx\nType: MEASUREMENT" + }, + { + "src_entity_name": "pvldb", + "tgt_entity_name": "xx xx xxx xx", + "relation_name": "", + "weight": 8.0, + "description": "the paper in pvldb has the doi xx xx xxx xx", + "source_ids": [ + 5 + ], + "source": "Name: pvldb\nType: PUBLICATION_VENUE", + "target": "Name: xx xx xxx xx\nType: MEASUREMENT" + }, + { + "src_entity_name": "pvldb", + "tgt_entity_name": "artifact availability", + "relation_name": "", + "weight": 8.0, + "description": "pvldb is the venue where the topic of artifact availability is addressed", + "source_ids": [ + 6 + ], + "source": "Name: pvldb\nType: PUBLICATION_VENUE", + "target": "Name: artifact availability\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is an approach for retrieval augmented generation", + "source_ids": [ + 5 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: retrieval augmented generation\nType: TECHNOLOGY" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "hierarchical structure aware index based approach", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is defined as a hierarchical structure aware index based approach", + "source_ids": [ + 5 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: hierarchical structure aware index based approach\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "complex documents", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is designed for processing complex documents", + "source_ids": [ + 5 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: complex documents\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "figure 1", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "figure 1 displays a comparison involving bookrag", + 
"source_ids": [ + 12 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: figure 1\nType: IMAGE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "complex document qa", + "relation_name": "", + "weight": 8.0, + "description": "bookrag is a solution or method applied to the task of complex document qa", + "source_ids": [ + 12 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: complex document qa\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "table 1", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "table 1 contains the comparison data for bookrag", + "source_ids": [ + 16 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: table 1\nType: TABLE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "representative methods", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is being compared to representative methods in the text", + "source_ids": [ + 16 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: representative methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "kg", + "relation_name": "", + "weight": 9.0, + "description": "bookrag utilizes a high quality kg as a key feature contributing to its performance", + "source_ids": [ + 23 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: kg\nType: PRODUCT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "agent based retrieval mechanism", + "relation_name": "", + "weight": 9.0, + "description": "bookrag employs an agent based retrieval mechanism as a key feature contributing to its performance", + "source_ids": [ + 23 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: agent based retrieval mechanism\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "three widely adopted datasets", + "relation_name": "", + "weight": 10.0, + "description": 
"bookrag is extensively experimented upon using three widely adopted datasets to validate its effectiveness", + "source_ids": [ + 23 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: three widely adopted datasets\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "state of the art baselines", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is being compared against state of the art baselines to analyze its performance", + "source_ids": [ + 151 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: state of the art baselines\nType: PRODUCT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "existing baselines", + "relation_name": "", + "weight": 9.0, + "description": "bookrag demonstrates significant superiority over existing baselines", + "source_ids": [ + 188 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: existing baselines\nType: PRODUCT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "complex document qa tasks", + "relation_name": "", + "weight": 10.0, + "description": "bookrag attains state of the art performance in solving complex document qa tasks", + "source_ids": [ + 27 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: complex document qa tasks\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "extensive experiments", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "extensive experiments were performed on bookrag to demonstrate its capabilities", + "source_ids": [ + 27 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: extensive experiments\nType: EVENT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "state of the art performance", + "relation_name": "", + "weight": 10.0, + "description": "bookrag attained state of the art performance as a result of the experiments", + "source_ids": [ + 27 + ], + "source": "Name: bookrag\nType: 
PRODUCT", + "target": "Name: state of the art performance\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "competitive efficiency", + "relation_name": "", + "weight": 8.0, + "description": "bookrag maintained competitive efficiency while solving tasks", + "source_ids": [ + 27 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: competitive efficiency\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "section 5", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "section 5 elaborates on the execution of bookrag", + "source_ids": [ + 29 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: section 5\nType: SECTION_TITLE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "structured execution", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is the system undergoing structured execution described in section 5", + "source_ids": [ + 29 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: structured execution\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "query classification", + "relation_name": "", + "weight": 8.0, + "description": "bookrag utilizes query classification in its execution", + "source_ids": [ + 29 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: query classification\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "operators", + "relation_name": "", + "weight": 8.0, + "description": "bookrag uses operators in its structured execution", + "source_ids": [ + 29 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: operators\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is designed to intelligently navigate the bookindex", + "source_ids": [ + 88 + ], + "source": "Name: 
bookrag\nType: PRODUCT", + "target": "Name: bookindex\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "formulator", + "relation_name": "", + "weight": 9.0, + "description": "bookrag defines the formulator as one of its four types of operators", + "source_ids": [ + 88 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: formulator\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "selector", + "relation_name": "", + "weight": 9.0, + "description": "bookrag defines the selector as one of its four types of operators", + "source_ids": [ + 88 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: selector\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 9.0, + "description": "bookrag defines the reasoner as one of its four types of operators", + "source_ids": [ + 88 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: reasoner\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "synthesizer", + "relation_name": "", + "weight": 9.0, + "description": "bookrag defines the synthesizer as one of its four types of operators", + "source_ids": [ + 88 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: synthesizer\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "agent", + "relation_name": "", + "weight": 8.0, + "description": "the agent performs the first step of the process within bookrag", + "source_ids": [ + 88 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: agent\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "query categories", + "relation_name": "", + "weight": 9.0, + "description": "bookrag dynamically configures operators to adapt to the specific requirements of different query categories", + 
"source_ids": [ + 88 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: query categories\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "table 2", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "table 2 details query categories that are addressed within the bookrag system", + "source_ids": [ + 89 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: table 2\nType: TABLE" + }, + { + "src_entity_name": "table 3", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "table 3 details the operators used within the bookrag system", + "source_ids": [ + 131 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: table 3\nType: TABLE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "document qa tasks", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is evaluated for its efficiency and accuracy specifically on document qa tasks", + "source_ids": [ + 137 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: document qa tasks\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "baseline methods", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is compared against several strong baseline methods in the experiments", + "source_ids": [ + 137 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: baseline methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "accuracy", + "relation_name": "", + "weight": 8.0, + "description": "the accuracy of bookrag is evaluated and compared in the experiments", + "source_ids": [ + 137 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: accuracy\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "qwen family", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is powered by models from the qwen 
family", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: qwen family\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "mineru", + "relation_name": "", + "weight": 9.0, + "description": "bookrag utilizes mineru for robust document layout parsing", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: mineru\nType: SOFTWARE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "github com sam234990 bookrag", + "relation_name": "", + "weight": 10.0, + "description": "the source code and configurations for bookrag are available at the specified github location", + "source_ids": [ + 149 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: github com sam234990 bookrag\nType: LOCATION" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "gradient g", + "relation_name": "", + "weight": 8.0, + "description": "bookrag s implementation sets the threshold of gradient g as 0 6", + "source_ids": [ + 149 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: gradient g\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "baseline methods", + "relation_name": "", + "weight": 9.0, + "description": "bookrag and baseline methods are compared fairly using the same backbone models", + "source_ids": [ + 149 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: baseline methods\nType: UNKNOWN" + }, + { + "src_entity_name": "qa", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "qa is the task performed by the different variants of bookrag being compared", + "source_ids": [ + 170 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: qa\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrieval", + "relation_name": "", + "weight": 9.0, + "description": "bookrag ensures precise 
evidence retrieval by overcoming limitations of existing baselines", + "source_ids": [ + 152 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: retrieval\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "query efficiency", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is being analyzed for its query efficiency", + "source_ids": [ + 151 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: query efficiency\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "evaluation", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "the evaluation is the process being conducted on bookrag", + "source_ids": [ + 151 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: evaluation\nType: EVENT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "table 5", + "relation_name": "", + "weight": 9.0, + "description": "bookrag s qa performance is presented and compared in table 5", + "source_ids": [ + 152 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: table 5\nType: TABLE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "m3docvqa", + "relation_name": "", + "weight": 10.0, + "description": "bookrag achieves a 71 2 recall on the m3docvqa dataset", + "source_ids": [ + 157 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: m3docvqa\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "18 0", + "relation_name": "", + "weight": 10.0, + "description": "bookrag outperforms the top baseline by 18 0 on the m3docvqa dataset", + "source_ids": [ + 152 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: 18 0\nType: PERCENTAGE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "tree graph bookindex", + "relation_name": "", + "weight": 10.0, + "description": "bookrag s superiority stems from the synergy of its unified tree graph 
bookindex", + "source_ids": [ + 152 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: tree graph bookindex\nType: PRODUCT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "agent based planning", + "relation_name": "", + "weight": 10.0, + "description": "bookrag s superiority stems from the synergy of its agent based planning", + "source_ids": [ + 152 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: agent based planning\nType: PRODUCT" + }, + { + "src_entity_name": "qa performance", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "qa performance is the metric used to evaluate bookrag s capabilities", + "source_ids": [ + 179 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: qa performance\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "exact match", + "relation_name": "", + "weight": 9.0, + "description": "bookrag s performance is measured using the exact match metric on m3docvqa", + "source_ids": [ + 152 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: exact match\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "generation", + "relation_name": "", + "weight": 9.0, + "description": "bookrag ensures accurate generation by overcoming limitations of existing baselines", + "source_ids": [ + 152 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: generation\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "queries", + "relation_name": "", + "weight": 9.0, + "description": "bookrag effectively classifies queries to configure optimal workflows", + "source_ids": [ + 152 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: queries\nType: CONCEPT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "workflows", + "relation_name": "", + "weight": 9.0, + "description": "bookrag configures optimal 
workflows to improve retrieval and generation", + "source_ids": [ + 152 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: workflows\nType: CONCEPT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "context fragmentation", + "relation_name": "", + "weight": 9.0, + "description": "bookrag overcomes the limitation of context fragmentation found in existing baselines", + "source_ids": [ + 152 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: context fragmentation\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "static query workflow", + "relation_name": "", + "weight": 9.0, + "description": "bookrag overcomes the limitation of static query workflow found in existing baselines", + "source_ids": [ + 152 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: static query workflow\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "top performing baseline", + "relation_name": "", + "weight": 10.0, + "description": "bookrag substantially outperforms the top performing baseline by 18 0", + "source_ids": [ + 152 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: top performing baseline\nType: PRODUCT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "graphranker", + "relation_name": "", + "weight": 9.0, + "description": "bookrag significantly outperforms graphranker in retrieval recall", + "source_ids": [ + 157 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: graphranker\nType: PRODUCT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "ift inspired selector reasoner workflow", + "relation_name": "", + "weight": 10.0, + "description": "the performance advantage of bookrag stems from its ift inspired selector reasoner workflow", + "source_ids": [ + 157 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: ift inspired selector reasoner workflow\nType: 
METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrieval performance", + "relation_name": "", + "weight": 9.0, + "description": "bookrag s retrieval performance is the subject of the validation described in the text", + "source_ids": [ + 157 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: retrieval performance\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "ground truth layout blocks", + "relation_name": "", + "weight": 8.0, + "description": "bookrag is evaluated against ground truth layout blocks to validate its design", + "source_ids": [ + 157 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: ground truth layout blocks\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "layout based baselines", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is evaluated against layout based baselines to demonstrate its superiority", + "source_ids": [ + 157 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: layout based baselines\nType: PRODUCT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "graph based rag methods", + "relation_name": "", + "weight": 9.0, + "description": "bookrag maintains time and token costs comparable to existing graph based rag methods", + "source_ids": [ + 160 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: graph based rag methods\nType: TECHNOLOGY" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "text based rag approaches", + "relation_name": "", + "weight": 7.0, + "description": "bookrag maintains a balanced efficiency among multi modal methods compared to text based approaches which have lower latency", + "source_ids": [ + 160 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: text based rag approaches\nType: TECHNOLOGY" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "vlm", + 
"relation_name": "", + "weight": 8.0, + "description": "bookrag involves vlm processing for images unlike purely text based rag approaches", + "source_ids": [ + 160 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: vlm\nType: TECHNOLOGY" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "docetl", + "relation_name": "", + "weight": 10.0, + "description": "bookrag reduces token consumption by an order of magnitude and achieves a speedup of up to 2x compared to docetl", + "source_ids": [ + 160 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: docetl\nType: PRODUCT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "bookrag requires less than 5 million tokens on the mmlongbench dataset", + "source_ids": [ + 160 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: mmlongbench\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "figure 5", + "relation_name": "", + "weight": 8.0, + "description": "figure 5 illustrates the efficiency evaluation of bookrag", + "source_ids": [ + 160 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: figure 5\nType: IMAGE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "5 million", + "relation_name": "", + "weight": 10.0, + "description": "bookrag requires less than 5 million tokens on the mmlongbench dataset", + "source_ids": [ + 160 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: 5 million\nType: MEASUREMENT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "2", + "relation_name": "", + "weight": 10.0, + "description": "bookrag achieves a speedup of up to 2 compared to docetl", + "source_ids": [ + 160 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: 2\nType: MEASUREMENT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "order of magnitude", + "relation_name": "", + 
"weight": 9.0, + "description": "bookrag reduces token consumption by an order of magnitude compared to docetl", + "source_ids": [ + 160 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: order of magnitude\nType: MEASUREMENT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "ablation study", + "relation_name": "", + "weight": 9.0, + "description": "an ablation study is conducted on bookrag to validate its components", + "source_ids": [ + 163 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: ablation study\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "gradient based er", + "relation_name": "", + "weight": 8.0, + "description": "experiments on bookrag involve analyzing the impact of gradient based er", + "source_ids": [ + 163 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: gradient based er\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "entity resolution method", + "relation_name": "", + "weight": 8.0, + "description": "the effectiveness of the entity resolution method is compared in the context of bookrag", + "source_ids": [ + 163 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: entity resolution method\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "case study", + "relation_name": "", + "weight": 7.0, + "description": "a case study is presented as part of the examination of bookrag", + "source_ids": [ + 163 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: case study\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "query types", + "relation_name": "", + "weight": 8.0, + "description": "experiments on bookrag are conducted across different query types", + "source_ids": [ + 163 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: query types\nType: TASK_OR_PROBLEM" + }, + { + 
"src_entity_name": "bookrag", + "tgt_entity_name": "error analysis", + "relation_name": "", + "weight": 9.0, + "description": "a comprehensive error analysis is performed as part of the examination of bookrag", + "source_ids": [ + 163 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: error analysis\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "ablation study", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "the ablation study is conducted to evaluate the core components of bookrag", + "source_ids": [ + 164 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: ablation study\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "table 7", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 7.0, + "description": "table 7 presents data regarding the performance of the bookrag system variants", + "source_ids": [ + 172 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: table 7\nType: TABLE" + }, + { + "src_entity_name": "kg", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "the kg is a critical component within the bookrag system supporting effective reasoning", + "source_ids": [ + 172 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: kg\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "agent based planning", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 8.0, + "description": "agent based planning is a mechanism assessed for its necessity within the bookrag system", + "source_ids": [ + 172 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: agent based planning\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "ift inspired selection mechanism", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 8.0, + "description": "the ift inspired selection mechanism is a strategy evaluated for its efficiency in the bookrag system", + 
"source_ids": [ + 172 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: ift inspired selection mechanism\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "multi dimensional reasoning", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 8.0, + "description": "multi dimensional reasoning is a strategy validated for its effectiveness in the bookrag system", + "source_ids": [ + 172 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: multi dimensional reasoning\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "dynamic skyline filtering strategy", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 8.0, + "description": "the dynamic skyline filtering strategy is a method validated for its effectiveness in the bookrag system", + "source_ids": [ + 172 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: dynamic skyline filtering strategy\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "planning mechanism", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "the planning mechanism is a component of bookrag whose removal causes significant performance loss", + "source_ids": [ + 172 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: planning mechanism\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "figure 7 displays the performance breakdown of bookrag", + "source_ids": [ + 179 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: figure 7\nType: IMAGE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "single hop", + "relation_name": "", + "weight": 9.0, + "description": "bookrag processes single hop queries reducing the reasoning space significantly", + "source_ids": [ + 186 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: single hop\nType: TASK_OR_PROBLEM" + 
}, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "multihop", + "relation_name": "", + "weight": 9.0, + "description": "bookrag s performance is evaluated against multihop queries which present a greater challenge", + "source_ids": [ + 179 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: multihop\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "global aggregation", + "relation_name": "", + "weight": 9.0, + "description": "bookrag s performance is evaluated against global aggregation queries", + "source_ids": [ + 179 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: global aggregation\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "figure 9", + "relation_name": "", + "weight": 9.0, + "description": "figure 9 illustrates the error propagation traced while diagnosing the performance bottlenecks of bookrag", + "source_ids": [ + 180 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: figure 9\nType: IMAGE" + }, + { + "src_entity_name": "error response analysis", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "error response analysis is performed on bookrag to diagnose its performance bottlenecks", + "source_ids": [ + 180 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: error response analysis\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "figure 8", + "relation_name": "", + "weight": 10.0, + "description": "figure 8 illustrates the answering workflow of bookrag", + "source_ids": [ + 186 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: figure 8\nType: IMAGE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "multi hop", + "relation_name": "", + "weight": 9.0, + "description": "bookrag processes multi hop queries as part of its answering workflow", + "source_ids": [ + 186 + ], + "source": "Name: 
bookrag\nType: PRODUCT", + "target": "Name: multi hop\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "global queries", + "relation_name": "", + "weight": 9.0, + "description": "bookrag processes global queries as part of its answering workflow", + "source_ids": [ + 186 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: global queries\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "select", + "relation_name": "", + "weight": 10.0, + "description": "bookrag leverages the select operator to prune search spaces", + "source_ids": [ + 186 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: select\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "decompose", + "relation_name": "", + "weight": 10.0, + "description": "bookrag leverages the decompose operator to prune search spaces", + "source_ids": [ + 186 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: decompose\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "filter", + "relation_name": "", + "weight": 10.0, + "description": "bookrag leverages the filter operator to prune search spaces", + "source_ids": [ + 186 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: filter\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "book index", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is built upon book index", + "source_ids": [ + 188 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: book index\nType: PRODUCT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "agent based method", + "relation_name": "", + "weight": 9.0, + "description": "bookrag employs an agent based method to configure operators", + "source_ids": [ + 188 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: agent based method\nType: 
METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "benchmarks", + "relation_name": "", + "weight": 9.0, + "description": "bookrag achieves state of the art performance on multiple benchmarks", + "source_ids": [ + 188 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: benchmarks\nType: BENCHMARK" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrieval precision", + "relation_name": "", + "weight": 8.0, + "description": "bookrag demonstrates significant superiority in retrieval precision over existing baselines", + "source_ids": [ + 188 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: retrieval precision\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "answer accuracy", + "relation_name": "", + "weight": 8.0, + "description": "bookrag demonstrates significant superiority in answer accuracy over existing baselines", + "source_ids": [ + 188 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: answer accuracy\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "paper", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is proposed within the paper", + "source_ids": [ + 188 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: paper\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "python", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is implemented in python", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: python\nType: PROGRAMMING_LANGUAGE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "linux", + "relation_name": "", + "weight": 8.0, + "description": "experiments for bookrag were conducted on a linux operating system", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: linux\nType: SOFTWARE" + }, + { + 
"src_entity_name": "bookrag", + "tgt_entity_name": "500 tokens", + "relation_name": "", + "weight": 8.0, + "description": "bookrag standardizes the chunk size at 500 tokens for document chunking", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: 500 tokens\nType: MEASUREMENT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "10", + "relation_name": "", + "weight": 8.0, + "description": "bookrag sets the retrieval top k to 10 for consistent candidate pool sizes", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: 10\nType: MEASUREMENT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "10b parameter scale", + "relation_name": "", + "weight": 9.0, + "description": "bookrag primarily selects models under the 10b parameter scale", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: 10b parameter scale\nType: MEASUREMENT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "30b version", + "relation_name": "", + "weight": 9.0, + "description": "bookrag adopts the 30b version of the vlm due to performance issues with the 8b counterpart", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: 30b version\nType: MEASUREMENT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "8b counterpart", + "relation_name": "", + "weight": 8.0, + "description": "the 8b counterpart of the vlm exhibited significant performance deficits leading to the adoption of the 30b version", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: 8b counterpart\nType: MEASUREMENT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "https github com sam234990 bookrag", + "relation_name": "", + "weight": 10.0, + "description": "the source code and configurations for bookrag are available at the specified github url", + "source_ids": [ + 238 + ], + 
"source": "Name: bookrag\nType: PRODUCT", + "target": "Name: https github com sam234990 bookrag\nType: LOCATION" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "baseline methods", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is compared against baseline methods to ensure a fair comparison", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: baseline methods\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "document chunking", + "relation_name": "", + "weight": 8.0, + "description": "bookrag involves document chunking as part of its processing pipeline", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: document chunking\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrieval ranking", + "relation_name": "", + "weight": 8.0, + "description": "bookrag involves retrieval ranking as part of its processing pipeline", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: retrieval ranking\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "sequential processing mode", + "relation_name": "", + "weight": 9.0, + "description": "bookrag methods were executed in sequential processing mode to ensure fair efficiency comparison", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: sequential processing mode\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "candidate pool", + "relation_name": "", + "weight": 8.0, + "description": "bookrag standardizes the candidate pool size across baselines", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: candidate pool\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "efficiency", + "relation_name": "", + "weight": 
9.0, + "description": "bookrag balances efficiency and effectiveness in model selection", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: efficiency\nType: CONCEPT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "effectiveness", + "relation_name": "", + "weight": 9.0, + "description": "bookrag balances efficiency and effectiveness in model selection", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: effectiveness\nType: CONCEPT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "reproducibility", + "relation_name": "", + "weight": 9.0, + "description": "bookrag aims for reproducibility by making code and configs public", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: reproducibility\nType: CONCEPT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "fair comparison", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is designed to enable a fair comparison with other methods", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: fair comparison\nType: CONCEPT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "source code", + "relation_name": "", + "weight": 10.0, + "description": "the source code for bookrag is available at the repository", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: source code\nType: PRODUCT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "implementation configurations", + "relation_name": "", + "weight": 10.0, + "description": "the implementation configurations for bookrag are available at the repository", + "source_ids": [ + 238 + ], + "source": "Name: bookrag\nType: PRODUCT", + "target": "Name: implementation configurations\nType: PRODUCT" + }, + { + "src_entity_name": "qwen2 5 vl technical report", + "tgt_entity_name": "2025", + "relation_name": 
"", + "weight": 9.0, + "description": "the qwen2 5 vl technical report was published in the year 2025", + "source_ids": [ + 194 + ], + "source": "Name: 2025\nType: DATE", + "target": "Name: qwen2 5 vl technical report\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "tgt_entity_name": "2025", + "relation_name": "", + "weight": 9.0, + "description": "the survey paper was published in the year 2025", + "source_ids": [ + 195 + ], + "source": "Name: 2025\nType: DATE", + "target": "Name: survey on question answering over visually rich documents methods challenges and trends\nType: BOOK" + }, + { + "src_entity_name": "lego graphrag", + "tgt_entity_name": "2025", + "relation_name": "", + "weight": 9.0, + "description": "lego graphrag was published in the year 2025", + "source_ids": [ + 196 + ], + "source": "Name: 2025\nType: DATE", + "target": "Name: lego graphrag\nType: PRODUCT" + }, + { + "src_entity_name": "proc vldb endow", + "tgt_entity_name": "2025", + "relation_name": "", + "weight": 9.0, + "description": "proc vldb endow published the paper in 2025", + "source_ids": [ + 196 + ], + "source": "Name: 2025\nType: DATE", + "target": "Name: proc vldb endow\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "gemini 2 5", + "tgt_entity_name": "2025", + "relation_name": "", + "weight": 8.0, + "description": "gemini 2 5 is the subject of a paper published in 2025", + "source_ids": [ + 203 + ], + "source": "Name: 2025\nType: DATE", + "target": "Name: gemini 2 5\nType: PRODUCT" + }, + { + "src_entity_name": "l", + "tgt_entity_name": "1", + "relation_name": "", + "weight": 8.0, + "description": "the parameter l uses 1 to represent the root level", + "source_ids": [ + 57 + ], + "source": "Name: 1\nType: MEASUREMENT", + "target": "Name: l\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "large language models", 
+ "relation_name": "", + "weight": 10.0, + "description": "retrieval augmented generation is applied to large language models as described in the survey", + "source_ids": [ + 207 + ], + "source": "Name: retrieval augmented generation\nType: TECHNOLOGY", + "target": "Name: large language models\nType: TECHNOLOGY" + }, + { + "src_entity_name": "retrieval augmented generation for large language models a survey", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 9.0, + "description": "the survey is about the technology of retrieval augmented generation", + "source_ids": [ + 207 + ], + "source": "Name: retrieval augmented generation\nType: TECHNOLOGY", + "target": "Name: retrieval augmented generation for large language models a survey\nType: BOOK" + }, + { + "src_entity_name": "lightrag", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 10.0, + "description": "lightrag is a type of retrieval augmented generation system", + "source_ids": [ + 208 + ], + "source": "Name: retrieval augmented generation\nType: TECHNOLOGY", + "target": "Name: lightrag\nType: PRODUCT" + }, + { + "src_entity_name": "zirui guo", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 7.0, + "description": "zirui guo is an author of a paper about retrieval augmented generation", + "source_ids": [ + 208 + ], + "source": "Name: retrieval augmented generation\nType: TECHNOLOGY", + "target": "Name: zirui guo\nType: PERSON" + }, + { + "src_entity_name": "lianghao xia", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 7.0, + "description": "lianghao xia is an author of a paper about retrieval augmented generation", + "source_ids": [ + 208 + ], + "source": "Name: retrieval augmented generation\nType: TECHNOLOGY", + "target": "Name: lianghao xia\nType: PERSON" + }, + { + "src_entity_name": "yanhua yu", + "tgt_entity_name": "retrieval augmented 
generation", + "relation_name": "", + "weight": 7.0, + "description": "yanhua yu is an author of a paper about retrieval augmented generation", + "source_ids": [ + 208 + ], + "source": "Name: retrieval augmented generation\nType: TECHNOLOGY", + "target": "Name: yanhua yu\nType: PERSON" + }, + { + "src_entity_name": "tu ao", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 7.0, + "description": "tu ao is an author of a paper about retrieval augmented generation", + "source_ids": [ + 208 + ], + "source": "Name: retrieval augmented generation\nType: TECHNOLOGY", + "target": "Name: tu ao\nType: PERSON" + }, + { + "src_entity_name": "chao huang", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 7.0, + "description": "chao huang is an author of a paper about retrieval augmented generation", + "source_ids": [ + 208 + ], + "source": "Name: retrieval augmented generation\nType: TECHNOLOGY", + "target": "Name: chao huang\nType: PERSON" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "github", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is hosted on the github platform as indicated by the provided url", + "source_ids": [ + 7 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: github\nType: ORGANIZATION" + }, + { + "src_entity_name": "sam234990", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "sam234990 is the creator or owner of the bookrag repository", + "source_ids": [ + 7 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: sam234990\nType: PERSON" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "source code", + "relation_name": "", + "weight": 10.0, + "description": "bookrag contains the source code that has been made available", + "source_ids": [ + 7 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: source code\nType: PRODUCT" + }, + { + 
"src_entity_name": "bookrag", + "tgt_entity_name": "data", + "relation_name": "", + "weight": 10.0, + "description": "bookrag contains the data that has been made available", + "source_ids": [ + 7 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: data\nType: PRODUCT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "artifacts", + "relation_name": "", + "weight": 10.0, + "description": "bookrag includes other artifacts that have been made available", + "source_ids": [ + 7 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: artifacts\nType: PRODUCT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "bookrag executes operations on the bookindex to handle document queries", + "source_ids": [ + 79 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: bookindex\nType: DATABASE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "agent based planning", + "relation_name": "", + "weight": 9.0, + "description": "bookrag utilizes agent based planning as one of its two core mechanisms to formulate strategies", + "source_ids": [ + 79 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: agent based planning\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "structured execution", + "relation_name": "", + "weight": 9.0, + "description": "bookrag utilizes structured execution as one of its two core mechanisms to handle retrieval and generation", + "source_ids": [ + 79 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: structured execution\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "agent based planning", + "relation_name": "", + "weight": 10.0, + "description": "bookrag performs the agent based planning stage as its first step", + "source_ids": [ + 82 + ], + "source": "Name: bookrag\nType: SOFTWARE", + 
"target": "Name: agent based planning\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "figure 3", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "figure 3 depicts the workflow of bookrag", + "source_ids": [ + 83 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: figure 3\nType: IMAGE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "agent based retrieval", + "relation_name": "", + "weight": 10.0, + "description": "bookrag contains the agent based retrieval workflow", + "source_ids": [ + 83 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: agent based retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "workflow", + "relation_name": "", + "weight": 10.0, + "description": "bookrag executes the generated workflow p", + "source_ids": [ + 124 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: workflow\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "bookrag gets the retrieval set from the bookindex", + "source_ids": [ + 85 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: bookindex\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "information blocks", + "relation_name": "", + "weight": 10.0, + "description": "bookrag obtains the retrieval set of highly relevant information blocks", + "source_ids": [ + 85 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: information blocks\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "query classification", + "relation_name": "", + "weight": 7.0, + "description": "bookrag is designed to resolve a broader range of query types including those defined by the classification", + "source_ids": [ + 96 + ], + "source": "Name: bookrag\nType: SOFTWARE", + 
"target": "Name: query classification\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "additional operators", + "relation_name": "", + "weight": 8.0, + "description": "bookrag resolves broader query types by integrating additional operators", + "source_ids": [ + 96 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: additional operators\nType: SOFTWARE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "selector", + "relation_name": "", + "weight": 9.0, + "description": "bookrag utilizes the selector operator to navigate to information patches", + "source_ids": [ + 124 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: selector\nType: SOFTWARE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 9.0, + "description": "bookrag utilizes the reasoner operator to perform sensemaking within information patches", + "source_ids": [ + 124 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: reasoner\nType: SOFTWARE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "synthesizer", + "relation_name": "", + "weight": 9.0, + "description": "bookrag utilizes the synthesizer operator to generate the final answer", + "source_ids": [ + 124 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: synthesizer\nType: SOFTWARE" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "p", + "relation_name": "", + "weight": 10.0, + "description": "p is the specific workflow executed by bookrag", + "source_ids": [ + 124 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: p\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "abstract textual queries", + "relation_name": "", + "weight": 9.0, + "description": "bookrag translates abstract textual queries into concrete operations", + "source_ids": [ + 124 + ], + "source": "Name: bookrag\nType: SOFTWARE", + 
"target": "Name: abstract textual queries\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "concrete operations", + "relation_name": "", + "weight": 9.0, + "description": "bookrag produces concrete operations from abstract queries", + "source_ids": [ + 124 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: concrete operations\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "cost of attention", + "relation_name": "", + "weight": 9.0, + "description": "bookrag s design minimizes the cost of attention", + "source_ids": [ + 124 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: cost of attention\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "computational resources", + "relation_name": "", + "weight": 9.0, + "description": "bookrag ensures computational resources are focused on high value data", + "source_ids": [ + 124 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: computational resources\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "high value data patches", + "relation_name": "", + "weight": 9.0, + "description": "bookrag focuses computational resources solely on high value data patches", + "source_ids": [ + 124 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: high value data patches\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "figure 8", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "figure 8 highlights content generated by bookrag", + "source_ids": [ + 181 + ], + "source": "Name: bookrag\nType: SOFTWARE", + "target": "Name: figure 8\nType: IMAGE" + }, + { + "src_entity_name": "cyan text", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "cyan text highlights the content generated by bookrag", + "source_ids": [ + 181 + ], + "source": "Name: 
bookrag\nType: SOFTWARE", + "target": "Name: cyan text\nType: COLOR" + }, + { + "src_entity_name": "source code", + "tgt_entity_name": "github", + "relation_name": "", + "weight": 9.0, + "description": "the source code is hosted on github", + "source_ids": [ + 7 + ], + "source": "Name: github\nType: ORGANIZATION", + "target": "Name: source code\nType: PRODUCT" + }, + { + "src_entity_name": "data", + "tgt_entity_name": "github", + "relation_name": "", + "weight": 9.0, + "description": "the data is hosted on github", + "source_ids": [ + 7 + ], + "source": "Name: github\nType: ORGANIZATION", + "target": "Name: data\nType: PRODUCT" + }, + { + "src_entity_name": "artifacts", + "tgt_entity_name": "github", + "relation_name": "", + "weight": 9.0, + "description": "the artifacts are hosted on github", + "source_ids": [ + 7 + ], + "source": "Name: github\nType: ORGANIZATION", + "target": "Name: artifacts\nType: PRODUCT" + }, + { + "src_entity_name": "sam234990", + "tgt_entity_name": "https github com sam234990 bookrag", + "relation_name": "", + "weight": 10.0, + "description": "sam234990 is the owner of the github repository url", + "source_ids": [ + 238 + ], + "source": "Name: sam234990\nType: PERSON", + "target": "Name: https github com sam234990 bookrag\nType: LOCATION" + }, + { + "src_entity_name": "large language models", + "tgt_entity_name": "qwen 3", + "relation_name": "", + "weight": 10.0, + "description": "qwen 3 is identified as an example of a large language model", + "source_ids": [ + 9 + ], + "source": "Name: large language models\nType: TECHNOLOGY", + "target": "Name: qwen 3\nType: PRODUCT" + }, + { + "src_entity_name": "large language models", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 10.0, + "description": "gemini 2 5 is identified as an example of a large language model", + "source_ids": [ + 9 + ], + "source": "Name: large language models\nType: TECHNOLOGY", + "target": "Name: gemini 2 5\nType: PRODUCT" + }, + { + 
"src_entity_name": "large language models", + "tgt_entity_name": "qa system", + "relation_name": "", + "weight": 9.0, + "description": "large language models are used to build qa systems", + "source_ids": [ + 9 + ], + "source": "Name: large language models\nType: TECHNOLOGY", + "target": "Name: qa system\nType: PRODUCT" + }, + { + "src_entity_name": "retrieval augmented generation for large language models a survey", + "tgt_entity_name": "large language models", + "relation_name": "", + "weight": 9.0, + "description": "the survey covers the topic of large language models", + "source_ids": [ + 207 + ], + "source": "Name: large language models\nType: TECHNOLOGY", + "target": "Name: retrieval augmented generation for large language models a survey\nType: BOOK" + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "gheorghe comanici is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ], + "source": "Name: gemini 2 5\nType: PRODUCT", + "target": "Name: gheorghe comanici\nType: PERSON" + }, + { + "src_entity_name": "eric bieber", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "eric bieber is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ], + "source": "Name: gemini 2 5\nType: PRODUCT", + "target": "Name: eric bieber\nType: PERSON" + }, + { + "src_entity_name": "mike schaekermann", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "mike schaekermann is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ], + "source": "Name: gemini 2 5\nType: PRODUCT", + "target": "Name: mike schaekermann\nType: PERSON" + }, + { + "src_entity_name": "ice pasupat", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "ice pasupat is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ], 
+ "source": "Name: gemini 2 5\nType: PRODUCT", + "target": "Name: ice pasupat\nType: PERSON" + }, + { + "src_entity_name": "noveen sachdeva", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "noveen sachdeva is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ], + "source": "Name: gemini 2 5\nType: PRODUCT", + "target": "Name: noveen sachdeva\nType: PERSON" + }, + { + "src_entity_name": "inderjit dhillon", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "inderjit dhillon is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ], + "source": "Name: gemini 2 5\nType: PRODUCT", + "target": "Name: inderjit dhillon\nType: PERSON" + }, + { + "src_entity_name": "marcel blistein", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "marcel blistein is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ], + "source": "Name: gemini 2 5\nType: PRODUCT", + "target": "Name: marcel blistein\nType: PERSON" + }, + { + "src_entity_name": "ori ram", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "ori ram is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ], + "source": "Name: gemini 2 5\nType: PRODUCT", + "target": "Name: ori ram\nType: PERSON" + }, + { + "src_entity_name": "dan zhang", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "dan zhang is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ], + "source": "Name: gemini 2 5\nType: PRODUCT", + "target": "Name: dan zhang\nType: PERSON" + }, + { + "src_entity_name": "evan rosen", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "evan rosen is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ], + "source": "Name: gemini 2 5\nType: PRODUCT", + 
"target": "Name: evan rosen\nType: PERSON" + }, + { + "src_entity_name": "gemini 2 5", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "the paper describing gemini 2 5 is published on arxiv", + "source_ids": [ + 203 + ], + "source": "Name: gemini 2 5\nType: PRODUCT", + "target": "Name: arxiv\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "gemini 2 5", + "tgt_entity_name": "advanced reasoning", + "relation_name": "", + "weight": 10.0, + "description": "gemini 2 5 is described as having advanced reasoning capabilities", + "source_ids": [ + 203 + ], + "source": "Name: gemini 2 5\nType: PRODUCT", + "target": "Name: advanced reasoning\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "gemini 2 5", + "tgt_entity_name": "multimodality", + "relation_name": "", + "weight": 10.0, + "description": "gemini 2 5 is described as having multimodality capabilities", + "source_ids": [ + 203 + ], + "source": "Name: gemini 2 5\nType: PRODUCT", + "target": "Name: multimodality\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "gemini 2 5", + "tgt_entity_name": "long context", + "relation_name": "", + "weight": 10.0, + "description": "gemini 2 5 is described as having long context capabilities", + "source_ids": [ + 203 + ], + "source": "Name: gemini 2 5\nType: PRODUCT", + "target": "Name: long context\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "gemini 2 5", + "tgt_entity_name": "next generation agentic capabilities", + "relation_name": "", + "weight": 10.0, + "description": "gemini 2 5 is described as having next generation agentic capabilities", + "source_ids": [ + 203 + ], + "source": "Name: gemini 2 5\nType: PRODUCT", + "target": "Name: next generation agentic capabilities\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "gemini 2 5 pushing the frontier with advanced reasoning multimodality long context and next generation agentic capabilities", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + 
"weight": 10.0, + "description": "the title refers to the product gemini 2 5", + "source_ids": [ + 203 + ], + "source": "Name: gemini 2 5\nType: PRODUCT", + "target": "Name: gemini 2 5 pushing the frontier with advanced reasoning multimodality long context and next generation agentic capabilities\nType: BOOK" + }, + { + "src_entity_name": "qa system", + "tgt_entity_name": "users", + "relation_name": "", + "weight": 8.0, + "description": "qa systems are designed to assist users", + "source_ids": [ + 9 + ], + "source": "Name: qa system\nType: PRODUCT", + "target": "Name: users\nType: PERSON" + }, + { + "src_entity_name": "creative commons by nc nd 4 0 international license", + "tgt_entity_name": "vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "the work licensed under the creative commons by nc nd 4 0 international license has its publication rights licensed to the vldb endowment", + "source_ids": [ + 10 + ], + "source": "Name: creative commons by nc nd 4 0 international license\nType: LAW", + "target": "Name: vldb endowment\nType: ORGANIZATION" + }, + { + "src_entity_name": "creative commons", + "tgt_entity_name": "creative commons by nc nd 4 0 international license", + "relation_name": "", + "weight": 9.0, + "description": "creative commons is the creator of the by nc nd 4 0 international license", + "source_ids": [ + 10 + ], + "source": "Name: creative commons by nc nd 4 0 international license\nType: LAW", + "target": "Name: creative commons\nType: ORGANIZATION" + }, + { + "src_entity_name": "owner author s", + "tgt_entity_name": "creative commons by nc nd 4 0 international license", + "relation_name": "", + "weight": 8.0, + "description": "the owner author s hold the copyright for the work which is licensed under the creative commons by nc nd 4 0 international license", + "source_ids": [ + 10 + ], + "source": "Name: creative commons by nc nd 4 0 international license\nType: LAW", + "target": "Name: owner author s\nType: PERSON" + }, + { 
+ "src_entity_name": "owner author s", + "tgt_entity_name": "vldb endowment", + "relation_name": "", + "weight": 7.0, + "description": "the owner author s hold the copyright while the vldb endowment is licensed the publication rights", + "source_ids": [ + 10 + ], + "source": "Name: vldb endowment\nType: ORGANIZATION", + "target": "Name: owner author s\nType: PERSON" + }, + { + "src_entity_name": "proceedings of the vldb endowment", + "tgt_entity_name": "vldb endowment", + "relation_name": "", + "weight": 9.0, + "description": "proceedings of the vldb endowment is published by the vldb endowment organization", + "source_ids": [ + 191 + ], + "source": "Name: vldb endowment\nType: ORGANIZATION", + "target": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "proceedings of the vldb endowment", + "tgt_entity_name": "vol 19", + "relation_name": "", + "weight": 9.0, + "description": "vol 19 is the volume associated with the proceedings of the vldb endowment", + "source_ids": [ + 11 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: vol 19\nType: MEASUREMENT" + }, + { + "src_entity_name": "proceedings of the vldb endowment", + "tgt_entity_name": "no 1", + "relation_name": "", + "weight": 9.0, + "description": "no 1 is the issue number associated with the proceedings of the vldb endowment", + "source_ids": [ + 11 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: no 1\nType: MEASUREMENT" + }, + { + "src_entity_name": "proceedings of the vldb endowment", + "tgt_entity_name": "issn 2150 8097", + "relation_name": "", + "weight": 10.0, + "description": "issn 2150 8097 is the identifier for the proceedings of the vldb endowment", + "source_ids": [ + 11 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: issn 2150 8097\nType: MEASUREMENT" + }, + { + "src_entity_name": 
"proceedings of the vldb endowment", + "tgt_entity_name": "doi xx xx xxx xx", + "relation_name": "", + "weight": 9.0, + "description": "doi xx xx xxx xx is the identifier for the specific article within the proceedings of the vldb endowment", + "source_ids": [ + 11 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: doi xx xx xxx xx\nType: MEASUREMENT" + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "simran arora is an author of a paper published in the proceedings of the vldb endowment", + "source_ids": [ + 191 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: simran arora\nType: PERSON" + }, + { + "src_entity_name": "2023", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "the proceedings of the vldb endowment volume 17 issue 2 was published in 2023", + "source_ids": [ + 191 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: 2023\nType: DATE" + }, + { + "src_entity_name": "chengliang chai", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "chengliang chai is an author of a paper published in the proceedings of the vldb endowment", + "source_ids": [ + 197 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: chengliang chai\nType: PERSON" + }, + { + "src_entity_name": "jiajun li", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "jiajun li is an author of a paper published in the proceedings of the vldb endowment", + "source_ids": [ + 197 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: jiajun 
li\nType: PERSON" + }, + { + "src_entity_name": "yuhao deng", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "yuhao deng is an author of a paper published in the proceedings of the vldb endowment", + "source_ids": [ + 197 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: yuhao deng\nType: PERSON" + }, + { + "src_entity_name": "yuanhao zhong", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "yuanhao zhong is an author of a paper published in the proceedings of the vldb endowment", + "source_ids": [ + 197 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: yuanhao zhong\nType: PERSON" + }, + { + "src_entity_name": "ye yuan", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "ye yuan is an author of a paper published in the proceedings of the vldb endowment", + "source_ids": [ + 197 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: ye yuan\nType: PERSON" + }, + { + "src_entity_name": "guoren wang", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "guoren wang is an author of a paper published in the proceedings of the vldb endowment", + "source_ids": [ + 197 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: guoren wang\nType: PERSON" + }, + { + "src_entity_name": "lei cao", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "lei cao is an author of a paper published in the proceedings of the vldb endowment", + "source_ids": [ + 197 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: 
lei cao\nType: PERSON" + }, + { + "src_entity_name": "doctopus", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 10.0, + "description": "the doctopus paper was published in the proceedings of the vldb endowment", + "source_ids": [ + 197 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: doctopus\nType: PRODUCT" + }, + { + "src_entity_name": "proceedings of the vldb endowment", + "tgt_entity_name": "18", + "relation_name": "", + "weight": 9.0, + "description": "the proceedings of the vldb endowment volume 18 contains the paper", + "source_ids": [ + 197 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: 18\nType: MEASUREMENT" + }, + { + "src_entity_name": "proceedings of the vldb endowment", + "tgt_entity_name": "11", + "relation_name": "", + "weight": 9.0, + "description": "the proceedings of the vldb endowment issue 11 contains the paper", + "source_ids": [ + 197 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: 11\nType: MEASUREMENT" + }, + { + "src_entity_name": "proceedings of the vldb endowment", + "tgt_entity_name": "3695 3707", + "relation_name": "", + "weight": 9.0, + "description": "the paper appears on pages 3695 3707 of the proceedings of the vldb endowment", + "source_ids": [ + 197 + ], + "source": "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "target": "Name: 3695 3707\nType: MEASUREMENT" + }, + { + "src_entity_name": "figure 1", + "tgt_entity_name": "existing methods", + "relation_name": "", + "weight": 9.0, + "description": "figure 1 displays a comparison involving existing methods", + "source_ids": [ + 12 + ], + "source": "Name: figure 1\nType: IMAGE", + "target": "Name: existing methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "figure 1", + "tgt_entity_name": "rag", + "relation_name": "", + "weight": 8.0, + 
"description": "figure 1 illustrates the existing rag approaches for document level qa", + "source_ids": [ + 15 + ], + "source": "Name: figure 1\nType: IMAGE", + "target": "Name: rag\nType: TECHNOLOGY" + }, + { + "src_entity_name": "existing methods", + "tgt_entity_name": "complex document qa", + "relation_name": "", + "weight": 8.0, + "description": "existing methods are techniques used for the task of complex document qa", + "source_ids": [ + 12 + ], + "source": "Name: existing methods\nType: METHOD_OR_TECHNIQUE", + "target": "Name: complex document qa\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "rag systems", + "tgt_entity_name": "complex document qa", + "relation_name": "", + "weight": 6.0, + "description": "the text mentions reviewing the workflow of rag systems in the context of formalizing the research problem of complex document qa", + "source_ids": [ + 35 + ], + "source": "Name: complex document qa\nType: TASK_OR_PROBLEM", + "target": "Name: rag systems\nType: TECHNOLOGY" + }, + { + "src_entity_name": "complex document qa", + "tgt_entity_name": "3.1 problem formulation", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Complex Document QA' is the primary topic and subject of the problem formulation detailed in section 3.1.", + "source_ids": [ + 36 + ], + "source": "Name: complex document qa\nType: TASK_OR_PROBLEM", + "target": "Name: 3.1 problem formulation\nType: SECTION_TITLE" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "complex query", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Complex Query", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: complex query\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "complex multi-page document", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Complex 
Multi-page Document", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: complex multi-page document\nType: PRODUCT" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "text-only rag", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Text-Only RAG", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: text-only rag\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "plain text extraction (ocr)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Plain Text Extraction (OCR)", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: plain text extraction (ocr)\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "unstructured chunks", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Unstructured Chunks", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: unstructured chunks\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "text index (vector/graph/tree)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Text Index (Vector/Graph/Tree)", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: text index (vector/graph/tree)\nType: SYSTEM_COMPONENT" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "fixed/ graph retrieval", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Fixed/ Graph Retrieval", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: fixed/ graph 
retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to LLM", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: llm\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "fails on structural dependencies", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Fails on Structural dependencies", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: fails on structural dependencies\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "layout segmented rag", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Layout Segmented RAG", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: layout segmented rag\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "layout analysis & parsing", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Layout Analysis & Parsing", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: layout analysis & parsing\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "flattened chunks", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Flattened Chunks", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: flattened chunks\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "flattened vector index", + "relation_name": "", + "weight": 9.0, + 
"description": "Image entity cref='#/texts/14' related to Flattened Vector Index", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: flattened vector index\nType: SYSTEM_COMPONENT" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "fixed retrieval", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Fixed Retrieval", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: fixed retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "loses complex relationships", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Loses complex relationships", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: loses complex relationships\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "bookrag (natively structure-aware)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to BookRAG (Natively Structure-aware)", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: bookrag (natively structure-aware)\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "hierarchical chunks", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Hierarchical Chunks", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: hierarchical chunks\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to BookIndex", + "source_ids": [ + 13 + ], + "source": "Name: 
cref='#/texts/14'\nType: IMAGE", + "target": "Name: bookindex\nType: SYSTEM_COMPONENT" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "agent-based retrieval", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Agent-based Retrieval", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: agent-based retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "accurate, structured-grounded", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Accurate, structured-grounded", + "source_ids": [ + 13 + ], + "source": "Name: cref='#/texts/14'\nType: IMAGE", + "target": "Name: accurate, structured-grounded\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "decompose", + "tgt_entity_name": "complex query", + "relation_name": "", + "weight": 10.0, + "description": "decompose takes a complex query as its input to break it down", + "source_ids": [ + 98 + ], + "source": "Name: complex query\nType: TASK_OR_PROBLEM", + "target": "Name: decompose\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 10.0, + "description": "docetl uses llm powered operations to create processing pipelines", + "source_ids": [ + 18 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: docetl\nType: SOFTWARE" + }, + { + "src_entity_name": "section filtering", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 10.0, + "description": "section filtering utilizes an llm to analyze content and layout features of candidates", + "source_ids": [ + 57 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: section filtering\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "title", + "relation_name": "", + "weight": 9.0, + 
"description": "llm analyzes title candidates to determine their actual hierarchical level and final node type", + "source_ids": [ + 57 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: title\nType: SECTION_TITLE" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "text", + "relation_name": "", + "weight": 8.0, + "description": "llm may re classify erroneous title blocks as text", + "source_ids": [ + 57 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: text\nType: SECTION_TITLE" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "l", + "relation_name": "", + "weight": 8.0, + "description": "llm determines the hierarchical level l for each candidate", + "source_ids": [ + 57 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: l\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "c", + "relation_name": "", + "weight": 8.0, + "description": "llm analyzes the content c of the candidates", + "source_ids": [ + 57 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: c\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "f", + "relation_name": "", + "weight": 8.0, + "description": "llm analyzes the layout features f of the candidates", + "source_ids": [ + 57 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: f\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "b title", + "relation_name": "", + "weight": 9.0, + "description": "the llm analyzes the candidate subset b title to determine properties", + "source_ids": [ + 57 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: b title\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "", + "relation_name": "", + "weight": 7.0, + "description": "the llm uses to identify blocks as title candidates", + "source_ids": [ + 
57 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: \nType: UNKNOWN" + }, + { + "src_entity_name": "section filtering phase", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 10.0, + "description": "the section filtering phase uses the llm to analyze title candidates", + "source_ids": [ + 59 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: section filtering phase\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "method", + "relation_name": "", + "weight": 8.0, + "description": "the llm correctly identifies method as a section node", + "source_ids": [ + 59 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: method\nType: SECTION_TITLE" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "experiment", + "relation_name": "", + "weight": 8.0, + "description": "the llm correctly identifies experiment as a section node", + "source_ids": [ + 59 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: experiment\nType: SECTION_TITLE" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "moe layer", + "relation_name": "", + "weight": 9.0, + "description": "the llm re classifies moe layer from a title to a text node", + "source_ids": [ + 59 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: moe layer\nType: SECTION_TITLE" + }, + { + "src_entity_name": "p dec", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "p dec is used to guide the llm for the decomposition task", + "source_ids": [ + 101 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: p dec\nType: SOFTWARE" + }, + { + "src_entity_name": "p ext", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "p ext is used to guide the llm for the extraction task", + "source_ids": [ + 101 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + 
"target": "Name: p ext\nType: SOFTWARE" + }, + { + "src_entity_name": "q", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 7.0, + "description": "q is the original user query that the llm processes", + "source_ids": [ + 101 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: q\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "prompt", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 8.0, + "description": "prompts are used to guide the llm", + "source_ids": [ + 101 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: prompt\nType: SOFTWARE" + }, + { + "src_entity_name": "decomposition", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "the llm performs the decomposition task", + "source_ids": [ + 101 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: decomposition\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "extraction", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "the llm performs the extraction task", + "source_ids": [ + 101 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: extraction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "human annotators", + "relation_name": "", + "weight": 8.0, + "description": "the llm generates questions which are then answered and refined by human annotators", + "source_ids": [ + 141 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: human annotators\nType: PERSON" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "table 4", + "relation_name": "", + "weight": 6.0, + "description": "the llm s generated questions contribute to the statistics presented in table 4", + "source_ids": [ + 141 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: table 4\nType: TABLE" + }, + { + "src_entity_name": "llm", + 
"tgt_entity_name": "tables", + "relation_name": "", + "weight": 9.0, + "description": "the llm generates questions from tables", + "source_ids": [ + 141 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: tables\nType: TABLE" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "figures", + "relation_name": "", + "weight": 9.0, + "description": "the llm generates questions from figures", + "source_ids": [ + 141 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: figures\nType: IMAGE" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "global level questions", + "relation_name": "", + "weight": 10.0, + "description": "the llm generates global level questions", + "source_ids": [ + 141 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: global level questions\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "qwen family", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "the qwen family includes llms used in the experiments", + "source_ids": [ + 238 + ], + "source": "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: qwen family\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "llms", + "tgt_entity_name": "financial auditing", + "relation_name": "", + "weight": 8.0, + "description": "llms are applied in financial auditing but may miss domain knowledge", + "source_ids": [ + 14 + ], + "source": "Name: financial auditing\nType: TASK_OR_PROBLEM", + "target": "Name: llms\nType: TECHNOLOGY" + }, + { + "src_entity_name": "llms", + "tgt_entity_name": "legal compliance", + "relation_name": "", + "weight": 8.0, + "description": "llms are applied in legal compliance but may miss domain knowledge", + "source_ids": [ + 14 + ], + "source": "Name: legal compliance\nType: TASK_OR_PROBLEM", + "target": "Name: llms\nType: TECHNOLOGY" + }, + { + "src_entity_name": "llms", + "tgt_entity_name": "scientific discovery", + "relation_name": "", 
+ "weight": 8.0, + "description": "llms are applied in scientific discovery but may miss domain knowledge", + "source_ids": [ + 14 + ], + "source": "Name: scientific discovery\nType: TASK_OR_PROBLEM", + "target": "Name: llms\nType: TECHNOLOGY" + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "llms", + "relation_name": "", + "weight": 9.0, + "description": "rag is used to guide llms during response generation to address their limitations", + "source_ids": [ + 14 + ], + "source": "Name: llms\nType: TECHNOLOGY", + "target": "Name: retrieval augmented generation\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "rag", + "tgt_entity_name": "llms", + "relation_name": "", + "weight": 9.0, + "description": "rag is used to guide llms during response generation to address their limitations", + "source_ids": [ + 14 + ], + "source": "Name: llms\nType: TECHNOLOGY", + "target": "Name: rag\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "naive rag technique", + "tgt_entity_name": "llms", + "relation_name": "", + "weight": 10.0, + "description": "the naive rag technique mitigates the hallucination of llms", + "source_ids": [ + 33 + ], + "source": "Name: llms\nType: TECHNOLOGY", + "target": "Name: naive rag technique\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "llms", + "relation_name": "", + "weight": 8.0, + "description": "relying on llms for high accuracy judgments in er methods can lead to prohibitively slow and computationally expensive processes", + "source_ids": [ + 66 + ], + "source": "Name: llms\nType: TECHNOLOGY", + "target": "Name: er methods\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "enterprise scenarios", + "relation_name": "", + "weight": 8.0, + "description": "rag is widely adopted in real world enterprise scenarios", + "source_ids": [ + 14 + ], + "source": "Name: retrieval augmented 
generation\nType: METHOD_OR_TECHNIQUE", + "target": "Name: enterprise scenarios\nType: LOCATION" + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "external sources", + "relation_name": "", + "weight": 9.0, + "description": "rag retrieves relevant domain knowledge from external sources", + "source_ids": [ + 14 + ], + "source": "Name: retrieval augmented generation\nType: METHOD_OR_TECHNIQUE", + "target": "Name: external sources\nType: LOCATION" + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "response generation", + "relation_name": "", + "weight": 9.0, + "description": "rag is used to guide the llm during response generation", + "source_ids": [ + 14 + ], + "source": "Name: retrieval augmented generation\nType: METHOD_OR_TECHNIQUE", + "target": "Name: response generation\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "domain knowledge", + "relation_name": "", + "weight": 9.0, + "description": "rag retrieves domain knowledge to address llm limitations", + "source_ids": [ + 14 + ], + "source": "Name: retrieval augmented generation\nType: METHOD_OR_TECHNIQUE", + "target": "Name: domain knowledge\nType: CONCEPT" + }, + { + "src_entity_name": "g retriever", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 10.0, + "description": "g retriever utilizes the retrieval augmented generation method", + "source_ids": [ + 211 + ], + "source": "Name: retrieval augmented generation\nType: METHOD_OR_TECHNIQUE", + "target": "Name: g retriever\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "section 3", + "tgt_entity_name": "rag", + "relation_name": "", + "weight": 10.0, + "description": "section 3 introduces the rag workflow", + "source_ids": [ + 29 + ], + "source": "Name: rag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: section 3\nType: SECTION_TITLE" + }, + { + "src_entity_name": "enterprise 
scenarios", + "tgt_entity_name": "technical handbooks", + "relation_name": "", + "weight": 9.0, + "description": "domain knowledge in enterprise scenarios is often stored in technical handbooks", + "source_ids": [ + 14 + ], + "source": "Name: enterprise scenarios\nType: LOCATION", + "target": "Name: technical handbooks\nType: PRODUCT" + }, + { + "src_entity_name": "enterprise scenarios", + "tgt_entity_name": "api reference manuals", + "relation_name": "", + "weight": 9.0, + "description": "domain knowledge in enterprise scenarios is often stored in api reference manuals", + "source_ids": [ + 14 + ], + "source": "Name: enterprise scenarios\nType: LOCATION", + "target": "Name: api reference manuals\nType: PRODUCT" + }, + { + "src_entity_name": "enterprise scenarios", + "tgt_entity_name": "operational guidebooks", + "relation_name": "", + "weight": 9.0, + "description": "domain knowledge in enterprise scenarios is often stored in operational guidebooks", + "source_ids": [ + 14 + ], + "source": "Name: enterprise scenarios\nType: LOCATION", + "target": "Name: operational guidebooks\nType: PRODUCT" + }, + { + "src_entity_name": "enterprise scenarios", + "tgt_entity_name": "long form documents", + "relation_name": "", + "weight": 9.0, + "description": "domain knowledge in enterprise scenarios is stored in long form documents", + "source_ids": [ + 14 + ], + "source": "Name: enterprise scenarios\nType: LOCATION", + "target": "Name: long form documents\nType: PRODUCT" + }, + { + "src_entity_name": "technical handbooks", + "tgt_entity_name": "books", + "relation_name": "", + "weight": 7.0, + "description": "technical handbooks follow the structure of books", + "source_ids": [ + 14 + ], + "source": "Name: technical handbooks\nType: PRODUCT", + "target": "Name: books\nType: PRODUCT" + }, + { + "src_entity_name": "rag system", + "tgt_entity_name": "technical handbooks", + "relation_name": "", + "weight": 8.0, + "description": "the rag system is designed to handle qa over 
documents like technical handbooks", + "source_ids": [ + 14 + ], + "source": "Name: technical handbooks\nType: PRODUCT", + "target": "Name: rag system\nType: SOFTWARE" + }, + { + "src_entity_name": "api reference manuals", + "tgt_entity_name": "books", + "relation_name": "", + "weight": 7.0, + "description": "api reference manuals follow the structure of books", + "source_ids": [ + 14 + ], + "source": "Name: api reference manuals\nType: PRODUCT", + "target": "Name: books\nType: PRODUCT" + }, + { + "src_entity_name": "rag system", + "tgt_entity_name": "api reference manuals", + "relation_name": "", + "weight": 8.0, + "description": "the rag system is designed to handle qa over documents like api reference manuals", + "source_ids": [ + 14 + ], + "source": "Name: api reference manuals\nType: PRODUCT", + "target": "Name: rag system\nType: SOFTWARE" + }, + { + "src_entity_name": "operational guidebooks", + "tgt_entity_name": "books", + "relation_name": "", + "weight": 7.0, + "description": "operational guidebooks follow the structure of books", + "source_ids": [ + 14 + ], + "source": "Name: operational guidebooks\nType: PRODUCT", + "target": "Name: books\nType: PRODUCT" + }, + { + "src_entity_name": "rag system", + "tgt_entity_name": "operational guidebooks", + "relation_name": "", + "weight": 8.0, + "description": "the rag system is designed to handle qa over documents like operational guidebooks", + "source_ids": [ + 14 + ], + "source": "Name: operational guidebooks\nType: PRODUCT", + "target": "Name: rag system\nType: SOFTWARE" + }, + { + "src_entity_name": "books", + "tgt_entity_name": "tables of contents", + "relation_name": "", + "weight": 8.0, + "description": "books are characterized by explicit tables of contents", + "source_ids": [ + 14 + ], + "source": "Name: books\nType: PRODUCT", + "target": "Name: tables of contents\nType: PRODUCT" + }, + { + "src_entity_name": "books", + "tgt_entity_name": "nested chapters", + "relation_name": "", + "weight": 8.0, + 
"description": "books are characterized by nested chapters", + "source_ids": [ + 14 + ], + "source": "Name: books\nType: PRODUCT", + "target": "Name: nested chapters\nType: PRODUCT" + }, + { + "src_entity_name": "books", + "tgt_entity_name": "multi level sections", + "relation_name": "", + "weight": 8.0, + "description": "books are characterized by multi level sections", + "source_ids": [ + 14 + ], + "source": "Name: books\nType: PRODUCT", + "target": "Name: multi level sections\nType: PRODUCT" + }, + { + "src_entity_name": "rag system", + "tgt_entity_name": "qa", + "relation_name": "", + "weight": 10.0, + "description": "the rag system is designed for qa over long and highly structured documents", + "source_ids": [ + 14 + ], + "source": "Name: rag system\nType: SOFTWARE", + "target": "Name: qa\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "this paper", + "tgt_entity_name": "rag system", + "relation_name": "", + "weight": 10.0, + "description": "this paper aims to design an effective rag system", + "source_ids": [ + 14 + ], + "source": "Name: rag system\nType: SOFTWARE", + "target": "Name: this paper\nType: BOOK" + }, + { + "src_entity_name": "rag system", + "tgt_entity_name": "long form documents", + "relation_name": "", + "weight": 9.0, + "description": "the rag system is designed for qa over long and highly structured documents", + "source_ids": [ + 14 + ], + "source": "Name: rag system\nType: SOFTWARE", + "target": "Name: long form documents\nType: PRODUCT" + }, + { + "src_entity_name": "qa", + "tgt_entity_name": "exact match", + "relation_name": "", + "weight": 9.0, + "description": "exact match is a metric used to evaluate the qa task", + "source_ids": [ + 144 + ], + "source": "Name: qa\nType: TASK_OR_PROBLEM", + "target": "Name: exact match\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "qa", + "tgt_entity_name": "accuracy", + "relation_name": "", + "weight": 9.0, + "description": "accuracy is a metric used to evaluate the qa task", + 
"source_ids": [ + 144 + ], + "source": "Name: qa\nType: TASK_OR_PROBLEM", + "target": "Name: accuracy\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "qa", + "tgt_entity_name": "token based f1 score", + "relation_name": "", + "weight": 9.0, + "description": "token based f1 score is a metric used to evaluate the qa task", + "source_ids": [ + 144 + ], + "source": "Name: qa\nType: TASK_OR_PROBLEM", + "target": "Name: token based f1 score\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "qa", + "tgt_entity_name": "em", + "relation_name": "", + "weight": 9.0, + "description": "em is used to measure the performance of the qa task", + "source_ids": [ + 170 + ], + "source": "Name: qa\nType: TASK_OR_PROBLEM", + "target": "Name: em\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "qa", + "tgt_entity_name": "f1", + "relation_name": "", + "weight": 9.0, + "description": "f1 is used to measure the performance of the qa task", + "source_ids": [ + 170 + ], + "source": "Name: qa\nType: TASK_OR_PROBLEM", + "target": "Name: f1\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "qa", + "relation_name": "", + "weight": 9.0, + "description": "figure 7 presents the performance breakdown specifically for the qa task", + "source_ids": [ + 177 + ], + "source": "Name: qa\nType: TASK_OR_PROBLEM", + "target": "Name: figure 7\nType: IMAGE" + }, + { + "src_entity_name": "qa", + "tgt_entity_name": "query types", + "relation_name": "", + "weight": 8.0, + "description": "the qa task performance is analyzed across different query types", + "source_ids": [ + 177 + ], + "source": "Name: qa\nType: TASK_OR_PROBLEM", + "target": "Name: query types\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "long form documents", + "tgt_entity_name": "intricate layouts", + "relation_name": "", + "weight": 8.0, + "description": "long form documents are characterized by intricate layouts", + "source_ids": [ + 14 + ], + "source": "Name: long form 
documents\nType: PRODUCT", + "target": "Name: intricate layouts\nType: SHAPE" + }, + { + "src_entity_name": "long form documents", + "tgt_entity_name": "logical hierarchies", + "relation_name": "", + "weight": 8.0, + "description": "long form documents are characterized by rigorous logical hierarchies", + "source_ids": [ + 14 + ], + "source": "Name: long form documents\nType: PRODUCT", + "target": "Name: logical hierarchies\nType: CONCEPT" + }, + { + "src_entity_name": "rag", + "tgt_entity_name": "ocr", + "relation_name": "", + "weight": 8.0, + "description": "rag approaches generally rely on ocr to convert documents into plain text before application", + "source_ids": [ + 15 + ], + "source": "Name: rag\nType: TECHNOLOGY", + "target": "Name: ocr\nType: TECHNOLOGY" + }, + { + "src_entity_name": "rag", + "tgt_entity_name": "graph based rag", + "relation_name": "", + "weight": 9.0, + "description": "state of the art rag methods increasingly adopt graph based rag approaches", + "source_ids": [ + 15 + ], + "source": "Name: rag\nType: TECHNOLOGY", + "target": "Name: graph based rag\nType: TECHNOLOGY" + }, + { + "src_entity_name": "rag", + "tgt_entity_name": "document level qa", + "relation_name": "", + "weight": 10.0, + "description": "rag approaches are designed for document level qa tasks", + "source_ids": [ + 15 + ], + "source": "Name: rag\nType: TECHNOLOGY", + "target": "Name: document level qa\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "ocr", + "tgt_entity_name": "plain text", + "relation_name": "", + "weight": 10.0, + "description": "ocr converts documents into plain text", + "source_ids": [ + 15 + ], + "source": "Name: ocr\nType: TECHNOLOGY", + "target": "Name: plain text\nType: MATERIAL" + }, + { + "src_entity_name": "text based rag method", + "tgt_entity_name": "graph based rag", + "relation_name": "", + "weight": 9.0, + "description": "graph based rag is a specific type of text based rag method", + "source_ids": [ + 15 + ], + "source": "Name: graph 
based rag\nType: TECHNOLOGY", + "target": "Name: text based rag method\nType: TECHNOLOGY" + }, + { + "src_entity_name": "graph based rag", + "tgt_entity_name": "graph data", + "relation_name": "", + "weight": 9.0, + "description": "graph based rag uses graph data as an external knowledge source", + "source_ids": [ + 15 + ], + "source": "Name: graph based rag\nType: TECHNOLOGY", + "target": "Name: graph data\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "graph based rag", + "tgt_entity_name": "raptor", + "relation_name": "", + "weight": 9.0, + "description": "raptor is selected as a specific instance of graph based rag methods", + "source_ids": [ + 147 + ], + "source": "Name: graph based rag\nType: TECHNOLOGY", + "target": "Name: raptor\nType: TECHNOLOGY" + }, + { + "src_entity_name": "graph based rag", + "tgt_entity_name": "graphrag", + "relation_name": "", + "weight": 9.0, + "description": "graphrag is selected as a specific instance of graph based rag methods", + "source_ids": [ + 147 + ], + "source": "Name: graph based rag\nType: TECHNOLOGY", + "target": "Name: graphrag\nType: TECHNOLOGY" + }, + { + "src_entity_name": "graph based rag", + "tgt_entity_name": "documents", + "relation_name": "", + "weight": 9.0, + "description": "graph based rag extracts textual content from documents", + "source_ids": [ + 147 + ], + "source": "Name: graph based rag\nType: TECHNOLOGY", + "target": "Name: documents\nType: PRODUCT" + }, + { + "src_entity_name": "graph based rag", + "tgt_entity_name": "graph data", + "relation_name": "", + "weight": 9.0, + "description": "graph based rag leverages graph data during retrieval", + "source_ids": [ + 147 + ], + "source": "Name: graph based rag\nType: TECHNOLOGY", + "target": "Name: graph data\nType: TECHNOLOGY" + }, + { + "src_entity_name": "graph based rag", + "tgt_entity_name": "retrieval", + "relation_name": "", + "weight": 8.0, + "description": "graph based rag performs retrieval as part of its process", + "source_ids": [ + 
147 + ], + "source": "Name: graph based rag\nType: TECHNOLOGY", + "target": "Name: retrieval\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "graphrag", + "tgt_entity_name": "leiden community detection algorithm", + "relation_name": "", + "weight": 10.0, + "description": "graphrag applies the leiden community detection algorithm to obtain hierarchical clusters", + "source_ids": [ + 15 + ], + "source": "Name: graphrag\nType: PRODUCT", + "target": "Name: leiden community detection algorithm\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "table 1", + "tgt_entity_name": "graphrag", + "relation_name": "", + "weight": 9.0, + "description": "table 1 lists graphrag as a representative method", + "source_ids": [ + 15 + ], + "source": "Name: graphrag\nType: PRODUCT", + "target": "Name: table 1\nType: TABLE" + }, + { + "src_entity_name": "graphrag", + "tgt_entity_name": "knowledge graph", + "relation_name": "", + "weight": 10.0, + "description": "graphrag constructs a knowledge graph from a textual corpus", + "source_ids": [ + 15 + ], + "source": "Name: graphrag\nType: PRODUCT", + "target": "Name: knowledge graph\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "graphrag", + "tgt_entity_name": "textual corpus", + "relation_name": "", + "weight": 9.0, + "description": "graphrag uses a textual corpus as the source for constructing a knowledge graph", + "source_ids": [ + 15 + ], + "source": "Name: graphrag\nType: PRODUCT", + "target": "Name: textual corpus\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "graphrag", + "tgt_entity_name": "summaries", + "relation_name": "", + "weight": 9.0, + "description": "graphrag generates summaries for each community", + "source_ids": [ + 15 + ], + "source": "Name: graphrag\nType: PRODUCT", + "target": "Name: summaries\nType: PRODUCT" + }, + { + "src_entity_name": "table 1", + "tgt_entity_name": "raptor", + "relation_name": "", + "weight": 9.0, + "description": "table 1 lists raptor as a representative method", 
+ "source_ids": [ + 15 + ], + "source": "Name: raptor\nType: PRODUCT", + "target": "Name: table 1\nType: TABLE" + }, + { + "src_entity_name": "raptor", + "tgt_entity_name": "recursive tree structure", + "relation_name": "", + "weight": 10.0, + "description": "raptor builds a recursive tree structure", + "source_ids": [ + 15 + ], + "source": "Name: raptor\nType: PRODUCT", + "target": "Name: recursive tree structure\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "raptor", + "tgt_entity_name": "document chunks", + "relation_name": "", + "weight": 10.0, + "description": "raptor iteratively clusters document chunks", + "source_ids": [ + 15 + ], + "source": "Name: raptor\nType: PRODUCT", + "target": "Name: document chunks\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "raptor", + "tgt_entity_name": "fine grained semantic information", + "relation_name": "", + "weight": 9.0, + "description": "raptor captures fine grained semantic information across the corpus", + "source_ids": [ + 15 + ], + "source": "Name: raptor\nType: PRODUCT", + "target": "Name: fine grained semantic information\nType: CONCEPT" + }, + { + "src_entity_name": "raptor", + "tgt_entity_name": "high level semantic information", + "relation_name": "", + "weight": 9.0, + "description": "raptor captures high level semantic information across the corpus", + "source_ids": [ + 15 + ], + "source": "Name: raptor\nType: PRODUCT", + "target": "Name: high level semantic information\nType: CONCEPT" + }, + { + "src_entity_name": "leiden community detection algorithm", + "tgt_entity_name": "hierarchical clusters", + "relation_name": "", + "weight": 10.0, + "description": "the leiden community detection algorithm produces hierarchical clusters", + "source_ids": [ + 15 + ], + "source": "Name: leiden community detection algorithm\nType: METHOD_OR_TECHNIQUE", + "target": "Name: hierarchical clusters\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "table 
1", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Table 1", + "source_ids": [ + 182 + ], + "source": "Name: table 1\nType: TABLE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "knowledge graph", + "tgt_entity_name": "4.3.1 kg construction", + "relation_name": "", + "weight": 10.0, + "description": "The Knowledge Graph is the primary object being constructed in this section.", + "source_ids": [ + 63 + ], + "source": "Name: knowledge graph\nType: DATASET_OR_CORPUS", + "target": "Name: 4.3.1 kg construction\nType: SECTION_TITLE" + }, + { + "src_entity_name": "table: cref='#/texts/17'...", + "tgt_entity_name": "texts reference", + "relation_name": "", + "weight": 9.0, + "description": "Table 'Table: cref='#/texts/17'...' contains data about 'Texts Reference'.", + "source_ids": [ + 17 + ], + "source": "Name: table: cref='#/texts/17'...\nType: TABLE", + "target": "Name: texts reference\nType: SECTION_TITLE" + }, + { + "src_entity_name": "layout aware segmentation", + "tgt_entity_name": "paragraphs", + "relation_name": "", + "weight": 9.0, + "description": "layout aware segmentation parses documents into paragraphs to preserve their structure", + "source_ids": [ + 18 + ], + "source": "Name: layout aware segmentation\nType: TASK_OR_PROBLEM", + "target": "Name: paragraphs\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "layout aware segmentation", + "tgt_entity_name": "tables", + "relation_name": "", + "weight": 9.0, + "description": "layout aware segmentation parses documents into tables to preserve their structure", + "source_ids": [ + 18 + ], + "source": "Name: layout aware segmentation\nType: TASK_OR_PROBLEM", + "target": "Name: tables\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "layout aware segmentation", + "tgt_entity_name": "figures", + "relation_name": "", + "weight": 9.0, + "description": "layout aware segmentation parses documents into figures 
to preserve their structure", + "source_ids": [ + 18 + ], + "source": "Name: layout aware segmentation\nType: TASK_OR_PROBLEM", + "target": "Name: figures\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "layout aware segmentation", + "tgt_entity_name": "equations", + "relation_name": "", + "weight": 9.0, + "description": "layout aware segmentation parses documents into equations to preserve their structure", + "source_ids": [ + 18 + ], + "source": "Name: layout aware segmentation\nType: TASK_OR_PROBLEM", + "target": "Name: equations\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "layout aware segmentation", + "tgt_entity_name": "multimodal retrieval", + "relation_name": "", + "weight": 8.0, + "description": "multimodal retrieval is a typical approach applied to blocks generated by layout aware segmentation", + "source_ids": [ + 18 + ], + "source": "Name: layout aware segmentation\nType: TASK_OR_PROBLEM", + "target": "Name: multimodal retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "layout aware segmentation", + "relation_name": "", + "weight": 9.0, + "description": "docetl is a state of the art method within the category of layout aware segmentation", + "source_ids": [ + 18 + ], + "source": "Name: layout aware segmentation\nType: TASK_OR_PROBLEM", + "target": "Name: docetl\nType: SOFTWARE" + }, + { + "src_entity_name": "second paradigm", + "tgt_entity_name": "layout aware segmentation", + "relation_name": "", + "weight": 10.0, + "description": "the second paradigm is identified as layout aware segmentation in the text", + "source_ids": [ + 18 + ], + "source": "Name: layout aware segmentation\nType: TASK_OR_PROBLEM", + "target": "Name: second paradigm\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "layout aware segmentation", + "tgt_entity_name": "document native structural information", + "relation_name": "", + "weight": 9.0, + "description": "layout aware segmentation retains document native 
structural information", + "source_ids": [ + 18 + ], + "source": "Name: layout aware segmentation\nType: TASK_OR_PROBLEM", + "target": "Name: document native structural information\nType: CONCEPT" + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "declarative interface", + "relation_name": "", + "weight": 10.0, + "description": "docetl provides a declarative interface for users", + "source_ids": [ + 18 + ], + "source": "Name: docetl\nType: SOFTWARE", + "target": "Name: declarative interface\nType: SOFTWARE" + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "docetl is an llm based system for optimizing information extraction tasks", + "source_ids": [ + 32 + ], + "source": "Name: docetl\nType: SOFTWARE", + "target": "Name: llm\nType: TECHNOLOGY" + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "agentic framework", + "relation_name": "", + "weight": 10.0, + "description": "docetl introduces an agentic framework", + "source_ids": [ + 32 + ], + "source": "Name: docetl\nType: SOFTWARE", + "target": "Name: agentic framework\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "information extraction", + "relation_name": "", + "weight": 10.0, + "description": "docetl is designed to optimize complex information extraction tasks", + "source_ids": [ + 32 + ], + "source": "Name: docetl\nType: SOFTWARE", + "target": "Name: information extraction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "layoutsegmentedrag", + "tgt_entity_name": "docetl", + "relation_name": "", + "weight": 9.0, + "description": "docetl is included as a method within the layoutsegmentedrag category", + "source_ids": [ + 148 + ], + "source": "Name: docetl\nType: SOFTWARE", + "target": "Name: layoutsegmentedrag\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "page 47", + "relation_name": "", + "weight": 5.0, + "description": 
"docetl is referenced in citation page 47", + "source_ids": [ + 148 + ], + "source": "Name: docetl\nType: SOFTWARE", + "target": "Name: page 47\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "docetl", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to DocETL", + "source_ids": [ + 159 + ], + "source": "Name: docetl\nType: SOFTWARE", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "multimodal retrieval", + "tgt_entity_name": "relevant content", + "relation_name": "", + "weight": 9.0, + "description": "multimodal retrieval is used to obtain relevant content", + "source_ids": [ + 18 + ], + "source": "Name: multimodal retrieval\nType: METHOD_OR_TECHNIQUE", + "target": "Name: relevant content\nType: CONCEPT" + }, + { + "src_entity_name": "multimodal retrieval", + "tgt_entity_name": "queries", + "relation_name": "", + "weight": 8.0, + "description": "multimodal retrieval is applied to answer queries", + "source_ids": [ + 18 + ], + "source": "Name: multimodal retrieval\nType: METHOD_OR_TECHNIQUE", + "target": "Name: queries\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "first paradigm", + "tgt_entity_name": "fixed chunk size", + "relation_name": "", + "weight": 9.0, + "description": "the first paradigm uses a fixed chunk size which leads to fragmented information", + "source_ids": [ + 18 + ], + "source": "Name: first paradigm\nType: TASK_OR_PROBLEM", + "target": "Name: fixed chunk size\nType: MEASUREMENT" + }, + { + "src_entity_name": "declarative interface", + "tgt_entity_name": "processing pipelines", + "relation_name": "", + "weight": 9.0, + "description": "the declarative interface allows users to define processing pipelines", + "source_ids": [ + 18 + ], + "source": "Name: declarative interface\nType: SOFTWARE", + "target": "Name: processing pipelines\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": 
"processing pipelines", + "tgt_entity_name": "llm powered operations", + "relation_name": "", + "weight": 9.0, + "description": "processing pipelines consist of llm powered operations", + "source_ids": [ + 18 + ], + "source": "Name: processing pipelines\nType: TASK_OR_PROBLEM", + "target": "Name: llm powered operations\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "processing pipelines", + "tgt_entity_name": "task specific optimizations", + "relation_name": "", + "weight": 9.0, + "description": "processing pipelines include task specific optimizations", + "source_ids": [ + 18 + ], + "source": "Name: processing pipelines\nType: TASK_OR_PROBLEM", + "target": "Name: task specific optimizations\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "queries", + "relation_name": "", + "weight": 9.0, + "description": "the agent based retrieval approach dynamically classifies queries", + "source_ids": [ + 26 + ], + "source": "Name: queries\nType: TASK_OR_PROBLEM", + "target": "Name: agent based retrieval\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "text based approaches", + "tgt_entity_name": "l1", + "relation_name": "", + "weight": 9.0, + "description": "text based approaches suffer from the limitation l1", + "source_ids": [ + 19 + ], + "source": "Name: l1\nType: TASK_OR_PROBLEM", + "target": "Name: text based approaches\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "l1", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 9.0, + "description": "l1 concerns the failure to capture the deep connection of document structure and semantics", + "source_ids": [ + 19 + ], + "source": "Name: l1\nType: TASK_OR_PROBLEM", + "target": "Name: document\nType: PRODUCT" + }, + { + "src_entity_name": "l2", + "tgt_entity_name": "static or manually predefined workflows", + "relation_name": "", + "weight": 9.0, + "description": "l2 is caused by the application of static or manually predefined 
workflows to diverse query needs", + "source_ids": [ + 19 + ], + "source": "Name: l2\nType: TASK_OR_PROBLEM", + "target": "Name: static or manually predefined workflows\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "layout segmented methods", + "tgt_entity_name": "l2", + "relation_name": "", + "weight": 8.0, + "description": "layout segmented methods contribute to the limitation l2 by failing to capture relationships between blocks", + "source_ids": [ + 19 + ], + "source": "Name: l2\nType: TASK_OR_PROBLEM", + "target": "Name: layout segmented methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "text based approaches", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 8.0, + "description": "text based approaches analyze the document but fail to capture its structural layout", + "source_ids": [ + 19 + ], + "source": "Name: text based approaches\nType: METHOD_OR_TECHNIQUE", + "target": "Name: document\nType: PRODUCT" + }, + { + "src_entity_name": "layout segmented methods", + "tgt_entity_name": "hierarchical blocks", + "relation_name": "", + "weight": 8.0, + "description": "layout segmented methods preserve hierarchical blocks but fail to capture relationships between them", + "source_ids": [ + 19 + ], + "source": "Name: layout segmented methods\nType: METHOD_OR_TECHNIQUE", + "target": "Name: hierarchical blocks\nType: CONCEPT" + }, + { + "src_entity_name": "layout segmented methods", + "tgt_entity_name": "multi hop reasoning", + "relation_name": "", + "weight": 8.0, + "description": "layout segmented methods limit the capability for multi hop reasoning across blocks", + "source_ids": [ + 19 + ], + "source": "Name: layout segmented methods\nType: METHOD_OR_TECHNIQUE", + "target": "Name: multi hop reasoning\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "real world qa scenarios", + "tgt_entity_name": "static or manually predefined workflows", + "relation_name": "", + "weight": 8.0, + "description": "real world qa 
scenarios involve diverse queries that make static or manually predefined workflows inefficient", + "source_ids": [ + 19 + ], + "source": "Name: real world qa scenarios\nType: EVENT", + "target": "Name: static or manually predefined workflows\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "user queries", + "tgt_entity_name": "real world qa scenarios", + "relation_name": "", + "weight": 9.0, + "description": "user queries are the inputs found within real world qa scenarios", + "source_ids": [ + 19 + ], + "source": "Name: real world qa scenarios\nType: EVENT", + "target": "Name: user queries\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "static or manually predefined workflows", + "tgt_entity_name": "overall performance", + "relation_name": "", + "weight": 7.0, + "description": "applying static workflows to diverse needs affects the overall performance negatively", + "source_ids": [ + 19 + ], + "source": "Name: static or manually predefined workflows\nType: METHOD_OR_TECHNIQUE", + "target": "Name: overall performance\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "complex queries", + "tgt_entity_name": "question decomposition", + "relation_name": "", + "weight": 7.0, + "description": "complex queries often require question decomposition", + "source_ids": [ + 19 + ], + "source": "Name: question decomposition\nType: METHOD_OR_TECHNIQUE", + "target": "Name: complex queries\nType: UNKNOWN" + }, + { + "src_entity_name": "simple queries", + "tgt_entity_name": "question decomposition", + "relation_name": "", + "weight": 7.0, + "description": "simple queries do not require question decomposition", + "source_ids": [ + 19 + ], + "source": "Name: question decomposition\nType: METHOD_OR_TECHNIQUE", + "target": "Name: simple queries\nType: UNKNOWN" + }, + { + "src_entity_name": "method s", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 10.0, + "description": "method s maps a structured document to a final answer", + "source_ids": [ 
+ 37 + ], + "source": "Name: document\nType: PRODUCT", + "target": "Name: method s\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "document", + "tgt_entity_name": "pages", + "relation_name": "", + "weight": 10.0, + "description": "a document is represented as a sequence of pages", + "source_ids": [ + 37 + ], + "source": "Name: document\nType: PRODUCT", + "target": "Name: pages\nType: MEASUREMENT" + }, + { + "src_entity_name": "document", + "tgt_entity_name": "content blocks", + "relation_name": "", + "weight": 10.0, + "description": "pages in a document collectively contain a sequence of content blocks", + "source_ids": [ + 37 + ], + "source": "Name: document\nType: PRODUCT", + "target": "Name: content blocks\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "document", + "tgt_entity_name": "n", + "relation_name": "", + "weight": 9.0, + "description": "n defines the sequence length of pages in the document", + "source_ids": [ + 37 + ], + "source": "Name: document\nType: PRODUCT", + "target": "Name: n\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "document", + "tgt_entity_name": "m", + "relation_name": "", + "weight": 9.0, + "description": "m defines the sequence length of content blocks in the document", + "source_ids": [ + 37 + ], + "source": "Name: document\nType: PRODUCT", + "target": "Name: m\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "document", + "tgt_entity_name": "p", + "relation_name": "", + "weight": 9.0, + "description": "p represents an individual page within the document sequence", + "source_ids": [ + 37 + ], + "source": "Name: document\nType: PRODUCT", + "target": "Name: p\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "document", + "tgt_entity_name": "b", + "relation_name": "", + "weight": 9.0, + "description": "b represents an individual content block within the document", + "source_ids": [ + 37 + ], + "source": "Name: document\nType: PRODUCT", + "target": "Name: b\nType: 
PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "document", + "tgt_entity_name": "d", + "relation_name": "", + "weight": 10.0, + "description": "d is the variable symbol for the document", + "source_ids": [ + 37 + ], + "source": "Name: document\nType: PRODUCT", + "target": "Name: d\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 9.0, + "description": "bookindex is designed to operate on complex documents to capture their internal structures", + "source_ids": [ + 47 + ], + "source": "Name: document\nType: PRODUCT", + "target": "Name: bookindex\nType: PRODUCT" + }, + { + "src_entity_name": "tree structure", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 9.0, + "description": "the tree structure is derived from the document s explicit logical hierarchy", + "source_ids": [ + 51 + ], + "source": "Name: document\nType: PRODUCT", + "target": "Name: tree structure\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "knowledge graph", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 9.0, + "description": "the knowledge graph captures entities and relations scattered throughout the document", + "source_ids": [ + 51 + ], + "source": "Name: document\nType: PRODUCT", + "target": "Name: knowledge graph\nType: SOFTWARE" + }, + { + "src_entity_name": "titles", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 8.0, + "description": "titles are part of the document s explicit logical hierarchy", + "source_ids": [ + 51 + ], + "source": "Name: document\nType: PRODUCT", + "target": "Name: titles\nType: SECTION_TITLE" + }, + { + "src_entity_name": "sections", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 8.0, + "description": "sections are part of the document s explicit logical hierarchy", + "source_ids": [ + 51 + ], + "source": "Name: document\nType: PRODUCT", + "target": "Name: sections\nType: 
SECTION_TITLE" + }, + { + "src_entity_name": "tables", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 8.0, + "description": "tables are part of the document s explicit logical hierarchy", + "source_ids": [ + 51 + ], + "source": "Name: document\nType: PRODUCT", + "target": "Name: tables\nType: TABLE" + }, + { + "src_entity_name": "tree component", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 9.0, + "description": "the tree component organizes the document into a hierarchical structure", + "source_ids": [ + 52 + ], + "source": "Name: document\nType: PRODUCT", + "target": "Name: tree component\nType: SOFTWARE" + }, + { + "src_entity_name": "tables", + "tgt_entity_name": "section", + "relation_name": "", + "weight": 9.0, + "description": "tables are nested within a specific section of the document", + "source_ids": [ + 19 + ], + "source": "Name: tables\nType: TABLE", + "target": "Name: section\nType: SECTION_TITLE" + }, + { + "src_entity_name": "tree structure", + "tgt_entity_name": "tables", + "relation_name": "", + "weight": 9.0, + "description": "tables are examples of nodes included in the tree structure", + "source_ids": [ + 51 + ], + "source": "Name: tables\nType: TABLE", + "target": "Name: tree structure\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "tables", + "tgt_entity_name": "images", + "relation_name": "", + "weight": 6.0, + "description": "both are types of pdf blocks manually labeled to establish ground truth", + "source_ids": [ + 144 + ], + "source": "Name: tables\nType: TABLE", + "target": "Name: images\nType: TABLE" + }, + { + "src_entity_name": "section", + "tgt_entity_name": "research problem", + "relation_name": "", + "weight": 9.0, + "description": "the section is the location where the research problem is formalized", + "source_ids": [ + 35 + ], + "source": "Name: section\nType: SECTION_TITLE", + "target": "Name: research problem\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "section", + 
"tgt_entity_name": "general workflow", + "relation_name": "", + "weight": 9.0, + "description": "the section is the location where the general workflow is reviewed", + "source_ids": [ + 35 + ], + "source": "Name: section\nType: SECTION_TITLE", + "target": "Name: general workflow\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "filters", + "tgt_entity_name": "section", + "relation_name": "", + "weight": 8.0, + "description": "filters can be of type section to target structural parts like chapters or appendices", + "source_ids": [ + 258 + ], + "source": "Name: section\nType: SECTION_TITLE", + "target": "Name: filters\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "section", + "tgt_entity_name": "chapter", + "relation_name": "", + "weight": 9.0, + "description": "chapters are examples of sections", + "source_ids": [ + 258 + ], + "source": "Name: section\nType: SECTION_TITLE", + "target": "Name: chapter\nType: SECTION_TITLE" + }, + { + "src_entity_name": "section", + "tgt_entity_name": "appendices", + "relation_name": "", + "weight": 9.0, + "description": "appendices are examples of sections", + "source_ids": [ + 258 + ], + "source": "Name: section\nType: SECTION_TITLE", + "target": "Name: appendices\nType: SECTION_TITLE" + }, + { + "src_entity_name": "section", + "tgt_entity_name": "references", + "relation_name": "", + "weight": 9.0, + "description": "references are examples of sections", + "source_ids": [ + 258 + ], + "source": "Name: section\nType: SECTION_TITLE", + "target": "Name: references\nType: SECTION_TITLE" + }, + { + "src_entity_name": "user queries", + "tgt_entity_name": "keyword lookups", + "relation_name": "", + "weight": 8.0, + "description": "keyword lookups are a type of user query mentioned in the text", + "source_ids": [ + 19 + ], + "source": "Name: user queries\nType: TASK_OR_PROBLEM", + "target": "Name: keyword lookups\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "user queries", + "tgt_entity_name": "multi hop questions", + 
"relation_name": "", + "weight": 8.0, + "description": "multi hop questions are a type of user query mentioned in the text", + "source_ids": [ + 19 + ], + "source": "Name: user queries\nType: TASK_OR_PROBLEM", + "target": "Name: multi hop questions\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "user queries", + "tgt_entity_name": "retrieval workflows", + "relation_name": "", + "weight": 9.0, + "description": "user queries are classified to dynamically generate tailored retrieval workflows", + "source_ids": [ + 22 + ], + "source": "Name: user queries\nType: TASK_OR_PROBLEM", + "target": "Name: retrieval workflows\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "user queries", + "relation_name": "", + "weight": 10.0, + "description": "the agent classifies user queries based on their intent and complexity", + "source_ids": [ + 22 + ], + "source": "Name: user queries\nType: TASK_OR_PROBLEM", + "target": "Name: agent\nType: UNKNOWN" + }, + { + "src_entity_name": "multi hop questions", + "tgt_entity_name": "evidence", + "relation_name": "", + "weight": 9.0, + "description": "multi hop questions require synthesizing evidence scattered across the document", + "source_ids": [ + 19 + ], + "source": "Name: multi hop questions\nType: TASK_OR_PROBLEM", + "target": "Name: evidence\nType: CONCEPT" + }, + { + "src_entity_name": "reasoner", + "tgt_entity_name": "evidence", + "relation_name": "", + "weight": 9.0, + "description": "the reasoner locates highly relevant evidence", + "source_ids": [ + 22 + ], + "source": "Name: evidence\nType: CONCEPT", + "target": "Name: reasoner\nType: SOFTWARE" + }, + { + "src_entity_name": "multi hop reasoning", + "tgt_entity_name": "kg", + "relation_name": "", + "weight": 10.0, + "description": "multi hop reasoning relies on a high quality kg for its execution", + "source_ids": [ + 21 + ], + "source": "Name: multi hop reasoning\nType: TASK_OR_PROBLEM", + "target": "Name: kg\nType: CONCEPT" + }, + { + 
"src_entity_name": "bookrag", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is built upon the document native bookindex", + "source_ids": [ + 20 + ], + "source": "Name: bookrag\nType: TECHNOLOGY", + "target": "Name: bookindex\nType: PRODUCT" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "document qa tasks", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is designed specifically for document qa tasks", + "source_ids": [ + 20 + ], + "source": "Name: bookrag\nType: TECHNOLOGY", + "target": "Name: document qa tasks\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "relation", + "relation_name": "", + "weight": 8.0, + "description": "bookrag is designed to capture the deep connection of the relation in the document", + "source_ids": [ + 20 + ], + "source": "Name: bookrag\nType: TECHNOLOGY", + "target": "Name: relation\nType: CONCEPT" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "hierarchical tree structure", + "relation_name": "", + "weight": 9.0, + "description": "bookindex organizes information using a hierarchical tree structure to preserve logical hierarchy", + "source_ids": [ + 20 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: hierarchical tree structure\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "kg", + "relation_name": "", + "weight": 9.0, + "description": "bookindex constructs a kg to capture intricate relations within document blocks", + "source_ids": [ + 20 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: kg\nType: TECHNOLOGY" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "parsed content blocks", + "relation_name": "", + "weight": 9.0, + "description": "bookindex organizes parsed content blocks into a hierarchical tree structure", + "source_ids": [ + 20 + ], + "source": "Name: bookindex\nType: PRODUCT", + 
"target": "Name: parsed content blocks\nType: MATERIAL" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "selector", + "relation_name": "", + "weight": 8.0, + "description": "the system builds upon bookindex to implement an agent that uses selector for retrieval workflows", + "source_ids": [ + 22 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: selector\nType: SOFTWARE" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 8.0, + "description": "the system builds upon bookindex to implement an agent that uses reasoner for retrieval workflows", + "source_ids": [ + 22 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: reasoner\nType: SOFTWARE" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "hierarchical tree", + "relation_name": "", + "weight": 8.0, + "description": "the bookindex is constructed using a hierarchical tree of document layout blocks", + "source_ids": [ + 25 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: hierarchical tree\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "kg", + "relation_name": "", + "weight": 8.0, + "description": "the bookindex is constructed using a kg storing fine grained entity relations", + "source_ids": [ + 25 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: kg\nType: SOFTWARE" + }, + { + "src_entity_name": "section 4", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 10.0, + "description": "section 4 presents the structure and construction of bookindex", + "source_ids": [ + 29 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: section 4\nType: SECTION_TITLE" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "tree construction", + "relation_name": "", + "weight": 9.0, + "description": "bookindex utilizes tree construction as its first stage to parse document layout 
and establish hierarchical nodes", + "source_ids": [ + 47 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: tree construction\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "graph construction", + "relation_name": "", + "weight": 9.0, + "description": "bookindex utilizes graph construction as its second stage to extract and refine entity knowledge", + "source_ids": [ + 47 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: graph construction\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "logical hierarchy", + "relation_name": "", + "weight": 10.0, + "description": "bookindex is explicitly designed to capture the explicit logical hierarchy found in documents", + "source_ids": [ + 47 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: logical hierarchy\nType: CONCEPT" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "entity relations", + "relation_name": "", + "weight": 10.0, + "description": "bookindex is explicitly designed to capture the intricate entity relations found in documents", + "source_ids": [ + 47 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: entity relations\nType: CONCEPT" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to BookIndex", + "source_ids": [ + 49 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "tree structure", + "relation_name": "", + "weight": 10.0, + "description": "bookindex is defined as a triplet that includes the tree structure as one of its components", + "source_ids": [ + 51 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: tree structure\nType: TASK_OR_PROBLEM" + 
}, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "knowledge graph", + "relation_name": "", + "weight": 10.0, + "description": "bookindex is defined as a triplet that includes the knowledge graph as one of its components", + "source_ids": [ + 51 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: knowledge graph\nType: SOFTWARE" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "graph tree link", + "relation_name": "", + "weight": 10.0, + "description": "bookindex is defined as a triplet that includes the graph tree link as one of its components", + "source_ids": [ + 51 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: graph tree link\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "", + "relation_name": "", + "weight": 10.0, + "description": "is the first component of the bookindex triplet definition", + "source_ids": [ + 51 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: \nType: UNKNOWN" + }, + { + "src_entity_name": "figure 2", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 10.0, + "description": "figure 2 provides an example of the bookindex", + "source_ids": [ + 52 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: figure 2\nType: IMAGE" + }, + { + "src_entity_name": "tree component", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "the tree component is a part of the bookindex", + "source_ids": [ + 52 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: tree component\nType: SOFTWARE" + }, + { + "src_entity_name": "graph component", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "the graph component is a part of the bookindex", + "source_ids": [ + 52 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: graph component\nType: SOFTWARE" + }, + { + "src_entity_name": "gt 
link", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 10.0, + "description": "gt link is formalized to complete the bookindex", + "source_ids": [ + 77 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: gt link\nType: TECHNOLOGY" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "g", + "relation_name": "", + "weight": 7.0, + "description": "g is a component of the bookindex structure b", + "source_ids": [ + 77 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: g\nType: CONCEPT" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "t", + "relation_name": "", + "weight": 7.0, + "description": "t is a component of the bookindex structure b", + "source_ids": [ + 77 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: t\nType: CONCEPT" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "m", + "relation_name": "", + "weight": 7.0, + "description": "m is a component of the bookindex structure b", + "source_ids": [ + 77 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: m\nType: UNKNOWN" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "v i", + "relation_name": "", + "weight": 6.0, + "description": "the bookindex structure b involves the recording of origin nodes for entities like v i", + "source_ids": [ + 77 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: v i\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "bookindex operators", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 10.0, + "description": "bookindex operators are designed specifically for the bookindex system", + "source_ids": [ + 97 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: bookindex operators\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "selector operators filter content 
ranges directly from the bookindex", + "source_ids": [ + 102 + ], + "source": "Name: bookindex\nType: PRODUCT", + "target": "Name: selector\nType: TECHNOLOGY" + }, + { + "src_entity_name": "document qa tasks", + "tgt_entity_name": "accuracy", + "relation_name": "", + "weight": 7.0, + "description": "accuracy is measured specifically on document qa tasks", + "source_ids": [ + 137 + ], + "source": "Name: document qa tasks\nType: TASK_OR_PROBLEM", + "target": "Name: accuracy\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "hierarchical tree structure", + "tgt_entity_name": "table of contents", + "relation_name": "", + "weight": 8.0, + "description": "the hierarchical tree structure serves the role of the document s table of contents", + "source_ids": [ + 20 + ], + "source": "Name: hierarchical tree structure\nType: METHOD_OR_TECHNIQUE", + "target": "Name: table of contents\nType: PRODUCT" + }, + { + "src_entity_name": "kg", + "tgt_entity_name": "hierarchical tree structure", + "relation_name": "", + "weight": 8.0, + "description": "the kg entities are mapped to their corresponding tree nodes to unify the two structures", + "source_ids": [ + 20 + ], + "source": "Name: hierarchical tree structure\nType: METHOD_OR_TECHNIQUE", + "target": "Name: kg\nType: TECHNOLOGY" + }, + { + "src_entity_name": "kg", + "tgt_entity_name": "fine grained entities", + "relation_name": "", + "weight": 9.0, + "description": "the kg is constructed containing fine grained entities to capture intricate relations", + "source_ids": [ + 20 + ], + "source": "Name: kg\nType: TECHNOLOGY", + "target": "Name: fine grained entities\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "kg", + "tgt_entity_name": "tree nodes", + "relation_name": "", + "weight": 9.0, + "description": "kg entities are mapped to their corresponding tree nodes to unify the structures", + "source_ids": [ + 20 + ], + "source": "Name: kg\nType: TECHNOLOGY", + "target": "Name: tree nodes\nType: PRODUCT" + }, + { + 
"src_entity_name": "parsed content blocks", + "tgt_entity_name": "tree nodes", + "relation_name": "", + "weight": 7.0, + "description": "parsed content blocks are organized into the hierarchical tree structure which consists of tree nodes", + "source_ids": [ + 20 + ], + "source": "Name: parsed content blocks\nType: MATERIAL", + "target": "Name: tree nodes\nType: PRODUCT" + }, + { + "src_entity_name": "kg", + "tgt_entity_name": "gradient based entity resolution method", + "relation_name": "", + "weight": 9.0, + "description": "the gradient based entity resolution method is proposed to ensure the high quality of the kg by resolving entity ambiguity", + "source_ids": [ + 21 + ], + "source": "Name: kg\nType: CONCEPT", + "target": "Name: gradient based entity resolution method\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "kg", + "tgt_entity_name": "entity ambiguity", + "relation_name": "", + "weight": 9.0, + "description": "entity ambiguity compromises the quality of the kg", + "source_ids": [ + 21 + ], + "source": "Name: kg\nType: CONCEPT", + "target": "Name: entity ambiguity\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "large language model", + "relation_name": "", + "weight": 8.0, + "description": "llm and large language model are cited as examples of distinct entities that cause ambiguity in the kg", + "source_ids": [ + 21 + ], + "source": "Name: llm\nType: PRODUCT", + "target": "Name: large language model\nType: PRODUCT" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "entity ambiguity", + "relation_name": "", + "weight": 7.0, + "description": "llm is an example of a name that contributes to entity ambiguity", + "source_ids": [ + 21 + ], + "source": "Name: llm\nType: PRODUCT", + "target": "Name: entity ambiguity\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "large language model", + "tgt_entity_name": "entity ambiguity", + "relation_name": "", + "weight": 7.0, + "description": "large language model 
is an example of a name that contributes to entity ambiguity", + "source_ids": [ + 21 + ], + "source": "Name: large language model\nType: PRODUCT", + "target": "Name: entity ambiguity\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "gradient based entity resolution method", + "tgt_entity_name": "similarity distribution", + "relation_name": "", + "weight": 9.0, + "description": "the method analyzes the similarity distribution of candidate entities to function", + "source_ids": [ + 21 + ], + "source": "Name: gradient based entity resolution method\nType: METHOD_OR_TECHNIQUE", + "target": "Name: similarity distribution\nType: CONCEPT" + }, + { + "src_entity_name": "gradient based entity resolution method", + "tgt_entity_name": "candidate entities", + "relation_name": "", + "weight": 9.0, + "description": "the method analyzes candidate entities to identify sharp drops in similarity scores", + "source_ids": [ + 21 + ], + "source": "Name: gradient based entity resolution method\nType: METHOD_OR_TECHNIQUE", + "target": "Name: candidate entities\nType: CONCEPT" + }, + { + "src_entity_name": "gradient based entity resolution method", + "tgt_entity_name": "coreferent entities", + "relation_name": "", + "weight": 10.0, + "description": "the method distinguishes and merges coreferent entities", + "source_ids": [ + 21 + ], + "source": "Name: gradient based entity resolution method\nType: METHOD_OR_TECHNIQUE", + "target": "Name: coreferent entities\nType: CONCEPT" + }, + { + "src_entity_name": "gradient based entity resolution method", + "tgt_entity_name": "graph connectivity", + "relation_name": "", + "weight": 8.0, + "description": "the method ensures graph connectivity by resolving entity ambiguity", + "source_ids": [ + 21 + ], + "source": "Name: gradient based entity resolution method\nType: METHOD_OR_TECHNIQUE", + "target": "Name: graph connectivity\nType: CONCEPT" + }, + { + "src_entity_name": "gradient based entity resolution method", + "tgt_entity_name": "reasoning 
capabilities", + "relation_name": "", + "weight": 8.0, + "description": "the method enhances reasoning capabilities by improving the kg", + "source_ids": [ + 21 + ], + "source": "Name: gradient based entity resolution method\nType: METHOD_OR_TECHNIQUE", + "target": "Name: reasoning capabilities\nType: CONCEPT" + }, + { + "src_entity_name": "graph construction", + "tgt_entity_name": "gradient based entity resolution method", + "relation_name": "", + "weight": 8.0, + "description": "graph construction refines entity knowledge using the novel gradient based entity resolution method", + "source_ids": [ + 47 + ], + "source": "Name: gradient based entity resolution method\nType: METHOD_OR_TECHNIQUE", + "target": "Name: graph construction\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 8.0, + "description": "the selector operator narrows the document space which is subsequently analyzed by the reasoner operator", + "source_ids": [ + 124 + ], + "source": "Name: selector\nType: SOFTWARE", + "target": "Name: reasoner\nType: SOFTWARE" + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "search space", + "relation_name": "", + "weight": 9.0, + "description": "the selector narrows down the search space", + "source_ids": [ + 22 + ], + "source": "Name: selector\nType: SOFTWARE", + "target": "Name: search space\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "information scents", + "relation_name": "", + "weight": 9.0, + "description": "the selector uses information scents to narrow down the search space", + "source_ids": [ + 22 + ], + "source": "Name: selector\nType: SOFTWARE", + "target": "Name: information scents\nType: CONCEPT" + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "information patches", + "relation_name": "", + "weight": 10.0, + "description": "the selector operator navigates to information patches", + 
"source_ids": [ + 124 + ], + "source": "Name: selector\nType: SOFTWARE", + "target": "Name: information patches\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "document space", + "relation_name": "", + "weight": 9.0, + "description": "the selector narrows the vast document space down to relevant scopes", + "source_ids": [ + 124 + ], + "source": "Name: selector\nType: SOFTWARE", + "target": "Name: document space\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "relevant scopes", + "relation_name": "", + "weight": 9.0, + "description": "the selector narrows the document space down to relevant scopes", + "source_ids": [ + 124 + ], + "source": "Name: selector\nType: SOFTWARE", + "target": "Name: relevant scopes\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "ift inspired selector reasoner workflow", + "tgt_entity_name": "selector", + "relation_name": "", + "weight": 9.0, + "description": "the workflow uses the selector to narrow the search to a precise information patch", + "source_ids": [ + 157 + ], + "source": "Name: selector\nType: SOFTWARE", + "target": "Name: ift inspired selector reasoner workflow\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "information patch", + "relation_name": "", + "weight": 9.0, + "description": "the selector narrows the search to a precise information patch", + "source_ids": [ + 157 + ], + "source": "Name: selector\nType: SOFTWARE", + "target": "Name: information patch\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "reasoner", + "tgt_entity_name": "synthesizer", + "relation_name": "", + "weight": 8.0, + "description": "the reasoner operator refines information that is then used by the synthesizer to generate the answer", + "source_ids": [ + 124 + ], + "source": "Name: reasoner\nType: SOFTWARE", + "target": "Name: synthesizer\nType: SOFTWARE" + }, + { + "src_entity_name": "reasoner", + "tgt_entity_name": 
"information patches", + "relation_name": "", + "weight": 9.0, + "description": "the reasoner performs sensemaking within the information patches identified by the selector", + "source_ids": [ + 124 + ], + "source": "Name: reasoner\nType: SOFTWARE", + "target": "Name: information patches\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "reasoner", + "tgt_entity_name": "processed evidence", + "relation_name": "", + "weight": 9.0, + "description": "the reasoner analyzes and refines information to create processed evidence", + "source_ids": [ + 124 + ], + "source": "Name: reasoner\nType: SOFTWARE", + "target": "Name: processed evidence\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "ift inspired selector reasoner workflow", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 9.0, + "description": "the workflow uses the reasoner for analysis after the selector", + "source_ids": [ + 157 + ], + "source": "Name: reasoner\nType: SOFTWARE", + "target": "Name: ift inspired selector reasoner workflow\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "retrieval workflows", + "relation_name": "", + "weight": 10.0, + "description": "the agent dynamically generates tailored retrieval workflows", + "source_ids": [ + 22 + ], + "source": "Name: retrieval workflows\nType: TASK_OR_PROBLEM", + "target": "Name: agent\nType: UNKNOWN" + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "retrieval workflows", + "relation_name": "", + "weight": 9.0, + "description": "the agent based retrieval approach configures optimal retrieval workflows", + "source_ids": [ + 26 + ], + "source": "Name: retrieval workflows\nType: TASK_OR_PROBLEM", + "target": "Name: agent based retrieval\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "selector operators", + "tgt_entity_name": "information scents", + "relation_name": "", + "weight": 9.0, + "description": "selector operators identify relevant patches by following 
information scents", + "source_ids": [ + 125 + ], + "source": "Name: information scents\nType: CONCEPT", + "target": "Name: selector operators\nType: SOFTWARE" + }, + { + "src_entity_name": "information scents", + "tgt_entity_name": "question", + "relation_name": "", + "weight": 8.0, + "description": "information scents include key entities found in a question", + "source_ids": [ + 125 + ], + "source": "Name: information scents\nType: CONCEPT", + "target": "Name: question\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "evaluation", + "tgt_entity_name": "state of the art baselines", + "relation_name": "", + "weight": 8.0, + "description": "the evaluation involves comparing bookrag to state of the art baselines", + "source_ids": [ + 151 + ], + "source": "Name: state of the art baselines\nType: PRODUCT", + "target": "Name: evaluation\nType: EVENT" + }, + { + "src_entity_name": "hierarchical tree", + "tgt_entity_name": "document layout blocks", + "relation_name": "", + "weight": 10.0, + "description": "the hierarchical tree is composed of document layout blocks", + "source_ids": [ + 25 + ], + "source": "Name: hierarchical tree\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: document layout blocks\nType: MATERIAL" + }, + { + "src_entity_name": "kg", + "tgt_entity_name": "entity relations", + "relation_name": "", + "weight": 10.0, + "description": "the kg stores fine grained entity relations", + "source_ids": [ + 25 + ], + "source": "Name: kg\nType: SOFTWARE", + "target": "Name: entity relations\nType: CONCEPT" + }, + { + "src_entity_name": "offline indexing phase", + "tgt_entity_name": "kg", + "relation_name": "", + "weight": 9.0, + "description": "kg is a form of structured index created during the offline indexing phase", + "source_ids": [ + 45 + ], + "source": "Name: kg\nType: SOFTWARE", + "target": "Name: offline indexing phase\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "extract", + "tgt_entity_name": "kg", + "relation_name": "", + "weight": 9.0, 
+ "description": "extract links identified entities to the knowledge graph kg", + "source_ids": [ + 98 + ], + "source": "Name: kg\nType: SOFTWARE", + "target": "Name: extract\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "documents", + "relation_name": "", + "weight": 8.0, + "description": "the approach operates within documents to locate evidence", + "source_ids": [ + 26 + ], + "source": "Name: agent based retrieval\nType: TASK_OR_PROBLEM", + "target": "Name: documents\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "evidence", + "relation_name": "", + "weight": 10.0, + "description": "the goal of the approach is to locate highly relevant evidence within documents", + "source_ids": [ + 26 + ], + "source": "Name: agent based retrieval\nType: TASK_OR_PROBLEM", + "target": "Name: evidence\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "figure 3", + "tgt_entity_name": "agent based retrieval", + "relation_name": "", + "weight": 10.0, + "description": "figure 3 illustrates the workflow of agent based retrieval", + "source_ids": [ + 81 + ], + "source": "Name: agent based retrieval\nType: TASK_OR_PROBLEM", + "target": "Name: figure 3\nType: IMAGE" + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "three stage pipeline", + "relation_name": "", + "weight": 9.0, + "description": "agent based retrieval follows a three stage pipeline to address queries", + "source_ids": [ + 81 + ], + "source": "Name: agent based retrieval\nType: TASK_OR_PROBLEM", + "target": "Name: three stage pipeline\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "complex document qa tasks", + "relation_name": "", + "weight": 9.0, + "description": "mmlongbench is used for complex document qa tasks", + "source_ids": [ + 141 + ], + "source": "Name: complex document qa tasks\nType: TASK_OR_PROBLEM", + "target": "Name: 
mmlongbench\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "m3docvqa", + "tgt_entity_name": "complex document qa tasks", + "relation_name": "", + "weight": 9.0, + "description": "m3docvqa is used for complex document qa tasks", + "source_ids": [ + 141 + ], + "source": "Name: complex document qa tasks\nType: TASK_OR_PROBLEM", + "target": "Name: m3docvqa\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "qasper", + "tgt_entity_name": "complex document qa tasks", + "relation_name": "", + "weight": 9.0, + "description": "qasper is used for complex document qa tasks", + "source_ids": [ + 141 + ], + "source": "Name: complex document qa tasks\nType: TASK_OR_PROBLEM", + "target": "Name: qasper\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "table 5", + "tgt_entity_name": "complex document qa tasks", + "relation_name": "", + "weight": 10.0, + "description": "table 5 focuses on solving complex document qa tasks", + "source_ids": [ + 153 + ], + "source": "Name: complex document qa tasks\nType: TASK_OR_PROBLEM", + "target": "Name: table 5\nType: TABLE" + }, + { + "src_entity_name": "performance comparison", + "tgt_entity_name": "complex document qa tasks", + "relation_name": "", + "weight": 8.0, + "description": "the performance comparison is aimed at solving complex document qa tasks", + "source_ids": [ + 153 + ], + "source": "Name: complex document qa tasks\nType: TASK_OR_PROBLEM", + "target": "Name: performance comparison\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "different methods", + "tgt_entity_name": "complex document qa tasks", + "relation_name": "", + "weight": 9.0, + "description": "different methods are used to solve complex document qa tasks", + "source_ids": [ + 153 + ], + "source": "Name: complex document qa tasks\nType: TASK_OR_PROBLEM", + "target": "Name: different methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "datasets", + "tgt_entity_name": "complex document qa tasks", + "relation_name": "", + "weight": 
8.0, + "description": "datasets are used to evaluate methods for complex document qa tasks", + "source_ids": [ + 153 + ], + "source": "Name: complex document qa tasks\nType: TASK_OR_PROBLEM", + "target": "Name: datasets\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "extensive experiments", + "tgt_entity_name": "multiple benchmarks", + "relation_name": "", + "weight": 8.0, + "description": "extensive experiments were conducted on multiple benchmarks to validate results", + "source_ids": [ + 27 + ], + "source": "Name: extensive experiments\nType: EVENT", + "target": "Name: multiple benchmarks\nType: BENCHMARK" + }, + { + "src_entity_name": "multiple benchmarks", + "tgt_entity_name": "state of the art performance", + "relation_name": "", + "weight": 7.0, + "description": "the performance on multiple benchmarks showed state of the art results", + "source_ids": [ + 27 + ], + "source": "Name: multiple benchmarks\nType: BENCHMARK", + "target": "Name: state of the art performance\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "section 2", + "tgt_entity_name": "related work", + "relation_name": "", + "weight": 9.0, + "description": "section 2 is dedicated to reviewing related work", + "source_ids": [ + 29 + ], + "source": "Name: section 2\nType: SECTION_TITLE", + "target": "Name: related work\nType: UNKNOWN" + }, + { + "src_entity_name": "section 3", + "tgt_entity_name": "ift", + "relation_name": "", + "weight": 10.0, + "description": "section 3 introduces the ift method", + "source_ids": [ + 29 + ], + "source": "Name: section 3\nType: SECTION_TITLE", + "target": "Name: ift\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "structured execution", + "tgt_entity_name": "ift", + "relation_name": "", + "weight": 8.0, + "description": "structured execution includes the retrieval process under the principles of ift", + "source_ids": [ + 79 + ], + "source": "Name: ift\nType: METHOD_OR_TECHNIQUE", + "target": "Name: structured execution\nType: 
METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "scent filter based retrieval", + "tgt_entity_name": "ift", + "relation_name": "", + "weight": 9.0, + "description": "the scent filter based retrieval process aligns with ift", + "source_ids": [ + 125 + ], + "source": "Name: ift\nType: METHOD_OR_TECHNIQUE", + "target": "Name: scent filter based retrieval\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "section 6", + "tgt_entity_name": "experimental results", + "relation_name": "", + "weight": 10.0, + "description": "section 6 presents the experimental results and analysis", + "source_ids": [ + 29 + ], + "source": "Name: section 6\nType: SECTION_TITLE", + "target": "Name: experimental results\nType: UNKNOWN" + }, + { + "src_entity_name": "section 7", + "tgt_entity_name": "conclusion", + "relation_name": "", + "weight": 10.0, + "description": "section 7 concludes the paper", + "source_ids": [ + 29 + ], + "source": "Name: section 7\nType: SECTION_TITLE", + "target": "Name: conclusion\nType: UNKNOWN" + }, + { + "src_entity_name": "section 5", + "tgt_entity_name": "query classification", + "relation_name": "", + "weight": 10.0, + "description": "section 5 elaborates on query classification as part of agent based retrieval", + "source_ids": [ + 29 + ], + "source": "Name: section 5\nType: SECTION_TITLE", + "target": "Name: query classification\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "section 5", + "tgt_entity_name": "operators", + "relation_name": "", + "weight": 10.0, + "description": "section 5 describes the operators used in the structured execution of bookrag", + "source_ids": [ + 29 + ], + "source": "Name: section 5\nType: SECTION_TITLE", + "target": "Name: operators\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "section 5", + "tgt_entity_name": "structured execution", + "relation_name": "", + "weight": 10.0, + "description": "section 5 presents the structured execution of bookrag", + "source_ids": [ + 29 + ], + "source": "Name: 
section 5\nType: SECTION_TITLE", + "target": "Name: structured execution\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "classification plan", + "tgt_entity_name": "query classification", + "relation_name": "", + "weight": 9.0, + "description": "classification plan performs query classification to distinguish query types", + "source_ids": [ + 82 + ], + "source": "Name: query classification\nType: METHOD_OR_TECHNIQUE", + "target": "Name: classification plan\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "query classification", + "tgt_entity_name": "operators plan", + "relation_name": "", + "weight": 8.0, + "description": "the operators plan is generated based on the results of query classification", + "source_ids": [ + 82 + ], + "source": "Name: query classification\nType: METHOD_OR_TECHNIQUE", + "target": "Name: operators plan\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "retrieval-augmented generation", + "tgt_entity_name": "2 related work", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Retrieval-Augmented Generation' is a primary topic reviewed in section 2.", + "source_ids": [ + 30 + ], + "source": "Name: 2 related work\nType: SECTION_TITLE", + "target": "Name: retrieval-augmented generation\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "hierarchical document structures", + "tgt_entity_name": "2 related work", + "relation_name": "", + "weight": 10.0, + "description": "The challenge of 'Hierarchical Document Structures' is a primary topic reviewed in section 2.", + "source_ids": [ + 30 + ], + "source": "Name: 2 related work\nType: SECTION_TITLE", + "target": "Name: hierarchical document structures\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "rag approaches", + "relation_name": "", + "weight": 8.0, + "description": "both llm and rag approaches are reviewed together as related works in document analysis", + "source_ids": [ + 31 + ], + "source": "Name: 
llm\nType: TECHNOLOGY", + "target": "Name: rag approaches\nType: TECHNOLOGY" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "document analysis", + "relation_name": "", + "weight": 9.0, + "description": "llm is used in the field of document analysis", + "source_ids": [ + 31 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: document analysis\nType: RESEARCH_FIELD" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "related works", + "relation_name": "", + "weight": 8.0, + "description": "llm is reviewed within the related works section", + "source_ids": [ + 31 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: related works\nType: SECTION_TITLE" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "html", + "relation_name": "", + "weight": 9.0, + "description": "llms are used to convert html documents into structured formats", + "source_ids": [ + 32 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: html\nType: FILE_TYPE" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "pdf", + "relation_name": "", + "weight": 9.0, + "description": "llms are used to convert pdf documents into structured formats", + "source_ids": [ + 32 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: pdf\nType: FILE_TYPE" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "raw text", + "relation_name": "", + "weight": 9.0, + "description": "llms are used to convert raw text documents into structured formats", + "source_ids": [ + 32 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: raw text\nType: FILE_TYPE" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "relational tables", + "relation_name": "", + "weight": 9.0, + "description": "llms facilitate the conversion of unstructured documents into relational tables", + "source_ids": [ + 32 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: relational tables\nType: PRODUCT" + }, + { + "src_entity_name": 
"evaporate", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 10.0, + "description": "evaporate utilizes llms to synthesize extraction code", + "source_ids": [ + 32 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: evaporate\nType: SOFTWARE" + }, + { + "src_entity_name": "lotus", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 10.0, + "description": "lotus uses llm powered predicates to execute queries", + "source_ids": [ + 32 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: lotus\nType: SOFTWARE" + }, + { + "src_entity_name": "document pages", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 8.0, + "description": "research proposes using llms to analyze document pages viewed as images", + "source_ids": [ + 32 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: document pages\nType: IMAGE" + }, + { + "src_entity_name": "gradient based er algorithm", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "the gradient based er algorithm isolates a set of entities which is subsequently processed by an llm for finer grained distinction", + "source_ids": [ + 74 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: gradient based er algorithm\nType: TECHNOLOGY" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "case a", + "relation_name": "", + "weight": 9.0, + "description": "the llm is used to differentiate the identified set from the no gradient scenario of case a", + "source_ids": [ + 74 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: case a\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "similar entities", + "relation_name": "", + "weight": 9.0, + "description": "the llm is utilized to distinguish between multiple similar entities identified within the set", + "source_ids": [ + 74 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: similar entities\nType: 
DATASET_OR_CORPUS" + }, + { + "src_entity_name": "decompose", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 8.0, + "description": "decompose employs an llm to perform its function", + "source_ids": [ + 98 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: decompose\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "extract", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 8.0, + "description": "extract employs an llm to perform its function", + "source_ids": [ + 98 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: extract\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "formulator", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "formulators are defined as llm based operators", + "source_ids": [ + 98 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: formulator\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "s target", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 8.0, + "description": "s target includes sections selected by the llm", + "source_ids": [ + 104 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: s target\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "treetraverse", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 7.0, + "description": "treetraverse uses an llm to navigate the document s tree structure", + "source_ids": [ + 148 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: treetraverse\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "accuracy inclusion based", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "accuracy inclusion based is utilized to account for the uncontrollable nature of llm generation", + "source_ids": [ + 227 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: accuracy inclusion based\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "llm", + 
"tgt_entity_name": "large language model", + "relation_name": "", + "weight": 10.0, + "description": "llm is a direct abbreviation for large language model", + "source_ids": [ + 267 + ], + "source": "Name: llm\nType: TECHNOLOGY", + "target": "Name: large language model\nType: TECHNOLOGY" + }, + { + "src_entity_name": "rag approaches", + "tgt_entity_name": "document analysis", + "relation_name": "", + "weight": 9.0, + "description": "rag approaches are used in the field of document analysis", + "source_ids": [ + 31 + ], + "source": "Name: rag approaches\nType: TECHNOLOGY", + "target": "Name: document analysis\nType: RESEARCH_FIELD" + }, + { + "src_entity_name": "rag approaches", + "tgt_entity_name": "related works", + "relation_name": "", + "weight": 8.0, + "description": "rag approaches are reviewed within the related works section", + "source_ids": [ + 31 + ], + "source": "Name: rag approaches\nType: TECHNOLOGY", + "target": "Name: related works\nType: SECTION_TITLE" + }, + { + "src_entity_name": "ground truth block", + "tgt_entity_name": "pdf", + "relation_name": "", + "weight": 9.0, + "description": "a ground truth block can be lost due to pdf parsing failures", + "source_ids": [ + 236 + ], + "source": "Name: pdf\nType: FILE_TYPE", + "target": "Name: ground truth block\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "evaporate", + "tgt_entity_name": "web documents", + "relation_name": "", + "weight": 9.0, + "description": "evaporate converts semi structured web documents into structured databases", + "source_ids": [ + 32 + ], + "source": "Name: evaporate\nType: SOFTWARE", + "target": "Name: web documents\nType: PRODUCT" + }, + { + "src_entity_name": "evaporate", + "tgt_entity_name": "semi structured web documents", + "relation_name": "", + "weight": 9.0, + "description": "evaporate converts semi structured web documents", + "source_ids": [ + 32 + ], + "source": "Name: evaporate\nType: SOFTWARE", + "target": "Name: semi structured web documents\nType: 
PRODUCT" + }, + { + "src_entity_name": "evaporate", + "tgt_entity_name": "structured databases", + "relation_name": "", + "weight": 9.0, + "description": "evaporate converts documents into structured databases", + "source_ids": [ + 32 + ], + "source": "Name: evaporate\nType: SOFTWARE", + "target": "Name: structured databases\nType: PRODUCT" + }, + { + "src_entity_name": "evaporate", + "tgt_entity_name": "manual annotation", + "relation_name": "", + "weight": 8.0, + "description": "evaporate avoids the need for heavy manual annotation", + "source_ids": [ + 32 + ], + "source": "Name: evaporate\nType: SOFTWARE", + "target": "Name: manual annotation\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "lotus", + "tgt_entity_name": "sql", + "relation_name": "", + "weight": 8.0, + "description": "lotus allows users to execute sql like queries", + "source_ids": [ + 32 + ], + "source": "Name: lotus\nType: SOFTWARE", + "target": "Name: sql\nType: PROGRAMMING_LANGUAGE" + }, + { + "src_entity_name": "lotus", + "tgt_entity_name": "unstructured text corpora", + "relation_name": "", + "weight": 9.0, + "description": "lotus allows queries to be executed over unstructured text corpora", + "source_ids": [ + 32 + ], + "source": "Name: lotus\nType: SOFTWARE", + "target": "Name: unstructured text corpora\nType: UNKNOWN" + }, + { + "src_entity_name": "lotus", + "tgt_entity_name": "semantic operators", + "relation_name": "", + "weight": 10.0, + "description": "lotus extends the relational model with semantic operators", + "source_ids": [ + 32 + ], + "source": "Name: lotus\nType: SOFTWARE", + "target": "Name: semantic operators\nType: TECHNOLOGY" + }, + { + "src_entity_name": "lotus", + "tgt_entity_name": "predicates", + "relation_name": "", + "weight": 9.0, + "description": "lotus uses llm powered predicates for querying", + "source_ids": [ + 32 + ], + "source": "Name: lotus\nType: SOFTWARE", + "target": "Name: predicates\nType: TECHNOLOGY" + }, + { + "src_entity_name": "document 
pages", + "tgt_entity_name": "layout", + "relation_name": "", + "weight": 9.0, + "description": "document pages are viewed as images to preserve critical layout information", + "source_ids": [ + 32 + ], + "source": "Name: document pages\nType: IMAGE", + "target": "Name: layout\nType: CONCEPT" + }, + { + "src_entity_name": "document pages", + "tgt_entity_name": "visual information", + "relation_name": "", + "weight": 9.0, + "description": "document pages are viewed as images to preserve critical visual information", + "source_ids": [ + 32 + ], + "source": "Name: document pages\nType: IMAGE", + "target": "Name: visual information\nType: CONCEPT" + }, + { + "src_entity_name": "predicates", + "tgt_entity_name": "filter", + "relation_name": "", + "weight": 8.0, + "description": "filter is an example of a predicate used in lotus", + "source_ids": [ + 32 + ], + "source": "Name: predicates\nType: TECHNOLOGY", + "target": "Name: filter\nType: TECHNOLOGY" + }, + { + "src_entity_name": "predicates", + "tgt_entity_name": "join", + "relation_name": "", + "weight": 8.0, + "description": "join is an example of a predicate used in lotus", + "source_ids": [ + 32 + ], + "source": "Name: predicates\nType: TECHNOLOGY", + "target": "Name: join\nType: TECHNOLOGY" + }, + { + "src_entity_name": "rag approaches", + "tgt_entity_name": "open ended question answering", + "relation_name": "", + "weight": 9.0, + "description": "rag approaches excel in open ended question answering", + "source_ids": [ + 33 + ], + "source": "Name: rag approaches\nType: METHOD_OR_TECHNIQUE", + "target": "Name: open ended question answering\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "rag approaches", + "tgt_entity_name": "programming context", + "relation_name": "", + "weight": 9.0, + "description": "rag approaches excel in programming context tasks", + "source_ids": [ + 33 + ], + "source": "Name: rag approaches\nType: METHOD_OR_TECHNIQUE", + "target": "Name: programming context\nType: TASK_OR_PROBLEM" + 
}, + { + "src_entity_name": "rag approaches", + "tgt_entity_name": "sql rewrite", + "relation_name": "", + "weight": 9.0, + "description": "rag approaches excel in sql rewrite tasks", + "source_ids": [ + 33 + ], + "source": "Name: rag approaches\nType: METHOD_OR_TECHNIQUE", + "target": "Name: sql rewrite\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "rag approaches", + "tgt_entity_name": "data cleaning", + "relation_name": "", + "weight": 9.0, + "description": "rag approaches excel in data cleaning tasks", + "source_ids": [ + 33 + ], + "source": "Name: rag approaches\nType: METHOD_OR_TECHNIQUE", + "target": "Name: data cleaning\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "rag approaches", + "tgt_entity_name": "graph structures", + "relation_name": "", + "weight": 9.0, + "description": "many rag approaches have adopted graph structures to organize information", + "source_ids": [ + 33 + ], + "source": "Name: rag approaches\nType: METHOD_OR_TECHNIQUE", + "target": "Name: graph structures\nType: TECHNOLOGY" + }, + { + "src_entity_name": "naive rag technique", + "tgt_entity_name": "external knowledge bases", + "relation_name": "", + "weight": 8.0, + "description": "the naive rag technique retrieves query relevant contexts from external knowledge bases", + "source_ids": [ + 33 + ], + "source": "Name: naive rag technique\nType: METHOD_OR_TECHNIQUE", + "target": "Name: external knowledge bases\nType: TECHNOLOGY" + }, + { + "src_entity_name": "graph structures", + "tgt_entity_name": "documents", + "relation_name": "", + "weight": 8.0, + "description": "graph structures organize information and relationships within documents", + "source_ids": [ + 33 + ], + "source": "Name: graph structures\nType: TECHNOLOGY", + "target": "Name: documents\nType: UNKNOWN" + }, + { + "src_entity_name": "agentic rag paradigm", + "tgt_entity_name": "autonomous agents", + "relation_name": "", + "weight": 10.0, + "description": "the agentic rag paradigm employs autonomous agents 
to orchestrate the pipeline", + "source_ids": [ + 33 + ], + "source": "Name: agentic rag paradigm\nType: METHOD_OR_TECHNIQUE", + "target": "Name: autonomous agents\nType: TECHNOLOGY" + }, + { + "src_entity_name": "agentic rag paradigm", + "tgt_entity_name": "rag pipeline", + "relation_name": "", + "weight": 10.0, + "description": "the agentic rag paradigm dynamically orchestrates and refines the rag pipeline", + "source_ids": [ + 33 + ], + "source": "Name: agentic rag paradigm\nType: METHOD_OR_TECHNIQUE", + "target": "Name: rag pipeline\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agentic rag paradigm", + "tgt_entity_name": "reasoning robustness", + "relation_name": "", + "weight": 9.0, + "description": "the agentic rag paradigm significantly boosts reasoning robustness", + "source_ids": [ + 33 + ], + "source": "Name: agentic rag paradigm\nType: METHOD_OR_TECHNIQUE", + "target": "Name: reasoning robustness\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "agentic rag paradigm", + "tgt_entity_name": "generation fidelity", + "relation_name": "", + "weight": 9.0, + "description": "the agentic rag paradigm significantly boosts generation fidelity", + "source_ids": [ + 33 + ], + "source": "Name: agentic rag paradigm\nType: METHOD_OR_TECHNIQUE", + "target": "Name: generation fidelity\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "rag systems", + "tgt_entity_name": "3.3 rag workflow", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'RAG systems' is the primary topic and subject matter of section 3.3.", + "source_ids": [ + 44 + ], + "source": "Name: rag systems\nType: TECHNOLOGY", + "target": "Name: 3.3 rag workflow\nType: SECTION_TITLE" + }, + { + "src_entity_name": "research problem", + "tgt_entity_name": "general workflow", + "relation_name": "", + "weight": 7.0, + "description": "both the research problem and the general workflow are discussed within the same section", + "source_ids": [ + 35 + ], + "source": "Name: 
research problem\nType: TASK_OR_PROBLEM", + "target": "Name: general workflow\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "method s", + "tgt_entity_name": "user query", + "relation_name": "", + "weight": 10.0, + "description": "method s maps a user query to a final answer", + "source_ids": [ + 37 + ], + "source": "Name: user query\nType: TASK_OR_PROBLEM", + "target": "Name: method s\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "user query", + "tgt_entity_name": "q", + "relation_name": "", + "weight": 10.0, + "description": "q is the variable symbol for the user query", + "source_ids": [ + 37 + ], + "source": "Name: user query\nType: TASK_OR_PROBLEM", + "target": "Name: q\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "online retrieval phase", + "tgt_entity_name": "user query", + "relation_name": "", + "weight": 10.0, + "description": "the online retrieval phase uses the user query to retrieve relevant components", + "source_ids": [ + 45 + ], + "source": "Name: user query\nType: TASK_OR_PROBLEM", + "target": "Name: online retrieval phase\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "answer", + "tgt_entity_name": "evidence blocks", + "relation_name": "", + "weight": 9.0, + "description": "an answer is ideally grounded in a specific set of evidence blocks", + "source_ids": [ + 37 + ], + "source": "Name: answer\nType: TASK_OR_PROBLEM", + "target": "Name: evidence blocks\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "method s", + "tgt_entity_name": "answer", + "relation_name": "", + "weight": 10.0, + "description": "method s produces the final answer", + "source_ids": [ + 37 + ], + "source": "Name: answer\nType: TASK_OR_PROBLEM", + "target": "Name: method s\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "answer", + "tgt_entity_name": "a", + "relation_name": "", + "weight": 10.0, + "description": "a is the variable symbol for the answer", + "source_ids": [ + 37 + ], + "source": "Name: answer\nType: 
TASK_OR_PROBLEM", + "target": "Name: a\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "cref='#/texts/89'", + "tgt_entity_name": "answer", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/89' related to Answer", + "source_ids": [ + 84 + ], + "source": "Name: answer\nType: TASK_OR_PROBLEM", + "target": "Name: cref='#/texts/89'\nType: IMAGE" + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "answer", + "relation_name": "", + "weight": 10.0, + "description": "the synthesizer generates the answer", + "source_ids": [ + 124 + ], + "source": "Name: answer\nType: TASK_OR_PROBLEM", + "target": "Name: synthesizer\nType: SOFTWARE" + }, + { + "src_entity_name": "processed evidence", + "tgt_entity_name": "answer", + "relation_name": "", + "weight": 8.0, + "description": "processed evidence is used to generate the answer", + "source_ids": [ + 124 + ], + "source": "Name: answer\nType: TASK_OR_PROBLEM", + "target": "Name: processed evidence\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "evidence blocks", + "tgt_entity_name": "e", + "relation_name": "", + "weight": 10.0, + "description": "e is the variable symbol for the set of evidence blocks", + "source_ids": [ + 37 + ], + "source": "Name: evidence blocks\nType: DATASET_OR_CORPUS", + "target": "Name: e\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "method s", + "tgt_entity_name": "equation 1", + "relation_name": "", + "weight": 10.0, + "description": "method s is mathematically defined by equation 1", + "source_ids": [ + 37 + ], + "source": "Name: method s\nType: METHOD_OR_TECHNIQUE", + "target": "Name: equation 1\nType: EQUATION_OR_FORMULA" + }, + { + "src_entity_name": "pages", + "tgt_entity_name": "p", + "relation_name": "", + "weight": 10.0, + "description": "p is the variable symbol for pages", + "source_ids": [ + 37 + ], + "source": "Name: pages\nType: MEASUREMENT", + "target": "Name: p\nType: PARAMETER_OR_VARIABLE" + }, + { + 
"src_entity_name": "content blocks", + "tgt_entity_name": "text segment", + "relation_name": "", + "weight": 8.0, + "description": "a text segment is an example of a content block", + "source_ids": [ + 37 + ], + "source": "Name: content blocks\nType: DATASET_OR_CORPUS", + "target": "Name: text segment\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "content blocks", + "tgt_entity_name": "section header", + "relation_name": "", + "weight": 8.0, + "description": "a section header is an example of a content block", + "source_ids": [ + 37 + ], + "source": "Name: content blocks\nType: DATASET_OR_CORPUS", + "target": "Name: section header\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "content blocks", + "tgt_entity_name": "table", + "relation_name": "", + "weight": 8.0, + "description": "a table is an example of a content block", + "source_ids": [ + 37 + ], + "source": "Name: content blocks\nType: DATASET_OR_CORPUS", + "target": "Name: table\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "content blocks", + "tgt_entity_name": "image", + "relation_name": "", + "weight": 8.0, + "description": "an image is an example of a content block", + "source_ids": [ + 37 + ], + "source": "Name: content blocks\nType: DATASET_OR_CORPUS", + "target": "Name: image\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "content blocks", + "tgt_entity_name": "logical chapter hierarchy", + "relation_name": "", + "weight": 9.0, + "description": "content blocks are organized within a logical chapter hierarchy", + "source_ids": [ + 37 + ], + "source": "Name: content blocks\nType: DATASET_OR_CORPUS", + "target": "Name: logical chapter hierarchy\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "content blocks", + "tgt_entity_name": "b", + "relation_name": "", + "weight": 10.0, + "description": "b is the variable symbol for the sequence of content blocks", + "source_ids": [ + 37 + ], + "source": "Name: content blocks\nType: DATASET_OR_CORPUS", + "target": "Name: 
b\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "content blocks", + "tgt_entity_name": "4.2.1 layout parsing", + "relation_name": "", + "weight": 10.0, + "description": "Content Blocks represent the output entities identified and organized within section 4.2.1.", + "source_ids": [ + 55 + ], + "source": "Name: content blocks\nType: DATASET_OR_CORPUS", + "target": "Name: 4.2.1 layout parsing\nType: SECTION_TITLE" + }, + { + "src_entity_name": "tree structure", + "tgt_entity_name": "n", + "relation_name": "", + "weight": 10.0, + "description": "n represents the set of nodes contained within the tree structure", + "source_ids": [ + 51 + ], + "source": "Name: n\nType: PARAMETER_OR_VARIABLE", + "target": "Name: tree structure\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "n", + "relation_name": "", + "weight": 9.0, + "description": "the synthesizer operator takes n as an input parameter to generate the answer", + "source_ids": [ + 129 + ], + "source": "Name: n\nType: PARAMETER_OR_VARIABLE", + "target": "Name: synthesizer\nType: SOFTWARE" + }, + { + "src_entity_name": "n", + "tgt_entity_name": "15", + "relation_name": "", + "weight": 8.0, + "description": "n is a component of the equation labeled 15", + "source_ids": [ + 129 + ], + "source": "Name: n\nType: PARAMETER_OR_VARIABLE", + "target": "Name: 15\nType: EQUATION_OR_FORMULA" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "m", + "relation_name": "", + "weight": 10.0, + "description": "m is a defined component within the bookindex structure", + "source_ids": [ + 88 + ], + "source": "Name: m\nType: PARAMETER_OR_VARIABLE", + "target": "Name: bookindex\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "", + "tgt_entity_name": "p", + "relation_name": "", + "weight": 10.0, + "description": "maps entities to the power set of nodes p", + "source_ids": [ + 51 + ], + "source": "Name: p\nType: PARAMETER_OR_VARIABLE", + "target": "Name: \nType: 
UNKNOWN" + }, + { + "src_entity_name": "p", + "tgt_entity_name": "f1", + "relation_name": "", + "weight": 10.0, + "description": "p precision is a component used in the calculation of the f1 score", + "source_ids": [ + 231 + ], + "source": "Name: p\nType: PARAMETER_OR_VARIABLE", + "target": "Name: f1\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "q", + "relation_name": "", + "weight": 9.0, + "description": "the synthesizer operator takes q as an input parameter to generate the answer", + "source_ids": [ + 129 + ], + "source": "Name: q\nType: PARAMETER_OR_VARIABLE", + "target": "Name: synthesizer\nType: SOFTWARE" + }, + { + "src_entity_name": "q", + "tgt_entity_name": "15", + "relation_name": "", + "weight": 8.0, + "description": "q is a component of the equation labeled 15", + "source_ids": [ + 129 + ], + "source": "Name: q\nType: PARAMETER_OR_VARIABLE", + "target": "Name: 15\nType: EQUATION_OR_FORMULA" + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "a", + "relation_name": "", + "weight": 9.0, + "description": "the synthesizer operator produces a as its output", + "source_ids": [ + 129 + ], + "source": "Name: a\nType: PARAMETER_OR_VARIABLE", + "target": "Name: synthesizer\nType: SOFTWARE" + }, + { + "src_entity_name": "a", + "tgt_entity_name": "15", + "relation_name": "", + "weight": 8.0, + "description": "a is the subject of the equation labeled 15", + "source_ids": [ + 129 + ], + "source": "Name: a\nType: PARAMETER_OR_VARIABLE", + "target": "Name: 15\nType: EQUATION_OR_FORMULA" + }, + { + "src_entity_name": "proceedings of the acm on management of data", + "tgt_entity_name": "3", + "relation_name": "", + "weight": 10.0, + "description": "the publication issue is 3", + "source_ids": [ + 199 + ], + "source": "Name: 3\nType: MEASUREMENT", + "target": "Name: proceedings of the acm on management of data\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "page", + "tgt_entity_name": "3", + 
"relation_name": "", + "weight": 7.0, + "description": "3 is part of the page range", + "source_ids": [ + 258 + ], + "source": "Name: 3\nType: MEASUREMENT", + "target": "Name: page\nType: MEASUREMENT" + }, + { + "src_entity_name": "s", + "tgt_entity_name": "d", + "relation_name": "", + "weight": 9.0, + "description": "s must navigate the logical hierarchy of d to synthesize the response", + "source_ids": [ + 40 + ], + "source": "Name: s\nType: PERSON", + "target": "Name: d\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "information scent", + "tgt_entity_name": "handbooks", + "relation_name": "", + "weight": 7.0, + "description": "information scent cues like keywords or icons are found within sections of handbooks which act as information patches", + "source_ids": [ + 42 + ], + "source": "Name: information scent\nType: CONCEPT", + "target": "Name: handbooks\nType: PRODUCT" + }, + { + "src_entity_name": "information scent", + "tgt_entity_name": "keywords", + "relation_name": "", + "weight": 10.0, + "description": "keywords are explicitly listed as examples of information scent", + "source_ids": [ + 42 + ], + "source": "Name: information scent\nType: CONCEPT", + "target": "Name: keywords\nType: CONCEPT" + }, + { + "src_entity_name": "information scent", + "tgt_entity_name": "icons", + "relation_name": "", + "weight": 10.0, + "description": "icons are explicitly listed as examples of information scent", + "source_ids": [ + 42 + ], + "source": "Name: information scent\nType: CONCEPT", + "target": "Name: icons\nType: CONCEPT" + }, + { + "src_entity_name": "key terms", + "tgt_entity_name": "information scent", + "relation_name": "", + "weight": 10.0, + "description": "key terms act as information scent", + "source_ids": [ + 43 + ], + "source": "Name: information scent\nType: CONCEPT", + "target": "Name: key terms\nType: CONCEPT" + }, + { + "src_entity_name": "information scent", + "tgt_entity_name": "information patches", + "relation_name": "", + "weight": 9.0, + 
"description": "information scent guides experts to navigate towards information patches", + "source_ids": [ + 43 + ], + "source": "Name: information scent\nType: CONCEPT", + "target": "Name: information patches\nType: CONCEPT" + }, + { + "src_entity_name": "knowledge graph", + "tgt_entity_name": "information scent", + "relation_name": "", + "weight": 9.0, + "description": "the entities and relations in the knowledge graph act as information scent", + "source_ids": [ + 51 + ], + "source": "Name: information scent\nType: CONCEPT", + "target": "Name: knowledge graph\nType: SOFTWARE" + }, + { + "src_entity_name": "information scent", + "tgt_entity_name": "navigation", + "relation_name": "", + "weight": 8.0, + "description": "information scent guides navigation between and within information patches", + "source_ids": [ + 51 + ], + "source": "Name: information scent\nType: CONCEPT", + "target": "Name: navigation\nType: UNKNOWN" + }, + { + "src_entity_name": "information patches", + "tgt_entity_name": "sections", + "relation_name": "", + "weight": 10.0, + "description": "sections in handbooks are explicitly listed as examples of information patches", + "source_ids": [ + 42 + ], + "source": "Name: information patches\nType: CONCEPT", + "target": "Name: sections\nType: CONCEPT" + }, + { + "src_entity_name": "experts", + "tgt_entity_name": "information patches", + "relation_name": "", + "weight": 9.0, + "description": "experts navigate to and analyze content within information patches", + "source_ids": [ + 43 + ], + "source": "Name: information patches\nType: CONCEPT", + "target": "Name: experts\nType: PERSON" + }, + { + "src_entity_name": "tree structure", + "tgt_entity_name": "information patches", + "relation_name": "", + "weight": 9.0, + "description": "the hierarchical tree nodes in the tree structure serve as information patches", + "source_ids": [ + 51 + ], + "source": "Name: information patches\nType: CONCEPT", + "target": "Name: tree structure\nType: 
TASK_OR_PROBLEM" + }, + { + "src_entity_name": "experts", + "tgt_entity_name": "large technical handbook", + "relation_name": "", + "weight": 10.0, + "description": "experts seek a solution within the large technical handbook", + "source_ids": [ + 43 + ], + "source": "Name: experts\nType: PERSON", + "target": "Name: large technical handbook\nType: BOOK" + }, + { + "src_entity_name": "experts", + "tgt_entity_name": "key terms", + "relation_name": "", + "weight": 9.0, + "description": "experts extract key terms from the handbook", + "source_ids": [ + 43 + ], + "source": "Name: experts\nType: PERSON", + "target": "Name: key terms\nType: CONCEPT" + }, + { + "src_entity_name": "experts", + "tgt_entity_name": "final answer", + "relation_name": "", + "weight": 10.0, + "description": "experts formulate a final answer based on the analysis of information patches", + "source_ids": [ + 43 + ], + "source": "Name: experts\nType: PERSON", + "target": "Name: final answer\nType: CONCEPT" + }, + { + "src_entity_name": "experts", + "tgt_entity_name": "problem", + "relation_name": "", + "weight": 10.0, + "description": "experts are seeking a solution to the specific problem", + "source_ids": [ + 43 + ], + "source": "Name: experts\nType: PERSON", + "target": "Name: problem\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "experts", + "tgt_entity_name": "diverse content", + "relation_name": "", + "weight": 9.0, + "description": "experts analyze the diverse content within the information patches", + "source_ids": [ + 43 + ], + "source": "Name: experts\nType: PERSON", + "target": "Name: diverse content\nType: CONCEPT" + }, + { + "src_entity_name": "precise knowledge", + "tgt_entity_name": "final answer", + "relation_name": "", + "weight": 10.0, + "description": "precise knowledge is used to formulate the final answer", + "source_ids": [ + 43 + ], + "source": "Name: final answer\nType: CONCEPT", + "target": "Name: precise knowledge\nType: CONCEPT" + }, + { + "src_entity_name": 
"diverse content", + "tgt_entity_name": "precise knowledge", + "relation_name": "", + "weight": 9.0, + "description": "experts extract precise knowledge from the diverse content", + "source_ids": [ + 43 + ], + "source": "Name: diverse content\nType: CONCEPT", + "target": "Name: precise knowledge\nType: CONCEPT" + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "offline indexing phase", + "relation_name": "", + "weight": 10.0, + "description": "retrieval augmented generation systems operate in the offline indexing phase as their first step", + "source_ids": [ + 45 + ], + "source": "Name: retrieval augmented generation\nType: TASK_OR_PROBLEM", + "target": "Name: offline indexing phase\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "online retrieval phase", + "relation_name": "", + "weight": 10.0, + "description": "retrieval augmented generation systems operate in the online retrieval phase as their second step", + "source_ids": [ + 45 + ], + "source": "Name: retrieval augmented generation\nType: TASK_OR_PROBLEM", + "target": "Name: online retrieval phase\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "document s native tree topology", + "relation_name": "", + "weight": 8.0, + "description": "the proposed approach for retrieval augmented generation seeks to integrate retrieval structures with the document s native tree topology", + "source_ids": [ + 45 + ], + "source": "Name: retrieval augmented generation\nType: TASK_OR_PROBLEM", + "target": "Name: document s native tree topology\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 8.0, + "description": "the retrieval augmented generation approach seeks to integrate structures with the document s native topology", + "source_ids": [ + 45 + ], + "source": "Name: 
retrieval augmented generation\nType: TASK_OR_PROBLEM", + "target": "Name: document\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "offline indexing phase", + "tgt_entity_name": "vector databases", + "relation_name": "", + "weight": 9.0, + "description": "vector databases are a form of structured index created during the offline indexing phase", + "source_ids": [ + 45 + ], + "source": "Name: offline indexing phase\nType: TASK_OR_PROBLEM", + "target": "Name: vector databases\nType: SOFTWARE" + }, + { + "src_entity_name": "offline indexing phase", + "tgt_entity_name": "unstructured corpus data", + "relation_name": "", + "weight": 10.0, + "description": "the offline indexing phase organizes unstructured corpus data into a structured index", + "source_ids": [ + 45 + ], + "source": "Name: offline indexing phase\nType: TASK_OR_PROBLEM", + "target": "Name: unstructured corpus data\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "online retrieval phase", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "the online retrieval phase informs the llm s generation with retrieved components", + "source_ids": [ + 45 + ], + "source": "Name: online retrieval phase\nType: TASK_OR_PROBLEM", + "target": "Name: llm\nType: SOFTWARE" + }, + { + "src_entity_name": "online retrieval phase", + "tgt_entity_name": "text chunks", + "relation_name": "", + "weight": 9.0, + "description": "text chunks are retrieved as relevant components during the online retrieval phase", + "source_ids": [ + 45 + ], + "source": "Name: online retrieval phase\nType: TASK_OR_PROBLEM", + "target": "Name: text chunks\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "online retrieval phase", + "tgt_entity_name": "subgraphs", + "relation_name": "", + "weight": 9.0, + "description": "subgraphs are retrieved as relevant components during the online retrieval phase", + "source_ids": [ + 45 + ], + "source": "Name: online retrieval phase\nType: TASK_OR_PROBLEM", + 
"target": "Name: subgraphs\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "4.3.1 kg construction", + "relation_name": "", + "weight": 9.0, + "description": "The LLM is the method/tool utilized for extracting data from text-only nodes within this section.", + "source_ids": [ + 63 + ], + "source": "Name: llm\nType: SOFTWARE", + "target": "Name: 4.3.1 kg construction\nType: SECTION_TITLE" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 8.0, + "description": "algorithm 1 uses an llm to select v sel if multiple aliases are identified", + "source_ids": [ + 75 + ], + "source": "Name: llm\nType: SOFTWARE", + "target": "Name: algorithm 1\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "select by entity", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 9.0, + "description": "select by entity targets contiguous segments within the document", + "source_ids": [ + 104 + ], + "source": "Name: document\nType: TASK_OR_PROBLEM", + "target": "Name: select by entity\nType: TECHNOLOGY" + }, + { + "src_entity_name": "select by section", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 9.0, + "description": "select by section targets contiguous segments within the document", + "source_ids": [ + 104 + ], + "source": "Name: document\nType: TASK_OR_PROBLEM", + "target": "Name: select by section\nType: TECHNOLOGY" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "4 bookindex", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'BookIndex' is the primary subject defined and detailed in section 4.", + "source_ids": [ + 46 + ], + "source": "Name: 4 bookindex\nType: SECTION_TITLE", + "target": "Name: bookindex\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "hierarchical tree", + "tgt_entity_name": "4 bookindex", + "relation_name": "", + "weight": 9.5, + "description": "The 'hierarchical tree' is a core 
component and technique described within section 4 as part of the BookIndex implementation.", + "source_ids": [ + 46 + ], + "source": "Name: 4 bookindex\nType: SECTION_TITLE", + "target": "Name: hierarchical tree\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "graph", + "tgt_entity_name": "4 bookindex", + "relation_name": "", + "weight": 9.5, + "description": "The use of a 'graph' to capture entity relations is a key technical detail explained in section 4.", + "source_ids": [ + 46 + ], + "source": "Name: 4 bookindex\nType: SECTION_TITLE", + "target": "Name: graph\nType: TECHNOLOGY" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "4.1 overview of bookindex", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'BookIndex' is the primary topic defined and introduced in section 4.1.", + "source_ids": [ + 50 + ], + "source": "Name: bookindex\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: 4.1 overview of bookindex\nType: SECTION_TITLE" + }, + { + "src_entity_name": "tree construction", + "tgt_entity_name": "hierarchical nodes", + "relation_name": "", + "weight": 9.0, + "description": "tree construction parses document layout to establish hierarchical nodes", + "source_ids": [ + 47 + ], + "source": "Name: tree construction\nType: METHOD_OR_TECHNIQUE", + "target": "Name: hierarchical nodes\nType: CONCEPT" + }, + { + "src_entity_name": "tree construction", + "tgt_entity_name": "4.1 overview of bookindex", + "relation_name": "", + "weight": 9.5, + "description": "The method 'Tree Construction' is a key component of the overview provided in section 4.1.", + "source_ids": [ + 50 + ], + "source": "Name: tree construction\nType: METHOD_OR_TECHNIQUE", + "target": "Name: 4.1 overview of bookindex\nType: SECTION_TITLE" + }, + { + "src_entity_name": "tree construction", + "tgt_entity_name": "4.2 tree construction", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Tree Construction' is the primary topic 
and methodology detailed in section 4.2.", + "source_ids": [ + 53 + ], + "source": "Name: tree construction\nType: METHOD_OR_TECHNIQUE", + "target": "Name: 4.2 tree construction\nType: SECTION_TITLE" + }, + { + "src_entity_name": "graph construction", + "tgt_entity_name": "fine grained entity knowledge", + "relation_name": "", + "weight": 9.0, + "description": "graph construction extracts fine grained entity knowledge from tree nodes", + "source_ids": [ + 47 + ], + "source": "Name: graph construction\nType: METHOD_OR_TECHNIQUE", + "target": "Name: fine grained entity knowledge\nType: CONCEPT" + }, + { + "src_entity_name": "graph construction", + "tgt_entity_name": "hierarchical nodes", + "relation_name": "", + "weight": 8.0, + "description": "graph construction operates on the tree nodes established by tree construction to extract knowledge", + "source_ids": [ + 47 + ], + "source": "Name: graph construction\nType: METHOD_OR_TECHNIQUE", + "target": "Name: hierarchical nodes\nType: CONCEPT" + }, + { + "src_entity_name": "graph construction", + "tgt_entity_name": "4.1 overview of bookindex", + "relation_name": "", + "weight": 9.5, + "description": "The method 'Graph Construction' is a key component of the overview provided in section 4.1.", + "source_ids": [ + 50 + ], + "source": "Name: graph construction\nType: METHOD_OR_TECHNIQUE", + "target": "Name: 4.1 overview of bookindex\nType: SECTION_TITLE" + }, + { + "src_entity_name": "graph construction", + "tgt_entity_name": "4.3 graph construction", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Graph Construction' is the primary topic and subject matter of section 4.3.", + "source_ids": [ + 61 + ], + "source": "Name: graph construction\nType: METHOD_OR_TECHNIQUE", + "target": "Name: 4.3 graph construction\nType: SECTION_TITLE" + }, + { + "src_entity_name": "semantic entities", + "tgt_entity_name": "logical hierarchy", + "relation_name": "", + "weight": 8.0, + "description": "semantic entities 
are grounded within the document s logical hierarchy", + "source_ids": [ + 52 + ], + "source": "Name: logical hierarchy\nType: CONCEPT", + "target": "Name: semantic entities\nType: CONCEPT" + }, + { + "src_entity_name": "layout parsing phase", + "tgt_entity_name": "figure 2", + "relation_name": "", + "weight": 9.0, + "description": "figure 2 serves as an example for the layout parsing phase", + "source_ids": [ + 59 + ], + "source": "Name: figure 2\nType: IMAGE", + "target": "Name: layout parsing phase\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookindex construction process", + "tgt_entity_name": "tree construction", + "relation_name": "", + "weight": 10.0, + "description": "the bookindex construction process includes tree construction as a phase", + "source_ids": [ + 48 + ], + "source": "Name: bookindex construction process\nType: TASK_OR_PROBLEM", + "target": "Name: tree construction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookindex construction process", + "tgt_entity_name": "graph construction", + "relation_name": "", + "weight": 10.0, + "description": "the bookindex construction process includes graph construction as a phase", + "source_ids": [ + 48 + ], + "source": "Name: bookindex construction process\nType: TASK_OR_PROBLEM", + "target": "Name: graph construction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "tree construction", + "tgt_entity_name": "layout parsing", + "relation_name": "", + "weight": 9.0, + "description": "tree construction is derived from layout parsing", + "source_ids": [ + 48 + ], + "source": "Name: tree construction\nType: TASK_OR_PROBLEM", + "target": "Name: layout parsing\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "tree construction", + "tgt_entity_name": "section filtering", + "relation_name": "", + "weight": 9.0, + "description": "tree construction is derived from section filtering", + "source_ids": [ + 48 + ], + "source": "Name: tree construction\nType: TASK_OR_PROBLEM", + "target": 
"Name: section filtering\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "tree construction", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Tree Construction", + "source_ids": [ + 49 + ], + "source": "Name: tree construction\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "layout parsing", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Layout Parsing", + "source_ids": [ + 49 + ], + "source": "Name: layout parsing\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "section filtering", + "tgt_entity_name": "layout parsing", + "relation_name": "", + "weight": 9.0, + "description": "section filtering processes the output of layout parsing to identify hierarchical structure", + "source_ids": [ + 57 + ], + "source": "Name: layout parsing\nType: METHOD_OR_TECHNIQUE", + "target": "Name: section filtering\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "layout parsing", + "tgt_entity_name": "title", + "relation_name": "", + "weight": 8.0, + "description": "layout parsing identifies blocks as title", + "source_ids": [ + 57 + ], + "source": "Name: layout parsing\nType: METHOD_OR_TECHNIQUE", + "target": "Name: title\nType: SECTION_TITLE" + }, + { + "src_entity_name": "layout parsing", + "tgt_entity_name": "b title", + "relation_name": "", + "weight": 8.0, + "description": "layout parsing identifies blocks as title forming the subset b title", + "source_ids": [ + 57 + ], + "source": "Name: layout parsing\nType: METHOD_OR_TECHNIQUE", + "target": "Name: b title\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "section filtering", + "relation_name": "", + "weight": 9.0, 
+ "description": "Image entity Image cref='#/texts/52' related to Section Filtering", + "source_ids": [ + 49 + ], + "source": "Name: section filtering\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "graph construction", + "tgt_entity_name": "kg construction", + "relation_name": "", + "weight": 9.0, + "description": "graph construction involves kg construction", + "source_ids": [ + 48 + ], + "source": "Name: graph construction\nType: TASK_OR_PROBLEM", + "target": "Name: kg construction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "graph construction", + "tgt_entity_name": "gradient based entity resolution", + "relation_name": "", + "weight": 9.0, + "description": "graph construction involves gradient based entity resolution", + "source_ids": [ + 48 + ], + "source": "Name: graph construction\nType: TASK_OR_PROBLEM", + "target": "Name: gradient based entity resolution\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "graph construction", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Graph Construction", + "source_ids": [ + 49 + ], + "source": "Name: graph construction\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "gradient based entity resolution", + "relation_name": "", + "weight": 10.0, + "description": "algorithm 1 is defined as a gradient based entity resolution method", + "source_ids": [ + 69 + ], + "source": "Name: gradient based entity resolution\nType: METHOD_OR_TECHNIQUE", + "target": "Name: algorithm 1\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "bookindex construction", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to BookIndex Construction", + 
"source_ids": [ + 49 + ], + "source": "Name: bookindex construction\nType: IMAGE", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "title: method", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Title: Method", + "source_ids": [ + 49 + ], + "source": "Name: title: method\nType: SECTION_TITLE", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "title: experiment", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Title: Experiment", + "source_ids": [ + 49 + ], + "source": "Name: title: experiment\nType: SECTION_TITLE", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "title: moe layer", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Title: MOE Layer", + "source_ids": [ + 49 + ], + "source": "Name: title: moe layer\nType: SECTION_TITLE", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "level: 2 type: section", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Level: 2 Type: Section", + "source_ids": [ + 49 + ], + "source": "Name: level: 2 type: section\nType: PARAMETER_OR_VARIABLE", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "level: none type: text", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Level: None Type: Text", + "source_ids": [ + 49 + ], + "source": "Name: level: none type: text\nType: PARAMETER_OR_VARIABLE", + "target": "Name: image 
cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "tree node", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Tree Node", + "source_ids": [ + 49 + ], + "source": "Name: tree node\nType: HARDWARE", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "gt-link", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to GT-Link", + "source_ids": [ + 49 + ], + "source": "Name: gt-link\nType: SOFTWARE", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "relation", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Relation", + "source_ids": [ + 49 + ], + "source": "Name: relation\nType: DATASET_OR_CORPUS", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "kg construction", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to KG Construction", + "source_ids": [ + 49 + ], + "source": "Name: kg construction\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "gradient-based entity resolution", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Gradient-based Entity Resolution", + "source_ids": [ + 49 + ], + "source": "Name: gradient-based entity resolution\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "gradient-based entity resolution", + "tgt_entity_name": "4.3 graph construction", + "relation_name": "", + 
"weight": 9.5, + "description": "The method 'Gradient-based Entity Resolution' is a key component and technique detailed within section 4.3.", + "source_ids": [ + 61 + ], + "source": "Name: gradient-based entity resolution\nType: METHOD_OR_TECHNIQUE", + "target": "Name: 4.3 graph construction\nType: SECTION_TITLE" + }, + { + "src_entity_name": "gradient-based entity resolution", + "tgt_entity_name": "4.3.2 gradient-based entity resolution", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Gradient-based Entity Resolution' is the primary methodological topic of section 4.3.2.", + "source_ids": [ + 65 + ], + "source": "Name: gradient-based entity resolution\nType: METHOD_OR_TECHNIQUE", + "target": "Name: 4.3.2 gradient-based entity resolution\nType: SECTION_TITLE" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "similarity", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Similarity", + "source_ids": [ + 49 + ], + "source": "Name: similarity\nType: EVALUATION_METRIC", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "entity", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Entity", + "source_ids": [ + 49 + ], + "source": "Name: entity\nType: DATASET_OR_CORPUS", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "merge", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Merge", + "source_ids": [ + 49 + ], + "source": "Name: merge\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/52'\nType: UNKNOWN" + }, + { + "src_entity_name": "graph tree link", + "tgt_entity_name": "tree structure", + "relation_name": "", + "weight": 10.0, + "description": "the graph tree 
link connects entities to specific tree nodes within the tree structure", + "source_ids": [ + 51 + ], + "source": "Name: tree structure\nType: TASK_OR_PROBLEM", + "target": "Name: graph tree link\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "tree structure", + "tgt_entity_name": "titles", + "relation_name": "", + "weight": 9.0, + "description": "titles are examples of nodes included in the tree structure", + "source_ids": [ + 51 + ], + "source": "Name: tree structure\nType: TASK_OR_PROBLEM", + "target": "Name: titles\nType: SECTION_TITLE" + }, + { + "src_entity_name": "tree structure", + "tgt_entity_name": "sections", + "relation_name": "", + "weight": 9.0, + "description": "sections are examples of nodes included in the tree structure", + "source_ids": [ + 51 + ], + "source": "Name: tree structure\nType: TASK_OR_PROBLEM", + "target": "Name: sections\nType: SECTION_TITLE" + }, + { + "src_entity_name": "tree structure", + "tgt_entity_name": "e t", + "relation_name": "", + "weight": 10.0, + "description": "e t represents the nesting relationships contained within the tree structure", + "source_ids": [ + 51 + ], + "source": "Name: tree structure\nType: TASK_OR_PROBLEM", + "target": "Name: e t\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "graph tree link", + "tgt_entity_name": "knowledge graph", + "relation_name": "", + "weight": 10.0, + "description": "the graph tree link connects entities from the knowledge graph to the tree structure", + "source_ids": [ + 51 + ], + "source": "Name: knowledge graph\nType: SOFTWARE", + "target": "Name: graph tree link\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "knowledge graph", + "tgt_entity_name": "v", + "relation_name": "", + "weight": 10.0, + "description": "v represents the entities contained within the knowledge graph", + "source_ids": [ + 51 + ], + "source": "Name: knowledge graph\nType: SOFTWARE", + "target": "Name: v\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": 
"knowledge graph", + "tgt_entity_name": "e g", + "relation_name": "", + "weight": 10.0, + "description": "e g represents the relations contained within the knowledge graph", + "source_ids": [ + 51 + ], + "source": "Name: knowledge graph\nType: SOFTWARE", + "target": "Name: e g\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "mapping m", + "tgt_entity_name": "v", + "relation_name": "", + "weight": 9.0, + "description": "the mapping m is defined as a function from the set of entities v", + "source_ids": [ + 77 + ], + "source": "Name: v\nType: PARAMETER_OR_VARIABLE", + "target": "Name: mapping m\nType: EQUATION_OR_FORMULA" + }, + { + "src_entity_name": "", + "tgt_entity_name": "none", + "relation_name": "", + "weight": 7.0, + "description": "the final node type can be assigned the value none if a block has no level", + "source_ids": [ + 57 + ], + "source": "Name: \nType: UNKNOWN", + "target": "Name: none\nType: SECTION_TITLE" + }, + { + "src_entity_name": "", + "tgt_entity_name": "case b", + "relation_name": "", + "weight": 10.0, + "description": "is the specific alias being discussed within the scenario defined as case b", + "source_ids": [ + 73 + ], + "source": "Name: \nType: UNKNOWN", + "target": "Name: case b\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "", + "tgt_entity_name": "reranker", + "relation_name": "", + "weight": 8.0, + "description": "the scores of are influenced by the inherent discriminative limitations of the reranker", + "source_ids": [ + 73 + ], + "source": "Name: \nType: UNKNOWN", + "target": "Name: reranker\nType: TECHNOLOGY" + }, + { + "src_entity_name": "text ranker", + "tgt_entity_name": "", + "relation_name": "", + "weight": 9.0, + "description": "text ranker uses the query to evaluate semantic relevance", + "source_ids": [ + 109 + ], + "source": "Name: \nType: UNKNOWN", + "target": "Name: text ranker\nType: SOFTWARE" + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "", + "relation_name": "", + 
"weight": 9.0, + "description": "skyline ranker uses the criterion to filter nodes", + "source_ids": [ + 109 + ], + "source": "Name: \nType: UNKNOWN", + "target": "Name: skyline ranker\nType: SOFTWARE" + }, + { + "src_entity_name": "skyline operator", + "tgt_entity_name": "", + "relation_name": "", + "weight": 8.0, + "description": "the skyline operator utilizes as a scoring dimension", + "source_ids": [ + 109 + ], + "source": "Name: \nType: UNKNOWN", + "target": "Name: skyline operator\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "", + "tgt_entity_name": "1", + "relation_name": "", + "weight": 10.0, + "description": "the sequence 1 is selected from the library", + "source_ids": [ + 112 + ], + "source": "Name: \nType: UNKNOWN", + "target": "Name: 1\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent plan", + "tgt_entity_name": "", + "relation_name": "", + "weight": 10.0, + "description": "the agent plan method defines the generation of the plan", + "source_ids": [ + 112 + ], + "source": "Name: \nType: UNKNOWN", + "target": "Name: agent plan\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "equation 8", + "tgt_entity_name": "", + "relation_name": "", + "weight": 10.0, + "description": "equation 8 defines the variable", + "source_ids": [ + 112 + ], + "source": "Name: \nType: UNKNOWN", + "target": "Name: equation 8\nType: EQUATION_OR_FORMULA" + }, + { + "src_entity_name": "", + "tgt_entity_name": "modal filter", + "relation_name": "", + "weight": 9.0, + "description": "the symbol denotes the application of a modal filter at each step", + "source_ids": [ + 122 + ], + "source": "Name: \nType: UNKNOWN", + "target": "Name: modal filter\nType: TECHNOLOGY" + }, + { + "src_entity_name": "", + "tgt_entity_name": "range filter", + "relation_name": "", + "weight": 9.0, + "description": "the symbol denotes the application of a range filter at each step", + "source_ids": [ + 122 + ], + "source": "Name: \nType: UNKNOWN", + "target": "Name: range 
filter\nType: TECHNOLOGY" + }, + { + "src_entity_name": "", + "tgt_entity_name": "nested composition", + "relation_name": "", + "weight": 10.0, + "description": "the symbol represents the nested composition of filters", + "source_ids": [ + 122 + ], + "source": "Name: \nType: UNKNOWN", + "target": "Name: nested composition\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "tree component", + "tgt_entity_name": "section nodes", + "relation_name": "", + "weight": 8.0, + "description": "the tree component organizes content blocks within section nodes", + "source_ids": [ + 52 + ], + "source": "Name: tree component\nType: SOFTWARE", + "target": "Name: section nodes\nType: PRODUCT" + }, + { + "src_entity_name": "gt link", + "tgt_entity_name": "tree component", + "relation_name": "", + "weight": 8.0, + "description": "gt link connects entities back to their corresponding tree nodes", + "source_ids": [ + 52 + ], + "source": "Name: tree component\nType: SOFTWARE", + "target": "Name: gt link\nType: TECHNOLOGY" + }, + { + "src_entity_name": "gt link", + "tgt_entity_name": "graph component", + "relation_name": "", + "weight": 8.0, + "description": "gt link is a feature within the graph component that connects entities to tree nodes", + "source_ids": [ + 52 + ], + "source": "Name: graph component\nType: SOFTWARE", + "target": "Name: gt link\nType: TECHNOLOGY" + }, + { + "src_entity_name": "graph component", + "tgt_entity_name": "semantic entities", + "relation_name": "", + "weight": 8.0, + "description": "the graph component is composed of entities and relations extracted from nodes which include semantic entities", + "source_ids": [ + 52 + ], + "source": "Name: graph component\nType: SOFTWARE", + "target": "Name: semantic entities\nType: CONCEPT" + }, + { + "src_entity_name": "gt link", + "tgt_entity_name": "semantic entities", + "relation_name": "", + "weight": 9.0, + "description": "gt link explicitly connects semantic entities back to their corresponding tree nodes", + 
"source_ids": [ + 52 + ], + "source": "Name: gt link\nType: TECHNOLOGY", + "target": "Name: semantic entities\nType: CONCEPT" + }, + { + "src_entity_name": "entity resolution", + "tgt_entity_name": "gt link", + "relation_name": "", + "weight": 9.0, + "description": "gt link is refined during the entity resolution process", + "source_ids": [ + 77 + ], + "source": "Name: gt link\nType: TECHNOLOGY", + "target": "Name: entity resolution\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "gt link", + "tgt_entity_name": "e q", + "relation_name": "", + "weight": 9.0, + "description": "gt link is the mechanism used to link sections to entities e q", + "source_ids": [ + 104 + ], + "source": "Name: gt link\nType: TECHNOLOGY", + "target": "Name: e q\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "gt link", + "tgt_entity_name": "section node", + "relation_name": "", + "weight": 8.0, + "description": "gt link links sections nodes to entities", + "source_ids": [ + 104 + ], + "source": "Name: gt link\nType: TECHNOLOGY", + "target": "Name: section node\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "text", + "tgt_entity_name": "section nodes", + "relation_name": "", + "weight": 7.0, + "description": "text serves as a leaf node nested within section nodes", + "source_ids": [ + 52 + ], + "source": "Name: text\nType: PRODUCT", + "target": "Name: section nodes\nType: PRODUCT" + }, + { + "src_entity_name": "text", + "tgt_entity_name": "content blocks", + "relation_name": "", + "weight": 10.0, + "description": "text is identified as a type of content block", + "source_ids": [ + 52 + ], + "source": "Name: text\nType: PRODUCT", + "target": "Name: content blocks\nType: PRODUCT" + }, + { + "src_entity_name": "node set", + "tgt_entity_name": "text", + "relation_name": "", + "weight": 8.0, + "description": "the node set retains nodes of the type text", + "source_ids": [ + 58 + ], + "source": "Name: text\nType: PRODUCT", + "target": "Name: node set\nType: TASK_OR_PROBLEM" + }, 
+ { + "src_entity_name": "tables", + "tgt_entity_name": "section nodes", + "relation_name": "", + "weight": 7.0, + "description": "tables serve as a leaf node nested within section nodes", + "source_ids": [ + 52 + ], + "source": "Name: tables\nType: PRODUCT", + "target": "Name: section nodes\nType: PRODUCT" + }, + { + "src_entity_name": "tables", + "tgt_entity_name": "content blocks", + "relation_name": "", + "weight": 10.0, + "description": "tables are identified as a type of content block", + "source_ids": [ + 52 + ], + "source": "Name: tables\nType: PRODUCT", + "target": "Name: content blocks\nType: PRODUCT" + }, + { + "src_entity_name": "images", + "tgt_entity_name": "section nodes", + "relation_name": "", + "weight": 7.0, + "description": "images serve as a leaf node nested within section nodes", + "source_ids": [ + 52 + ], + "source": "Name: images\nType: PRODUCT", + "target": "Name: section nodes\nType: PRODUCT" + }, + { + "src_entity_name": "images", + "tgt_entity_name": "content blocks", + "relation_name": "", + "weight": 10.0, + "description": "images are identified as a type of content block", + "source_ids": [ + 52 + ], + "source": "Name: images\nType: PRODUCT", + "target": "Name: content blocks\nType: PRODUCT" + }, + { + "src_entity_name": "content blocks", + "tgt_entity_name": "leaf nodes", + "relation_name": "", + "weight": 9.0, + "description": "content blocks serve as leaf nodes within the structure", + "source_ids": [ + 52 + ], + "source": "Name: content blocks\nType: PRODUCT", + "target": "Name: leaf nodes\nType: PRODUCT" + }, + { + "src_entity_name": "t", + "tgt_entity_name": "task or problem", + "relation_name": "", + "weight": 5.0, + "description": "t represents the structured hierarchical tree which is the outcome of the transformation task described", + "source_ids": [ + 54 + ], + "source": "Name: t\nType: TASK_OR_PROBLEM", + "target": "Name: task or problem\nType: UNKNOWN" + }, + { + "src_entity_name": "robust layout parsing", + 
"tgt_entity_name": "t", + "relation_name": "", + "weight": 9.0, + "description": "robust layout parsing is a step used to create the structured hierarchical tree t", + "source_ids": [ + 54 + ], + "source": "Name: t\nType: TASK_OR_PROBLEM", + "target": "Name: robust layout parsing\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "intelligent section filtering", + "tgt_entity_name": "t", + "relation_name": "", + "weight": 9.0, + "description": "intelligent section filtering is a step used to create the structured hierarchical tree t", + "source_ids": [ + 54 + ], + "source": "Name: t\nType: TASK_OR_PROBLEM", + "target": "Name: intelligent section filtering\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "raw document", + "tgt_entity_name": "robust layout parsing", + "relation_name": "", + "weight": 8.0, + "description": "raw document is the input processed by the robust layout parsing step", + "source_ids": [ + 54 + ], + "source": "Name: raw document\nType: PRODUCT", + "target": "Name: robust layout parsing\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "raw document", + "tgt_entity_name": "intelligent section filtering", + "relation_name": "", + "weight": 8.0, + "description": "raw document is the input processed by the intelligent section filtering step", + "source_ids": [ + 54 + ], + "source": "Name: raw document\nType: PRODUCT", + "target": "Name: intelligent section filtering\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "layout analysis", + "tgt_entity_name": "4.2.1 layout parsing", + "relation_name": "", + "weight": 9.5, + "description": "Layout Analysis is a core methodological component discussed within section 4.2.1.", + "source_ids": [ + 55 + ], + "source": "Name: 4.2.1 layout parsing\nType: SECTION_TITLE", + "target": "Name: layout analysis\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "recognition models", + "tgt_entity_name": "4.2.1 layout parsing", + "relation_name": "", + "weight": 9.5, + 
"description": "Recognition Models are the primary tools used in the process detailed in section 4.2.1.", + "source_ids": [ + 55 + ], + "source": "Name: 4.2.1 layout parsing\nType: SECTION_TITLE", + "target": "Name: recognition models\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "document d", + "tgt_entity_name": "4.2.1 layout parsing", + "relation_name": "", + "weight": 10.0, + "description": "Document D is the specific input entity being processed in section 4.2.1.", + "source_ids": [ + 55 + ], + "source": "Name: 4.2.1 layout parsing\nType: SECTION_TITLE", + "target": "Name: document d\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "the output", + "tgt_entity_name": "primitive", + "relation_name": "", + "weight": 9.0, + "description": "the output consists of a sequence of primitives", + "source_ids": [ + 56 + ], + "source": "Name: the output\nType: TASK_OR_PROBLEM", + "target": "Name: primitive\nType: CONCEPT" + }, + { + "src_entity_name": "section filtering", + "tgt_entity_name": "image", + "relation_name": "", + "weight": 7.0, + "description": "section filtering aims to correct blocks erroneously parsed as title such as descriptive text within images", + "source_ids": [ + 57 + ], + "source": "Name: section filtering\nType: TASK_OR_PROBLEM", + "target": "Name: image\nType: IMAGE" + }, + { + "src_entity_name": "section filtering", + "tgt_entity_name": "table", + "relation_name": "", + "weight": 7.0, + "description": "section filtering aims to correct blocks erroneously parsed as title such as borderless table headers", + "source_ids": [ + 57 + ], + "source": "Name: section filtering\nType: TASK_OR_PROBLEM", + "target": "Name: table\nType: TABLE" + }, + { + "src_entity_name": "section filtering", + "tgt_entity_name": "b", + "relation_name": "", + "weight": 9.0, + "description": "section filtering selects the candidate subset b for analysis", + "source_ids": [ + 57 + ], + "source": "Name: section filtering\nType: TASK_OR_PROBLEM", + "target": 
"Name: b\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "section filtering", + "tgt_entity_name": "b title", + "relation_name": "", + "weight": 9.0, + "description": "section filtering selects the candidate subset b title for analysis", + "source_ids": [ + 57 + ], + "source": "Name: section filtering\nType: TASK_OR_PROBLEM", + "target": "Name: b title\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "section filtering", + "tgt_entity_name": "text", + "relation_name": "", + "weight": 8.0, + "description": "section filtering corrects blocks erroneously parsed as title by re classifying them as text", + "source_ids": [ + 57 + ], + "source": "Name: section filtering\nType: TASK_OR_PROBLEM", + "target": "Name: text\nType: SECTION_TITLE" + }, + { + "src_entity_name": "layout parsing phase", + "tgt_entity_name": "image", + "relation_name": "", + "weight": 8.0, + "description": "the layout parsing phase identifies image as a type of block", + "source_ids": [ + 59 + ], + "source": "Name: image\nType: IMAGE", + "target": "Name: layout parsing phase\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "image", + "tgt_entity_name": "4.3.1 kg construction", + "relation_name": "", + "weight": 8.5, + "description": "The Image node type triggers the use of the Vision Language Model in this section's logic.", + "source_ids": [ + 63 + ], + "source": "Name: image\nType: IMAGE", + "target": "Name: 4.3.1 kg construction\nType: SECTION_TITLE" + }, + { + "src_entity_name": "filters", + "tgt_entity_name": "image", + "relation_name": "", + "weight": 8.0, + "description": "filters can be of type image to target visual elements", + "source_ids": [ + 258 + ], + "source": "Name: image\nType: IMAGE", + "target": "Name: filters\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "image", + "tgt_entity_name": "figures", + "relation_name": "", + "weight": 9.0, + "description": "figures are examples of images", + "source_ids": [ + 258 + ], + "source": "Name: image\nType: IMAGE", 
+ "target": "Name: figures\nType: IMAGE" + }, + { + "src_entity_name": "filters", + "tgt_entity_name": "table", + "relation_name": "", + "weight": 8.0, + "description": "filters can be of type table to target tabular data", + "source_ids": [ + 258 + ], + "source": "Name: table\nType: TABLE", + "target": "Name: filters\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "filter modal", + "tgt_entity_name": "c", + "relation_name": "", + "weight": 8.0, + "description": "filter modal applies the explicit constraints c generated during the plan", + "source_ids": [ + 102 + ], + "source": "Name: c\nType: PARAMETER_OR_VARIABLE", + "target": "Name: filter modal\nType: TECHNOLOGY" + }, + { + "src_entity_name": "filter range", + "tgt_entity_name": "c", + "relation_name": "", + "weight": 8.0, + "description": "filter range applies the explicit constraints c generated during the plan", + "source_ids": [ + 102 + ], + "source": "Name: c\nType: PARAMETER_OR_VARIABLE", + "target": "Name: filter range\nType: TECHNOLOGY" + }, + { + "src_entity_name": "c", + "tgt_entity_name": "modal types", + "relation_name": "", + "weight": 10.0, + "description": "modal types are examples of the explicit constraints c", + "source_ids": [ + 102 + ], + "source": "Name: c\nType: PARAMETER_OR_VARIABLE", + "target": "Name: modal types\nType: CONCEPT" + }, + { + "src_entity_name": "c", + "tgt_entity_name": "page ranges", + "relation_name": "", + "weight": 10.0, + "description": "page ranges are examples of the explicit constraints c", + "source_ids": [ + 102 + ], + "source": "Name: c\nType: PARAMETER_OR_VARIABLE", + "target": "Name: page ranges\nType: CONCEPT" + }, + { + "src_entity_name": "c", + "tgt_entity_name": "plan", + "relation_name": "", + "weight": 9.0, + "description": "the constraints c are generated during the plan", + "source_ids": [ + 102 + ], + "source": "Name: c\nType: PARAMETER_OR_VARIABLE", + "target": "Name: plan\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "tree", + 
"tgt_entity_name": "node set", + "relation_name": "", + "weight": 9.0, + "description": "the tree is constructed using the node set which contains all blocks from the filtering process", + "source_ids": [ + 58 + ], + "source": "Name: tree\nType: TASK_OR_PROBLEM", + "target": "Name: node set\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "edge set", + "tgt_entity_name": "tree", + "relation_name": "", + "weight": 9.0, + "description": "the edge set is established to define the structure of the tree", + "source_ids": [ + 58 + ], + "source": "Name: tree\nType: TASK_OR_PROBLEM", + "target": "Name: edge set\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "tree", + "tgt_entity_name": "hierarchical levels", + "relation_name": "", + "weight": 8.0, + "description": "hierarchical levels are used to infer relationships within the tree structure", + "source_ids": [ + 58 + ], + "source": "Name: tree\nType: TASK_OR_PROBLEM", + "target": "Name: hierarchical levels\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "tree", + "tgt_entity_name": "document order", + "relation_name": "", + "weight": 8.0, + "description": "document order is used to assemble the complete tree structure", + "source_ids": [ + 58 + ], + "source": "Name: tree\nType: TASK_OR_PROBLEM", + "target": "Name: document order\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "tree", + "relation_name": "", + "weight": 9.0, + "description": "the selector operators operate on the tree t n e t to produce a filtered subset", + "source_ids": [ + 102 + ], + "source": "Name: tree\nType: TASK_OR_PROBLEM", + "target": "Name: selector\nType: TECHNOLOGY" + }, + { + "src_entity_name": "filter modal", + "tgt_entity_name": "tree", + "relation_name": "", + "weight": 8.0, + "description": "filter modal operates on the tree to produce a filtered subset", + "source_ids": [ + 102 + ], + "source": "Name: tree\nType: TASK_OR_PROBLEM", + "target": "Name: filter modal\nType: 
TECHNOLOGY" + }, + { + "src_entity_name": "filter range", + "tgt_entity_name": "tree", + "relation_name": "", + "weight": 8.0, + "description": "filter range operates on the tree to produce a filtered subset", + "source_ids": [ + 102 + ], + "source": "Name: tree\nType: TASK_OR_PROBLEM", + "target": "Name: filter range\nType: TECHNOLOGY" + }, + { + "src_entity_name": "tree", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 10.0, + "description": "the tree t is composed of the set of nodes n", + "source_ids": [ + 102 + ], + "source": "Name: tree\nType: TASK_OR_PROBLEM", + "target": "Name: nodes\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "tree", + "tgt_entity_name": "edges", + "relation_name": "", + "weight": 10.0, + "description": "the tree t is composed of the set of edges e t", + "source_ids": [ + 102 + ], + "source": "Name: tree\nType: TASK_OR_PROBLEM", + "target": "Name: edges\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "node set", + "tgt_entity_name": "section", + "relation_name": "", + "weight": 8.0, + "description": "the node set retains nodes of the type section", + "source_ids": [ + 58 + ], + "source": "Name: node set\nType: TASK_OR_PROBLEM", + "target": "Name: section\nType: PRODUCT" + }, + { + "src_entity_name": "node set", + "tgt_entity_name": "table", + "relation_name": "", + "weight": 8.0, + "description": "the node set retains nodes of the type table", + "source_ids": [ + 58 + ], + "source": "Name: node set\nType: TASK_OR_PROBLEM", + "target": "Name: table\nType: PRODUCT" + }, + { + "src_entity_name": "node set", + "tgt_entity_name": "image", + "relation_name": "", + "weight": 8.0, + "description": "the node set retains nodes of the type image", + "source_ids": [ + 58 + ], + "source": "Name: node set\nType: TASK_OR_PROBLEM", + "target": "Name: image\nType: PRODUCT" + }, + { + "src_entity_name": "node set", + "tgt_entity_name": "filtering", + "relation_name": "", + "weight": 9.0, + "description": "the 
node set is composed of blocks resulting from the filtering process", + "source_ids": [ + 58 + ], + "source": "Name: node set\nType: TASK_OR_PROBLEM", + "target": "Name: filtering\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "node set", + "tgt_entity_name": "re classification", + "relation_name": "", + "weight": 9.0, + "description": "the node set is composed of blocks resulting from the re classification process", + "source_ids": [ + 58 + ], + "source": "Name: node set\nType: TASK_OR_PROBLEM", + "target": "Name: re classification\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "edge set", + "tgt_entity_name": "parent child nesting relationships", + "relation_name": "", + "weight": 10.0, + "description": "the edge set represents the parent child nesting relationships", + "source_ids": [ + 58 + ], + "source": "Name: edge set\nType: TASK_OR_PROBLEM", + "target": "Name: parent child nesting relationships\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "v table", + "tgt_entity_name": "table", + "relation_name": "", + "weight": 10.0, + "description": "v table is the distinct entity created to represent the table logical type", + "source_ids": [ + 64 + ], + "source": "Name: table\nType: PRODUCT", + "target": "Name: v table\nType: PRODUCT" + }, + { + "src_entity_name": "global", + "tgt_entity_name": "table", + "relation_name": "", + "weight": 10.0, + "description": "the global process specifically targets and filters for items of type table", + "source_ids": [ + 251 + ], + "source": "Name: table\nType: PRODUCT", + "target": "Name: global\nType: CONCEPT" + }, + { + "src_entity_name": "final tree structure", + "tgt_entity_name": "document order", + "relation_name": "", + "weight": 9.0, + "description": "the final tree structure is assembled based on the document order of the nodes", + "source_ids": [ + 59 + ], + "source": "Name: document order\nType: PARAMETER_OR_VARIABLE", + "target": "Name: final tree structure\nType: TASK_OR_PROBLEM" + }, + { 
+ "src_entity_name": "node", + "tgt_entity_name": "content", + "relation_name": "", + "weight": 9.0, + "description": "each node retains its content", + "source_ids": [ + 58 + ], + "source": "Name: content\nType: PARAMETER_OR_VARIABLE", + "target": "Name: node\nType: UNKNOWN" + }, + { + "src_entity_name": "node", + "tgt_entity_name": "final node type", + "relation_name": "", + "weight": 9.0, + "description": "each node retains its final node type", + "source_ids": [ + 58 + ], + "source": "Name: final node type\nType: PARAMETER_OR_VARIABLE", + "target": "Name: node\nType: UNKNOWN" + }, + { + "src_entity_name": "layout parsing phase", + "tgt_entity_name": "title text table", + "relation_name": "", + "weight": 8.0, + "description": "the layout parsing phase identifies title text table as a type of block", + "source_ids": [ + 59 + ], + "source": "Name: layout parsing phase\nType: TASK_OR_PROBLEM", + "target": "Name: title text table\nType: PRODUCT" + }, + { + "src_entity_name": "layout parsing phase", + "tgt_entity_name": "section filtering phase", + "relation_name": "", + "weight": 6.0, + "description": "the layout parsing phase precedes the section filtering phase in the document processing workflow", + "source_ids": [ + 59 + ], + "source": "Name: layout parsing phase\nType: TASK_OR_PROBLEM", + "target": "Name: section filtering phase\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "section filtering phase", + "tgt_entity_name": "method", + "relation_name": "", + "weight": 9.0, + "description": "the section filtering phase analyzes method as a title candidate", + "source_ids": [ + 59 + ], + "source": "Name: section filtering phase\nType: TASK_OR_PROBLEM", + "target": "Name: method\nType: SECTION_TITLE" + }, + { + "src_entity_name": "section filtering phase", + "tgt_entity_name": "experiment", + "relation_name": "", + "weight": 9.0, + "description": "the section filtering phase analyzes experiment as a title candidate", + "source_ids": [ + 59 + ], + "source": 
"Name: section filtering phase\nType: TASK_OR_PROBLEM", + "target": "Name: experiment\nType: SECTION_TITLE" + }, + { + "src_entity_name": "section filtering phase", + "tgt_entity_name": "moe layer", + "relation_name": "", + "weight": 9.0, + "description": "the section filtering phase analyzes moe layer which was erroneously tagged and re classified", + "source_ids": [ + 59 + ], + "source": "Name: section filtering phase\nType: TASK_OR_PROBLEM", + "target": "Name: moe layer\nType: SECTION_TITLE" + }, + { + "src_entity_name": "section filtering phase", + "tgt_entity_name": "final tree structure", + "relation_name": "", + "weight": 7.0, + "description": "the section filtering phase contributes to the creation of the final tree structure", + "source_ids": [ + 59 + ], + "source": "Name: section filtering phase\nType: TASK_OR_PROBLEM", + "target": "Name: final tree structure\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "method", + "tgt_entity_name": "fontsize", + "relation_name": "", + "weight": 10.0, + "description": "the method block has a fontsize of 14", + "source_ids": [ + 59 + ], + "source": "Name: method\nType: SECTION_TITLE", + "target": "Name: fontsize\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "method", + "tgt_entity_name": "14", + "relation_name": "", + "weight": 10.0, + "description": "the method block is associated with the measurement value 14", + "source_ids": [ + 59 + ], + "source": "Name: method\nType: SECTION_TITLE", + "target": "Name: 14\nType: MEASUREMENT" + }, + { + "src_entity_name": "method", + "tgt_entity_name": "level", + "relation_name": "", + "weight": 10.0, + "description": "the method block is identified as having a level of 2", + "source_ids": [ + 59 + ], + "source": "Name: method\nType: SECTION_TITLE", + "target": "Name: level\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "method", + "tgt_entity_name": "2", + "relation_name": "", + "weight": 10.0, + "description": "the method block is associated with 
the measurement value 2", + "source_ids": [ + 59 + ], + "source": "Name: method\nType: SECTION_TITLE", + "target": "Name: 2\nType: MEASUREMENT" + }, + { + "src_entity_name": "experiment", + "tgt_entity_name": "fontsize", + "relation_name": "", + "weight": 10.0, + "description": "the experiment block has a fontsize of 14", + "source_ids": [ + 59 + ], + "source": "Name: experiment\nType: SECTION_TITLE", + "target": "Name: fontsize\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "experiment", + "tgt_entity_name": "14", + "relation_name": "", + "weight": 10.0, + "description": "the experiment block is associated with the measurement value 14", + "source_ids": [ + 59 + ], + "source": "Name: experiment\nType: SECTION_TITLE", + "target": "Name: 14\nType: MEASUREMENT" + }, + { + "src_entity_name": "experiment", + "tgt_entity_name": "level", + "relation_name": "", + "weight": 10.0, + "description": "the experiment block is identified as having a level of 2", + "source_ids": [ + 59 + ], + "source": "Name: experiment\nType: SECTION_TITLE", + "target": "Name: level\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "experiment", + "tgt_entity_name": "2", + "relation_name": "", + "weight": 10.0, + "description": "the experiment block is associated with the measurement value 2", + "source_ids": [ + 59 + ], + "source": "Name: experiment\nType: SECTION_TITLE", + "target": "Name: 2\nType: MEASUREMENT" + }, + { + "src_entity_name": "moe layer", + "tgt_entity_name": "fontsize", + "relation_name": "", + "weight": 10.0, + "description": "the moe layer block has a fontsize of 20", + "source_ids": [ + 59 + ], + "source": "Name: moe layer\nType: SECTION_TITLE", + "target": "Name: fontsize\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "moe layer", + "tgt_entity_name": "20", + "relation_name": "", + "weight": 10.0, + "description": "the moe layer block is associated with the measurement value 20", + "source_ids": [ + 59 + ], + "source": "Name: moe 
layer\nType: SECTION_TITLE", + "target": "Name: 20\nType: MEASUREMENT" + }, + { + "src_entity_name": "moe layer", + "tgt_entity_name": "level", + "relation_name": "", + "weight": 10.0, + "description": "the moe layer block is identified as having a level of none", + "source_ids": [ + 59 + ], + "source": "Name: moe layer\nType: SECTION_TITLE", + "target": "Name: level\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "moe layer", + "tgt_entity_name": "none", + "relation_name": "", + "weight": 10.0, + "description": "the moe layer block is associated with the measurement value none", + "source_ids": [ + 59 + ], + "source": "Name: moe layer\nType: SECTION_TITLE", + "target": "Name: none\nType: MEASUREMENT" + }, + { + "src_entity_name": "final tree structure", + "tgt_entity_name": "level", + "relation_name": "", + "weight": 9.0, + "description": "the final tree structure is assembled based on the determined levels of the nodes", + "source_ids": [ + 59 + ], + "source": "Name: final tree structure\nType: TASK_OR_PROBLEM", + "target": "Name: level\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "proceedings of the acm on management of data", + "tgt_entity_name": "2", + "relation_name": "", + "weight": 10.0, + "description": "the publication volume is 2", + "source_ids": [ + 199 + ], + "source": "Name: 2\nType: MEASUREMENT", + "target": "Name: proceedings of the acm on management of data\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "tree t", + "tgt_entity_name": "knowledge graph g", + "relation_name": "", + "weight": 9.0, + "description": "tree t is the source from which entities are extracted to populate knowledge graph g", + "source_ids": [ + 62 + ], + "source": "Name: tree t\nType: TASK_OR_PROBLEM", + "target": "Name: knowledge graph g\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "tree t", + "tgt_entity_name": "tree nodes", + "relation_name": "", + "weight": 10.0, + "description": "tree nodes are the constituent parts of tree t 
that serve as the source for entity extraction", + "source_ids": [ + 62 + ], + "source": "Name: tree t\nType: TASK_OR_PROBLEM", + "target": "Name: tree nodes\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "knowledge graph g", + "tgt_entity_name": "tree nodes", + "relation_name": "", + "weight": 9.0, + "description": "knowledge graph g is populated by extracting and refining entities from tree nodes", + "source_ids": [ + 62 + ], + "source": "Name: knowledge graph g\nType: TASK_OR_PROBLEM", + "target": "Name: tree nodes\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "tree t", + "tgt_entity_name": "4.3.1 kg construction", + "relation_name": "", + "weight": 9.5, + "description": "The Tree T provides the input nodes that are iterated over during the construction process.", + "source_ids": [ + 63 + ], + "source": "Name: 4.3.1 kg construction\nType: SECTION_TITLE", + "target": "Name: tree t\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "vision language model", + "tgt_entity_name": "4.3.1 kg construction", + "relation_name": "", + "weight": 9.0, + "description": "The Vision Language Model is the method/tool utilized for extracting data from visual nodes within this section.", + "source_ids": [ + 63 + ], + "source": "Name: 4.3.1 kg construction\nType: SECTION_TITLE", + "target": "Name: vision language model\nType: SOFTWARE" + }, + { + "src_entity_name": "mapping m", + "tgt_entity_name": "4.3.1 kg construction", + "relation_name": "", + "weight": 9.5, + "description": "The Mapping M is the critical output artifact generated by recording entity origins in this section.", + "source_ids": [ + 63 + ], + "source": "Name: 4.3.1 kg construction\nType: SECTION_TITLE", + "target": "Name: mapping m\nType: EQUATION_OR_FORMULA" + }, + { + "src_entity_name": "mapping m", + "tgt_entity_name": "g", + "relation_name": "", + "weight": 8.0, + "description": "the mapping m bi directionally links the entities in g to their structural locations", + "source_ids": [ + 
77 + ], + "source": "Name: mapping m\nType: EQUATION_OR_FORMULA", + "target": "Name: g\nType: CONCEPT" + }, + { + "src_entity_name": "mapping m", + "tgt_entity_name": "t", + "relation_name": "", + "weight": 8.0, + "description": "the mapping m links entities to the set of their structural locations nodes in t", + "source_ids": [ + 77 + ], + "source": "Name: mapping m\nType: EQUATION_OR_FORMULA", + "target": "Name: t\nType: CONCEPT" + }, + { + "src_entity_name": "mapping m", + "tgt_entity_name": "p n", + "relation_name": "", + "weight": 9.0, + "description": "the mapping m maps entities to the power set of nodes p n", + "source_ids": [ + 77 + ], + "source": "Name: mapping m\nType: EQUATION_OR_FORMULA", + "target": "Name: p n\nType: MATHEMATICAL_CONCEPT" + }, + { + "src_entity_name": "row", + "tgt_entity_name": "v table", + "relation_name": "", + "weight": 9.0, + "description": "row headers are linked to v table via a containedin relationship", + "source_ids": [ + 64 + ], + "source": "Name: v table\nType: PRODUCT", + "target": "Name: row\nType: PRODUCT" + }, + { + "src_entity_name": "column", + "tgt_entity_name": "v table", + "relation_name": "", + "weight": 9.0, + "description": "column headers are linked to v table via a containedin relationship", + "source_ids": [ + 64 + ], + "source": "Name: v table\nType: PRODUCT", + "target": "Name: column\nType: PRODUCT" + }, + { + "src_entity_name": "header", + "tgt_entity_name": "v table", + "relation_name": "", + "weight": 9.0, + "description": "row and column headers are explicitly extracted and linked to v table", + "source_ids": [ + 64 + ], + "source": "Name: v table\nType: PRODUCT", + "target": "Name: header\nType: PRODUCT" + }, + { + "src_entity_name": "v table", + "tgt_entity_name": "node", + "relation_name": "", + "weight": 8.0, + "description": "v table is created as a distinct entity from the content of a specific node", + "source_ids": [ + 64 + ], + "source": "Name: v table\nType: PRODUCT", + "target": "Name: 
node\nType: CONCEPT" + }, + { + "src_entity_name": "row", + "tgt_entity_name": "header", + "relation_name": "", + "weight": 9.0, + "description": "row headers are a specific type of header extracted from table nodes", + "source_ids": [ + 64 + ], + "source": "Name: row\nType: PRODUCT", + "target": "Name: header\nType: PRODUCT" + }, + { + "src_entity_name": "row", + "tgt_entity_name": "column", + "relation_name": "", + "weight": 7.0, + "description": "row and column headers are both explicitly extracted components of table nodes", + "source_ids": [ + 64 + ], + "source": "Name: row\nType: PRODUCT", + "target": "Name: column\nType: PRODUCT" + }, + { + "src_entity_name": "column", + "tgt_entity_name": "header", + "relation_name": "", + "weight": 9.0, + "description": "column headers are a specific type of header extracted from table nodes", + "source_ids": [ + 64 + ], + "source": "Name: column\nType: PRODUCT", + "target": "Name: header\nType: PRODUCT" + }, + { + "src_entity_name": "structural semantics", + "tgt_entity_name": "logical types", + "relation_name": "", + "weight": 9.0, + "description": "structural semantics are preserved specifically for logical types like table and formula", + "source_ids": [ + 64 + ], + "source": "Name: structural semantics\nType: CONCEPT", + "target": "Name: logical types\nType: CONCEPT" + }, + { + "src_entity_name": "entity resolution", + "tgt_entity_name": "4.3.2 gradient-based entity resolution", + "relation_name": "", + "weight": 10.0, + "description": "The task of 'Entity Resolution' is the central subject matter detailed in section 4.3.2.", + "source_ids": [ + 65 + ], + "source": "Name: 4.3.2 gradient-based entity resolution\nType: SECTION_TITLE", + "target": "Name: entity resolution\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "entity resolution", + "tgt_entity_name": "canonical entity", + "relation_name": "", + "weight": 9.0, + "description": "during entity resolution entities are merged into a canonical entity", + 
"source_ids": [ + 77 + ], + "source": "Name: entity resolution\nType: TASK_OR_PROBLEM", + "target": "Name: canonical entity\nType: CONCEPT" + }, + { + "src_entity_name": "entity resolution", + "tgt_entity_name": "v n", + "relation_name": "", + "weight": 9.0, + "description": "during entity resolution the entity v n is merged into a canonical entity", + "source_ids": [ + 77 + ], + "source": "Name: entity resolution\nType: TASK_OR_PROBLEM", + "target": "Name: v n\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "entity resolution", + "tgt_entity_name": "v sel", + "relation_name": "", + "weight": 9.0, + "description": "the entity v n is merged into the canonical entity v sel during entity resolution", + "source_ids": [ + 77 + ], + "source": "Name: entity resolution\nType: TASK_OR_PROBLEM", + "target": "Name: v sel\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "figure 13", + "tgt_entity_name": "entity resolution", + "relation_name": "", + "weight": 9.0, + "description": "figure 13 contains the prompt used for the entity resolution judgement task", + "source_ids": [ + 284 + ], + "source": "Name: entity resolution\nType: TASK_OR_PROBLEM", + "target": "Name: figure 13\nType: IMAGE" + }, + { + "src_entity_name": "prompt", + "tgt_entity_name": "entity resolution", + "relation_name": "", + "weight": 9.0, + "description": "the prompt is specifically designed for the entity resolution judgement task", + "source_ids": [ + 284 + ], + "source": "Name: entity resolution\nType: TASK_OR_PROBLEM", + "target": "Name: prompt\nType: SOFTWARE" + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "dirty er", + "relation_name": "", + "weight": 9.0, + "description": "er methods are often designed for batch processing across multiple data sources commonly referred to as dirty er", + "source_ids": [ + 66 + ], + "source": "Name: er methods\nType: TASK_OR_PROBLEM", + "target": "Name: dirty er\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "er 
methods", + "tgt_entity_name": "a", + "relation_name": "", + "weight": 8.0, + "description": "er methods aim to merge entities like a b and c as the same concept", + "source_ids": [ + 66 + ], + "source": "Name: er methods\nType: TASK_OR_PROBLEM", + "target": "Name: a\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "b", + "relation_name": "", + "weight": 8.0, + "description": "er methods aim to merge entities like a b and c as the same concept", + "source_ids": [ + 66 + ], + "source": "Name: er methods\nType: TASK_OR_PROBLEM", + "target": "Name: b\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "c", + "relation_name": "", + "weight": 8.0, + "description": "er methods aim to merge entities like a b and c as the same concept", + "source_ids": [ + 66 + ], + "source": "Name: er methods\nType: TASK_OR_PROBLEM", + "target": "Name: c\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "a b", + "relation_name": "", + "weight": 9.0, + "description": "er methods require finding all possible matching pairs such as a b to confirm equivalence", + "source_ids": [ + 66 + ], + "source": "Name: er methods\nType: TASK_OR_PROBLEM", + "target": "Name: a b\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "a c", + "relation_name": "", + "weight": 9.0, + "description": "er methods require finding all possible matching pairs such as a c to confirm equivalence", + "source_ids": [ + 66 + ], + "source": "Name: er methods\nType: TASK_OR_PROBLEM", + "target": "Name: a c\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "b c", + "relation_name": "", + "weight": 9.0, + "description": "er methods require finding all possible matching pairs such as b c to confirm equivalence", + "source_ids": [ + 66 + ], + "source": "Name: er methods\nType: TASK_OR_PROBLEM", + "target": "Name: b c\nType: TASK_OR_PROBLEM" 
+ }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "12", + "relation_name": "", + "weight": 6.0, + "description": "the text cites reference 12 in the context of ensuring accurate entity resolution", + "source_ids": [ + 66 + ], + "source": "Name: er methods\nType: TASK_OR_PROBLEM", + "target": "Name: 12\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "o n 2", + "relation_name": "", + "weight": 9.0, + "description": "the process of er methods leads to a quadratic o n 2 number of pairwise comparisons", + "source_ids": [ + 66 + ], + "source": "Name: er methods\nType: TASK_OR_PROBLEM", + "target": "Name: o n 2\nType: MEASUREMENT" + }, + { + "src_entity_name": "a", + "tgt_entity_name": "b", + "relation_name": "", + "weight": 7.0, + "description": "a and b are compared as a pair a b to confirm their equivalence", + "source_ids": [ + 66 + ], + "source": "Name: a\nType: TASK_OR_PROBLEM", + "target": "Name: b\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "a", + "tgt_entity_name": "c", + "relation_name": "", + "weight": 7.0, + "description": "a and c are compared as a pair a c to confirm their equivalence", + "source_ids": [ + 66 + ], + "source": "Name: a\nType: TASK_OR_PROBLEM", + "target": "Name: c\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "b", + "tgt_entity_name": "c", + "relation_name": "", + "weight": 7.0, + "description": "b and c are compared as a pair b c to confirm their equivalence", + "source_ids": [ + 66 + ], + "source": "Name: b\nType: TASK_OR_PROBLEM", + "target": "Name: c\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "gradient based er method", + "tgt_entity_name": "clean er", + "relation_name": "", + "weight": 9.0, + "description": "the method operates on a single document simplified as the clean er", + "source_ids": [ + 67 + ], + "source": "Name: gradient based er method\nType: TECHNOLOGY", + "target": "Name: clean er\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": 
"gradient based er method", + "tgt_entity_name": "database", + "relation_name": "", + "weight": 8.0, + "description": "the method determines where a new entity fits among entities already in the database", + "source_ids": [ + 67 + ], + "source": "Name: gradient based er method\nType: TECHNOLOGY", + "target": "Name: database\nType: SOFTWARE" + }, + { + "src_entity_name": "gradient based er method", + "tgt_entity_name": "top k most relevant candidates", + "relation_name": "", + "weight": 8.0, + "description": "the method yields scoring patterns when a new entity is reranked against its top k candidates", + "source_ids": [ + 67 + ], + "source": "Name: gradient based er method\nType: TECHNOLOGY", + "target": "Name: top k most relevant candidates\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "gradient based er method", + "tgt_entity_name": "entity", + "relation_name": "", + "weight": 10.0, + "description": "the method performs entity resolution incrementally as each new entity is extracted", + "source_ids": [ + 67 + ], + "source": "Name: gradient based er method\nType: TECHNOLOGY", + "target": "Name: entity\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "gradient based er method", + "tgt_entity_name": "quadratic batch problem", + "relation_name": "", + "weight": 9.0, + "description": "the method transforms the quadratic batch problem into a simpler task", + "source_ids": [ + 67 + ], + "source": "Name: gradient based er method\nType: TECHNOLOGY", + "target": "Name: quadratic batch problem\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "gradient based er method", + "tgt_entity_name": "repeated lookup task", + "relation_name": "", + "weight": 9.0, + "description": "the method transforms the problem into a repeated lookup task", + "source_ids": [ + 67 + ], + "source": "Name: gradient based er method\nType: TECHNOLOGY", + "target": "Name: repeated lookup task\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "entity", + "tgt_entity_name": 
"database", + "relation_name": "", + "weight": 9.0, + "description": "the new entity is determined to fit among the already processed entities in the database", + "source_ids": [ + 67 + ], + "source": "Name: database\nType: SOFTWARE", + "target": "Name: entity\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "entity", + "tgt_entity_name": "top k most relevant candidates", + "relation_name": "", + "weight": 9.0, + "description": "the new entity is reranked against its top k most relevant candidates", + "source_ids": [ + 67 + ], + "source": "Name: top k most relevant candidates\nType: EVALUATION_METRIC", + "target": "Name: entity\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "incremental process", + "tgt_entity_name": "scoring patterns", + "relation_name": "", + "weight": 8.0, + "description": "the incremental process yields two distinct scoring patterns", + "source_ids": [ + 67 + ], + "source": "Name: scoring patterns\nType: EVALUATION_METRIC", + "target": "Name: incremental process\nType: UNKNOWN" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "v n", + "relation_name": "", + "weight": 10.0, + "description": "algorithm 1 processes the new entity v n by retrieving candidates and making a decision", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: v n\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "e c", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 retrieves the top k candidates e c from the vector database db", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: e c\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "db", + "relation_name": "", + "weight": 9.0, + "description": "the vector database db is the source from which candidates e c are retrieved", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 
1\nType: TASK_OR_PROBLEM", + "target": "Name: db\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "r", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 uses the reranker r to re rank candidates e c against v n", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: r\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "s", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 sorts candidates based on their scores s", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: s\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "sel", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 initializes and iterates through the selection set sel", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: sel\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "g", + "relation_name": "", + "weight": 8.0, + "description": "algorithm 1 uses the gradient threshold g to determine if a score drop is sharp", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: g\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "case a", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 identifies case a when the selection set sel is identical to e c", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: case a\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "case b", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 identifies case b when a gradient is found in the selection set sel", + "source_ids": [ + 75 + ], + "source": "Name: 
algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: case b\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "v sel", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 selects the canonical entity v sel from the selection set sel in case b", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: v sel\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "lines 1 3", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 executes the steps outlined in lines 1 3 to retrieve and rerank candidates", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: lines 1 3\nType: SECTION_TITLE" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "line 4", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 executes the initialization step described in line 4", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: line 4\nType: SECTION_TITLE" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "lines 5 8", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 executes the iteration logic described in lines 5 8", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: lines 5 8\nType: SECTION_TITLE" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "lines 7 8", + "relation_name": "", + "weight": 8.0, + "description": "the logic in lines 7 8 is part of the iteration process within algorithm 1", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: lines 7 8\nType: SECTION_TITLE" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "line 8", + "relation_name": "", + "weight": 8.0, + "description": "line 8 defines the break condition within the 
loop of algorithm 1", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: line 8\nType: SECTION_TITLE" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "lines 9 14", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 executes the decision logic described in lines 9 14", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: lines 9 14\nType: SECTION_TITLE" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "line 9 10", + "relation_name": "", + "weight": 8.0, + "description": "lines 9 10 are the specific actions taken when case a is identified in algorithm 1", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: line 9 10\nType: SECTION_TITLE" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "lines 12 14", + "relation_name": "", + "weight": 8.0, + "description": "lines 12 14 are the specific actions taken when case b is identified in algorithm 1", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: lines 12 14\nType: SECTION_TITLE" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "line 13", + "relation_name": "", + "weight": 8.0, + "description": "line 13 is a step within the case b logic of algorithm 1", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: line 13\nType: SECTION_TITLE" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "line 15", + "relation_name": "", + "weight": 9.0, + "description": "line 15 is the final step of algorithm 1 where results are returned", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: line 15\nType: SECTION_TITLE" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "score", + "relation_name": "", + "weight": 9.0, + 
"description": "the variable score is initialized and updated during the execution of algorithm 1", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: score\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "v c", + "relation_name": "", + "weight": 9.0, + "description": "the variable v c is the current candidate processed within the loop of algorithm 1", + "source_ids": [ + 75 + ], + "source": "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "target": "Name: v c\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "kg g", + "tgt_entity_name": "new entity v n", + "relation_name": "", + "weight": 8.0, + "description": "the new entity v n is added to or processed within the knowledge graph g", + "source_ids": [ + 70 + ], + "source": "Name: kg g\nType: TASK_OR_PROBLEM", + "target": "Name: new entity v n\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "rerank model r", + "tgt_entity_name": "entity vector database db", + "relation_name": "", + "weight": 7.0, + "description": "the rerank model r likely utilizes the entity vector database db to perform its ranking tasks", + "source_ids": [ + 70 + ], + "source": "Name: rerank model r\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: entity vector database db\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "threshold of gradient g", + "tgt_entity_name": "rerank model r", + "relation_name": "", + "weight": 6.0, + "description": "the threshold of gradient g is a parameter that influences the operation or convergence of the rerank model r", + "source_ids": [ + 70 + ], + "source": "Name: rerank model r\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: threshold of gradient g\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "rerank model r", + "tgt_entity_name": "r", + "relation_name": "", + "weight": 10.0, + "description": "r is the variable name for the rerank model", + "source_ids": [ + 70 + ], + 
"source": "Name: rerank model r\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: r\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "vector search number top k", + "tgt_entity_name": "entity vector database db", + "relation_name": "", + "weight": 9.0, + "description": "the vector search number top k parameter determines the scope of the search performed on the entity vector database db", + "source_ids": [ + 70 + ], + "source": "Name: entity vector database db\nType: DATASET_OR_CORPUS", + "target": "Name: vector search number top k\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "entity vector database db", + "tgt_entity_name": "db", + "relation_name": "", + "weight": 10.0, + "description": "db is the variable name for the entity vector database", + "source_ids": [ + 70 + ], + "source": "Name: entity vector database db\nType: DATASET_OR_CORPUS", + "target": "Name: db\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "vector search number top k", + "tgt_entity_name": "top k", + "relation_name": "", + "weight": 10.0, + "description": "top k is the variable name for the vector search number", + "source_ids": [ + 70 + ], + "source": "Name: vector search number top k\nType: PARAMETER_OR_VARIABLE", + "target": "Name: top k\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "threshold of gradient g", + "tgt_entity_name": "g", + "relation_name": "", + "weight": 10.0, + "description": "g is the variable name for the threshold of gradient", + "source_ids": [ + 70 + ], + "source": "Name: threshold of gradient g\nType: PARAMETER_OR_VARIABLE", + "target": "Name: g\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "kg", + "tgt_entity_name": "g", + "relation_name": "", + "weight": 10.0, + "description": "kg and g refer to the same knowledge graph entity with g being its variable representation", + "source_ids": [ + 70 + ], + "source": "Name: kg\nType: TASK_OR_PROBLEM", + "target": "Name: g\nType: PARAMETER_OR_VARIABLE" + }, + { + 
"src_entity_name": "e 9", + "tgt_entity_name": "kg", + "relation_name": "", + "weight": 9.0, + "description": "e 9 is processed within the kg context", + "source_ids": [ + 76 + ], + "source": "Name: kg\nType: TASK_OR_PROBLEM", + "target": "Name: e 9\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "v", + "tgt_entity_name": "n", + "relation_name": "", + "weight": 9.0, + "description": "n is a subscript or modifier defining the specific instance of the new entity v", + "source_ids": [ + 70 + ], + "source": "Name: v\nType: TASK_OR_PROBLEM", + "target": "Name: n\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "g", + "relation_name": "", + "weight": 10.0, + "description": "g is a defined component within the bookindex structure", + "source_ids": [ + 88 + ], + "source": "Name: g\nType: PARAMETER_OR_VARIABLE", + "target": "Name: bookindex\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "vector search", + "tgt_entity_name": "db", + "relation_name": "", + "weight": 9.0, + "description": "vector search operates on the db to find relevant entities", + "source_ids": [ + 71 + ], + "source": "Name: vector search\nType: UNKNOWN", + "target": "Name: db\nType: UNKNOWN" + }, + { + "src_entity_name": "vector search", + "tgt_entity_name": "top k", + "relation_name": "", + "weight": 9.0, + "description": "vector search utilizes the top k parameter to limit the number of relevant entities found", + "source_ids": [ + 71 + ], + "source": "Name: vector search\nType: UNKNOWN", + "target": "Name: top k\nType: UNKNOWN" + }, + { + "src_entity_name": "search", + "tgt_entity_name": "db", + "relation_name": "", + "weight": 9.0, + "description": "the search function is applied to the db to retrieve entities", + "source_ids": [ + 71 + ], + "source": "Name: db\nType: UNKNOWN", + "target": "Name: search\nType: UNKNOWN" + }, + { + "src_entity_name": "search", + "tgt_entity_name": "v n", + "relation_name": "", + "weight": 9.0, + "description": "the 
search function uses the vector v n as its query input", + "source_ids": [ + 71 + ], + "source": "Name: search\nType: UNKNOWN", + "target": "Name: v n\nType: UNKNOWN" + }, + { + "src_entity_name": "search", + "tgt_entity_name": "e c", + "relation_name": "", + "weight": 9.0, + "description": "the search function outputs the candidate entities e c", + "source_ids": [ + 71 + ], + "source": "Name: search\nType: UNKNOWN", + "target": "Name: e c\nType: UNKNOWN" + }, + { + "src_entity_name": "r", + "tgt_entity_name": "e", + "relation_name": "", + "weight": 8.0, + "description": "the function r takes entities e as input to process them", + "source_ids": [ + 71 + ], + "source": "Name: r\nType: UNKNOWN", + "target": "Name: e\nType: UNKNOWN" + }, + { + "src_entity_name": "r", + "tgt_entity_name": "v cn", + "relation_name": "", + "weight": 8.0, + "description": "the function r uses the vector v cn to calculate rerank scores", + "source_ids": [ + 71 + ], + "source": "Name: r\nType: UNKNOWN", + "target": "Name: v cn\nType: UNKNOWN" + }, + { + "src_entity_name": "sort", + "tgt_entity_name": "e", + "relation_name": "", + "weight": 8.0, + "description": "the sort operation is applied to the list of entities e", + "source_ids": [ + 71 + ], + "source": "Name: e\nType: UNKNOWN", + "target": "Name: sort\nType: UNKNOWN" + }, + { + "src_entity_name": "gradient select", + "tgt_entity_name": "e", + "relation_name": "", + "weight": 8.0, + "description": "gradient select is used to select entities from the remaining list e", + "source_ids": [ + 71 + ], + "source": "Name: e\nType: UNKNOWN", + "target": "Name: gradient select\nType: UNKNOWN" + }, + { + "src_entity_name": "sort", + "tgt_entity_name": "s", + "relation_name": "", + "weight": 9.0, + "description": "the sort operation generates the sorted list s", + "source_ids": [ + 71 + ], + "source": "Name: sort\nType: UNKNOWN", + "target": "Name: s\nType: UNKNOWN" + }, + { + "src_entity_name": "sort", + "tgt_entity_name": "c", + 
"relation_name": "", + "weight": 9.0, + "description": "the sort operation orders entities based on the rerank scores c", + "source_ids": [ + 71 + ], + "source": "Name: sort\nType: UNKNOWN", + "target": "Name: c\nType: UNKNOWN" + }, + { + "src_entity_name": "gradient select", + "tgt_entity_name": "sel", + "relation_name": "", + "weight": 9.0, + "description": "the gradient select method produces the selected entities sel", + "source_ids": [ + 71 + ], + "source": "Name: gradient select\nType: UNKNOWN", + "target": "Name: sel\nType: UNKNOWN" + }, + { + "src_entity_name": "score", + "tgt_entity_name": "s 0", + "relation_name": "", + "weight": 10.0, + "description": "the score variable is assigned the value of the first element s 0 from the sorted list", + "source_ids": [ + 71 + ], + "source": "Name: score\nType: UNKNOWN", + "target": "Name: s 0\nType: UNKNOWN" + }, + { + "src_entity_name": "case a", + "tgt_entity_name": "new entity", + "relation_name": "", + "weight": 9.0, + "description": "case a describes the scenario where a new entity is introduced and evaluated", + "source_ids": [ + 72 + ], + "source": "Name: case a\nType: TASK_OR_PROBLEM", + "target": "Name: new entity\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "new entity", + "tgt_entity_name": "existing entities", + "relation_name": "", + "weight": 10.0, + "description": "the new entity s relevance scores are calculated against all existing entities", + "source_ids": [ + 72 + ], + "source": "Name: new entity\nType: TASK_OR_PROBLEM", + "target": "Name: existing entities\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "entity resolution adjudicator", + "tgt_entity_name": "new entity", + "relation_name": "", + "weight": 10.0, + "description": "the entity resolution adjudicator evaluates the new entity to find a match", + "source_ids": [ + 262 + ], + "source": "Name: new entity\nType: TASK_OR_PROBLEM", + "target": "Name: entity resolution adjudicator\nType: PERSON" + }, + { + "src_entity_name": 
"new entity", + "tgt_entity_name": "candidate entities", + "relation_name": "", + "weight": 9.0, + "description": "the new entity is compared against the candidate entities to determine if they refer to the same concept", + "source_ids": [ + 262 + ], + "source": "Name: new entity\nType: TASK_OR_PROBLEM", + "target": "Name: candidate entities\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "new entity", + "tgt_entity_name": "text", + "relation_name": "", + "weight": 10.0, + "description": "the new entity is extracted from the text", + "source_ids": [ + 262 + ], + "source": "Name: new entity\nType: TASK_OR_PROBLEM", + "target": "Name: text\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "case b", + "tgt_entity_name": "existing entity", + "relation_name": "", + "weight": 10.0, + "description": "case b is defined by the scenario involving an existing entity", + "source_ids": [ + 73 + ], + "source": "Name: case b\nType: TASK_OR_PROBLEM", + "target": "Name: existing entity\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "case b", + "tgt_entity_name": "alias", + "relation_name": "", + "weight": 9.0, + "description": "case b specifically addresses the situation where an alias is being evaluated", + "source_ids": [ + 73 + ], + "source": "Name: case b\nType: TASK_OR_PROBLEM", + "target": "Name: alias\nType: CONCEPT" + }, + { + "src_entity_name": "gradient based er algorithm", + "tgt_entity_name": "case b", + "relation_name": "", + "weight": 10.0, + "description": "the gradient based er algorithm is designed to detect the sharp decline characteristic of case b", + "source_ids": [ + 74 + ], + "source": "Name: case b\nType: TASK_OR_PROBLEM", + "target": "Name: gradient based er algorithm\nType: TECHNOLOGY" + }, + { + "src_entity_name": "reranker", + "tgt_entity_name": "scores", + "relation_name": "", + "weight": 7.0, + "description": "the reranker s limitations affect the initial set of high relevance scores", + "source_ids": [ + 73 + ], + "source": "Name: 
reranker\nType: TECHNOLOGY", + "target": "Name: scores\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "alias", + "tgt_entity_name": "true match", + "relation_name": "", + "weight": 9.0, + "description": "the alias is evaluated for its relevance to the true match", + "source_ids": [ + 73 + ], + "source": "Name: alias\nType: CONCEPT", + "target": "Name: true match\nType: CONCEPT" + }, + { + "src_entity_name": "scores", + "tgt_entity_name": "true match", + "relation_name": "", + "weight": 8.0, + "description": "scores indicate the relevance of the alias to the true match", + "source_ids": [ + 73 + ], + "source": "Name: scores\nType: EVALUATION_METRIC", + "target": "Name: true match\nType: CONCEPT" + }, + { + "src_entity_name": "scores", + "tgt_entity_name": "equivalent aliases", + "relation_name": "", + "weight": 8.0, + "description": "scores show high relevance to the true match or a set of equivalent aliases", + "source_ids": [ + 73 + ], + "source": "Name: scores\nType: EVALUATION_METRIC", + "target": "Name: equivalent aliases\nType: CONCEPT" + }, + { + "src_entity_name": "scores", + "tgt_entity_name": "gradient", + "relation_name": "", + "weight": 8.0, + "description": "the scores exhibit a sharp decline gradient after the initial high relevance set", + "source_ids": [ + 73 + ], + "source": "Name: scores\nType: EVALUATION_METRIC", + "target": "Name: gradient\nType: MEASUREMENT" + }, + { + "src_entity_name": "gradient", + "tgt_entity_name": "irrelevant entities", + "relation_name": "", + "weight": 7.0, + "description": "the gradient precedes the transition to irrelevant entities", + "source_ids": [ + 73 + ], + "source": "Name: gradient\nType: MEASUREMENT", + "target": "Name: irrelevant entities\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "gradient based er algorithm", + "tgt_entity_name": "high relevance set", + "relation_name": "", + "weight": 10.0, + "description": "the gradient based er algorithm efficiently isolates the high relevance set", + 
"source_ids": [ + 74 + ], + "source": "Name: gradient based er algorithm\nType: TECHNOLOGY", + "target": "Name: high relevance set\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "high relevance set", + "tgt_entity_name": "similar entities", + "relation_name": "", + "weight": 8.0, + "description": "the similar entities are contained within the high relevance set identified by the algorithm", + "source_ids": [ + 74 + ], + "source": "Name: high relevance set\nType: DATASET_OR_CORPUS", + "target": "Name: similar entities\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "cref='#/texts/259'", + "tgt_entity_name": "score", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/259' related to Score", + "source_ids": [ + 178 + ], + "source": "Name: score\nType: PARAMETER_OR_VARIABLE", + "target": "Name: cref='#/texts/259'\nType: IMAGE" + }, + { + "src_entity_name": "e 9", + "tgt_entity_name": "e 7", + "relation_name": "", + "weight": 10.0, + "description": "e 9 shows high similarity with e 7 and is merged with it", + "source_ids": [ + 76 + ], + "source": "Name: e 9\nType: TASK_OR_PROBLEM", + "target": "Name: e 7\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "e 9", + "tgt_entity_name": "e 6", + "relation_name": "", + "weight": 7.0, + "description": "e 9 shows a sharp decline in similarity with e 6", + "source_ids": [ + 76 + ], + "source": "Name: e 9\nType: TASK_OR_PROBLEM", + "target": "Name: e 6\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "e 9", + "tgt_entity_name": "e 8", + "relation_name": "", + "weight": 7.0, + "description": "e 9 shows a sharp decline in similarity with e 8", + "source_ids": [ + 76 + ], + "source": "Name: e 9\nType: TASK_OR_PROBLEM", + "target": "Name: e 8\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "e 9", + "tgt_entity_name": "e 5", + "relation_name": "", + "weight": 7.0, + "description": "e 9 shows a sharp decline in similarity with e 5", + "source_ids": [ + 76 + ], + "source": 
"Name: e 9\nType: TASK_OR_PROBLEM", + "target": "Name: e 5\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "similarity curve", + "tgt_entity_name": "e 9", + "relation_name": "", + "weight": 8.0, + "description": "the similarity curve depicts the similarity of e 9 with other entities", + "source_ids": [ + 76 + ], + "source": "Name: e 9\nType: TASK_OR_PROBLEM", + "target": "Name: similarity curve\nType: IMAGE" + }, + { + "src_entity_name": "e 9", + "tgt_entity_name": "unique high confidence match", + "relation_name": "", + "weight": 9.0, + "description": "e 9 is the entity for which the unique high confidence match e 7 is identified", + "source_ids": [ + 76 + ], + "source": "Name: e 9\nType: TASK_OR_PROBLEM", + "target": "Name: unique high confidence match\nType: CONCEPT" + }, + { + "src_entity_name": "e 9", + "tgt_entity_name": "consolidated information", + "relation_name": "", + "weight": 8.0, + "description": "the merging of e 9 with e 7 enriches the kg with consolidated information", + "source_ids": [ + 76 + ], + "source": "Name: e 9\nType: TASK_OR_PROBLEM", + "target": "Name: consolidated information\nType: CONCEPT" + }, + { + "src_entity_name": "gradient based selection process", + "tgt_entity_name": "e 7", + "relation_name": "", + "weight": 9.0, + "description": "the gradient based selection process identifies e 7 as the match for e 9", + "source_ids": [ + 76 + ], + "source": "Name: e 7\nType: TASK_OR_PROBLEM", + "target": "Name: gradient based selection process\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "similarity curve", + "tgt_entity_name": "e 7", + "relation_name": "", + "weight": 8.0, + "description": "the similarity curve shows e 9 s high similarity with e 7", + "source_ids": [ + 76 + ], + "source": "Name: e 7\nType: TASK_OR_PROBLEM", + "target": "Name: similarity curve\nType: IMAGE" + }, + { + "src_entity_name": "similarity curve", + "tgt_entity_name": "orange line", + "relation_name": "", + "weight": 10.0, + "description": "the 
orange line is the visual representation of the similarity curve described in the text", + "source_ids": [ + 76 + ], + "source": "Name: similarity curve\nType: IMAGE", + "target": "Name: orange line\nType: IMAGE" + }, + { + "src_entity_name": "gradient based selection process", + "tgt_entity_name": "unique high confidence match", + "relation_name": "", + "weight": 10.0, + "description": "the gradient based selection process produces the unique high confidence match", + "source_ids": [ + 76 + ], + "source": "Name: gradient based selection process\nType: METHOD_OR_TECHNIQUE", + "target": "Name: unique high confidence match\nType: CONCEPT" + }, + { + "src_entity_name": "kg construction phase", + "tgt_entity_name": "origin tree node", + "relation_name": "", + "weight": 9.0, + "description": "during the kg construction phase origin tree nodes are recorded for newly extracted entities", + "source_ids": [ + 77 + ], + "source": "Name: kg construction phase\nType: TASK_OR_PROBLEM", + "target": "Name: origin tree node\nType: HARDWARE" + }, + { + "src_entity_name": "kg construction phase", + "tgt_entity_name": "v i", + "relation_name": "", + "weight": 9.0, + "description": "the kg construction phase records the origin tree node for every newly extracted entity v i", + "source_ids": [ + 77 + ], + "source": "Name: kg construction phase\nType: TASK_OR_PROBLEM", + "target": "Name: v i\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "origin tree node", + "tgt_entity_name": "v i", + "relation_name": "", + "weight": 9.0, + "description": "an origin tree node is recorded specifically for the entity v i", + "source_ids": [ + 77 + ], + "source": "Name: origin tree node\nType: HARDWARE", + "target": "Name: v i\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "origin tree node", + "tgt_entity_name": "v sel", + "relation_name": "", + "weight": 8.0, + "description": "the origin node set of v sel is updated to include nodes from v n", + "source_ids": [ + 77 + ], + 
"source": "Name: origin tree node\nType: HARDWARE", + "target": "Name: v sel\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "v sel", + "tgt_entity_name": "v n", + "relation_name": "", + "weight": 8.0, + "description": "v sel is the target entity that receives the origin nodes previously associated with v n", + "source_ids": [ + 77 + ], + "source": "Name: v n\nType: PARAMETER_OR_VARIABLE", + "target": "Name: v sel\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "agent-based query method", + "tgt_entity_name": "5 agent-based retrieval", + "relation_name": "", + "weight": 10.0, + "description": "The 'Agent-Based Query Method' is the primary technical contribution and topic detailed within section 5.", + "source_ids": [ + 78 + ], + "source": "Name: 5 agent-based retrieval\nType: SECTION_TITLE", + "target": "Name: agent-based query method\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "execution trace", + "tgt_entity_name": "agent based planning", + "relation_name": "", + "weight": 9.0, + "description": "the execution trace demonstrates agent based planning", + "source_ids": [ + 93 + ], + "source": "Name: agent based planning\nType: METHOD_OR_TECHNIQUE", + "target": "Name: execution trace\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "ift inspired selector reasoner workflow", + "tgt_entity_name": "agent based planning", + "relation_name": "", + "weight": 9.0, + "description": "the workflow includes agent based planning which classifies the query", + "source_ids": [ + 157 + ], + "source": "Name: agent based planning\nType: METHOD_OR_TECHNIQUE", + "target": "Name: ift inspired selector reasoner workflow\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "agent based planning", + "tgt_entity_name": "static standard workflow", + "relation_name": "", + "weight": 9.0, + "description": "removing agent based planning results in the system defaulting to a static standard workflow", + "source_ids": [ + 166 + ], + "source": "Name: 
agent based planning\nType: METHOD_OR_TECHNIQUE", + "target": "Name: static standard workflow\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "structured execution", + "tgt_entity_name": "generation", + "relation_name": "", + "weight": 9.0, + "description": "structured execution includes the generation process as part of its workflow", + "source_ids": [ + 79 + ], + "source": "Name: structured execution\nType: METHOD_OR_TECHNIQUE", + "target": "Name: generation\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "real world document queries", + "tgt_entity_name": "modal type filtering", + "relation_name": "", + "weight": 8.0, + "description": "real world document queries necessitate operations like modal type filtering", + "source_ids": [ + 79 + ], + "source": "Name: modal type filtering\nType: METHOD_OR_TECHNIQUE", + "target": "Name: real world document queries\nType: UNKNOWN" + }, + { + "src_entity_name": "real world document queries", + "tgt_entity_name": "semantic selection", + "relation_name": "", + "weight": 8.0, + "description": "real world document queries necessitate operations like semantic selection", + "source_ids": [ + 79 + ], + "source": "Name: semantic selection\nType: METHOD_OR_TECHNIQUE", + "target": "Name: real world document queries\nType: UNKNOWN" + }, + { + "src_entity_name": "real world document queries", + "tgt_entity_name": "multi hop reasoning", + "relation_name": "", + "weight": 8.0, + "description": "real world document queries necessitate operations like multi hop reasoning", + "source_ids": [ + 79 + ], + "source": "Name: multi hop reasoning\nType: METHOD_OR_TECHNIQUE", + "target": "Name: real world document queries\nType: UNKNOWN" + }, + { + "src_entity_name": "figure 3", + "tgt_entity_name": "agent based retrieval", + "relation_name": "", + "weight": 10.0, + "description": "figure 3 illustrates the general workflow of agent based retrieval", + "source_ids": [ + 83 + ], + "source": "Name: figure 3\nType: IMAGE", + "target": 
"Name: agent based retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "figure 3", + "tgt_entity_name": "workflow", + "relation_name": "", + "weight": 10.0, + "description": "figure 3 depicts the general workflow", + "source_ids": [ + 83 + ], + "source": "Name: figure 3\nType: IMAGE", + "target": "Name: workflow\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent based planning", + "tgt_entity_name": "classification plan", + "relation_name": "", + "weight": 9.0, + "description": "agent based planning includes the classification plan stage to distinguish query types", + "source_ids": [ + 82 + ], + "source": "Name: agent based planning\nType: TASK_OR_PROBLEM", + "target": "Name: classification plan\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "agent based planning", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 8.0, + "description": "agent based planning uses a predefined set of operators designed for the bookindex to generate plans", + "source_ids": [ + 82 + ], + "source": "Name: agent based planning\nType: TASK_OR_PROBLEM", + "target": "Name: bookindex\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "agent based planning", + "relation_name": "", + "weight": 9.0, + "description": "agent based retrieval contains the agent based planning process", + "source_ids": [ + 83 + ], + "source": "Name: agent based planning\nType: TASK_OR_PROBLEM", + "target": "Name: agent based retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "classification plan", + "tgt_entity_name": "transformer", + "relation_name": "", + "weight": 7.0, + "description": "the classification plan stage uses a query comparing transformer and rnns as an example", + "source_ids": [ + 82 + ], + "source": "Name: classification plan\nType: METHOD_OR_TECHNIQUE", + "target": "Name: transformer\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "classification plan", + 
"tgt_entity_name": "rnns", + "relation_name": "", + "weight": 7.0, + "description": "the classification plan stage uses a query comparing transformer and rnns as an example", + "source_ids": [ + 82 + ], + "source": "Name: classification plan\nType: METHOD_OR_TECHNIQUE", + "target": "Name: rnns\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "operators plan", + "relation_name": "", + "weight": 7.0, + "description": "the operators plan is designed for the bookindex", + "source_ids": [ + 82 + ], + "source": "Name: bookindex\nType: DATASET_OR_CORPUS", + "target": "Name: operators plan\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "retrieval process", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "the retrieval process navigates the bookindex to find information", + "source_ids": [ + 85 + ], + "source": "Name: bookindex\nType: DATASET_OR_CORPUS", + "target": "Name: retrieval process\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "scent filter based retrieval", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "scent filter based retrieval utilizes the bookindex to find information", + "source_ids": [ + 85 + ], + "source": "Name: bookindex\nType: DATASET_OR_CORPUS", + "target": "Name: scent filter based retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "t", + "relation_name": "", + "weight": 10.0, + "description": "t is a defined component within the bookindex structure", + "source_ids": [ + 88 + ], + "source": "Name: bookindex\nType: DATASET_OR_CORPUS", + "target": "Name: t\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "relevant entities", + "relation_name": "", + "weight": 8.0, + "description": "relevant entities are contained within the g component of the bookindex", + "source_ids": [ + 85 + ], + "source": "Name: 
bookindex\nType: DATASET_OR_CORPUS", + "target": "Name: relevant entities\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "operators plan", + "tgt_entity_name": "retrieval", + "relation_name": "", + "weight": 8.0, + "description": "the operators plan guides the retrieval strategy", + "source_ids": [ + 82 + ], + "source": "Name: operators plan\nType: TASK_OR_PROBLEM", + "target": "Name: retrieval\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "operators plan", + "tgt_entity_name": "generation", + "relation_name": "", + "weight": 8.0, + "description": "the operators plan guides the generation strategy", + "source_ids": [ + 82 + ], + "source": "Name: operators plan\nType: TASK_OR_PROBLEM", + "target": "Name: generation\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "retrieval", + "relation_name": "", + "weight": 9.0, + "description": "agent based retrieval contains the retrieval process", + "source_ids": [ + 83 + ], + "source": "Name: retrieval\nType: TASK_OR_PROBLEM", + "target": "Name: agent based retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "layout vanilla", + "tgt_entity_name": "retrieval", + "relation_name": "", + "weight": 8.0, + "description": "layout vanilla preserves essential structural information for better retrieval", + "source_ids": [ + 152 + ], + "source": "Name: retrieval\nType: TASK_OR_PROBLEM", + "target": "Name: layout vanilla\nType: PRODUCT" + }, + { + "src_entity_name": "retrieval error", + "tgt_entity_name": "retrieval", + "relation_name": "", + "weight": 10.0, + "description": "retrieval error is the dominant failure mode associated with the retrieval task", + "source_ids": [ + 185 + ], + "source": "Name: retrieval\nType: TASK_OR_PROBLEM", + "target": "Name: retrieval error\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "generation", + "relation_name": "", + "weight": 9.0, + "description": "agent based 
retrieval contains the generation process", + "source_ids": [ + 83 + ], + "source": "Name: generation\nType: TASK_OR_PROBLEM", + "target": "Name: agent based retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "standard reasoning", + "tgt_entity_name": "generation", + "relation_name": "", + "weight": 8.0, + "description": "standard reasoning and generation are linked processes denoted as p std", + "source_ids": [ + 115 + ], + "source": "Name: generation\nType: TASK_OR_PROBLEM", + "target": "Name: standard reasoning\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "scent based", + "tgt_entity_name": "generation", + "relation_name": "", + "weight": 7.0, + "description": "the scent based path proceeds to generation as part of p std", + "source_ids": [ + 115 + ], + "source": "Name: generation\nType: TASK_OR_PROBLEM", + "target": "Name: scent based\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "section based", + "tgt_entity_name": "generation", + "relation_name": "", + "weight": 7.0, + "description": "the section based path proceeds to generation as part of p std", + "source_ids": [ + 115 + ], + "source": "Name: generation\nType: TASK_OR_PROBLEM", + "target": "Name: section based\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "generation", + "tgt_entity_name": "5.3 structured execution", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Generation' is a primary topic of section 5.3.", + "source_ids": [ + 123 + ], + "source": "Name: generation\nType: TASK_OR_PROBLEM", + "target": "Name: 5.3 structured execution\nType: SECTION_TITLE" + }, + { + "src_entity_name": "generation error", + "tgt_entity_name": "generation", + "relation_name": "", + "weight": 10.0, + "description": "generation error is the second most common failure mode associated with the generation task", + "source_ids": [ + 185 + ], + "source": "Name: generation\nType: TASK_OR_PROBLEM", + "target": "Name: generation error\nType: 
TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "workflow", + "relation_name": "", + "weight": 9.0, + "description": "agent based retrieval is described as a general workflow", + "source_ids": [ + 83 + ], + "source": "Name: agent based retrieval\nType: METHOD_OR_TECHNIQUE", + "target": "Name: workflow\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "planning", + "relation_name": "", + "weight": 9.0, + "description": "agent based retrieval includes planning as a process", + "source_ids": [ + 83 + ], + "source": "Name: agent based retrieval\nType: METHOD_OR_TECHNIQUE", + "target": "Name: planning\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "generation processes", + "relation_name": "", + "weight": 9.0, + "description": "agent based retrieval includes generation processes as a component", + "source_ids": [ + 83 + ], + "source": "Name: agent based retrieval\nType: METHOD_OR_TECHNIQUE", + "target": "Name: generation processes\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "planning", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Planning", + "source_ids": [ + 94 + ], + "source": "Name: planning\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "cref='#/texts/89'", + "tgt_entity_name": "question", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/89' related to Question", + "source_ids": [ + 84 + ], + "source": "Name: cref='#/texts/89'\nType: IMAGE", + "target": "Name: question\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "cref='#/texts/89'", + "tgt_entity_name": "agent-based planning", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/89' related to Agent-based 
Planning", + "source_ids": [ + 84 + ], + "source": "Name: cref='#/texts/89'\nType: IMAGE", + "target": "Name: agent-based planning\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "cref='#/texts/89'", + "tgt_entity_name": "retrieval process", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/89' related to Retrieval Process", + "source_ids": [ + 84 + ], + "source": "Name: cref='#/texts/89'\nType: IMAGE", + "target": "Name: retrieval process\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "cref='#/texts/89'", + "tgt_entity_name": "generation process", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/89' related to Generation Process", + "source_ids": [ + 84 + ], + "source": "Name: cref='#/texts/89'\nType: IMAGE", + "target": "Name: generation process\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "question", + "relation_name": "", + "weight": 10.0, + "description": "the single hop task is defined by the ability to answer a question", + "source_ids": [ + 243 + ], + "source": "Name: question\nType: TASK_OR_PROBLEM", + "target": "Name: single hop\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent-based planning", + "tgt_entity_name": "5.2 agent-based planning", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Agent-based Planning' is the primary topic and subject matter of section 5.2.", + "source_ids": [ + 87 + ], + "source": "Name: agent-based planning\nType: METHOD_OR_TECHNIQUE", + "target": "Name: 5.2 agent-based planning\nType: SECTION_TITLE" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "agent-based planning", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Agent-based Planning", + "source_ids": [ + 182 + ], + "source": "Name: agent-based planning\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image 
cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "retrieval process", + "tgt_entity_name": "5.3 structured execution", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Retrieval Process' is a primary topic of section 5.3.", + "source_ids": [ + 123 + ], + "source": "Name: retrieval process\nType: METHOD_OR_TECHNIQUE", + "target": "Name: 5.3 structured execution\nType: SECTION_TITLE" + }, + { + "src_entity_name": "retrieval process", + "tgt_entity_name": "scent filter based retrieval", + "relation_name": "", + "weight": 10.0, + "description": "the retrieval process executes the scent filter based retrieval method", + "source_ids": [ + 85 + ], + "source": "Name: retrieval process\nType: TASK_OR_PROBLEM", + "target": "Name: scent filter based retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "retrieval process", + "tgt_entity_name": "operator plan", + "relation_name": "", + "weight": 10.0, + "description": "the retrieval process is guided by the operator plan", + "source_ids": [ + 85 + ], + "source": "Name: retrieval process\nType: TASK_OR_PROBLEM", + "target": "Name: operator plan\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "scent filter based retrieval", + "tgt_entity_name": "modal type", + "relation_name": "", + "weight": 9.0, + "description": "scent filter based retrieval employs modal type as a filter to refine selection", + "source_ids": [ + 85 + ], + "source": "Name: scent filter based retrieval\nType: METHOD_OR_TECHNIQUE", + "target": "Name: modal type\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "scent filter based retrieval", + "tgt_entity_name": "relevant entities", + "relation_name": "", + "weight": 9.0, + "description": "scent based retrieval follows relevant entities in g to find information", + "source_ids": [ + 85 + ], + "source": "Name: scent filter based retrieval\nType: METHOD_OR_TECHNIQUE", + "target": "Name: relevant entities\nType: DATASET_OR_CORPUS" + }, + { + 
"src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "operator plan", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Operator Plan", + "source_ids": [ + 94 + ], + "source": "Name: operator plan\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "operator plan", + "relation_name": "", + "weight": 10.0, + "description": "the agent s final task is to generate the operator plan", + "source_ids": [ + 112 + ], + "source": "Name: operator plan\nType: TASK_OR_PROBLEM", + "target": "Name: agent\nType: PERSON" + }, + { + "src_entity_name": "generation process", + "tgt_entity_name": "analysis merging", + "relation_name": "", + "weight": 9.0, + "description": "analysis merging is a sub stage or activity performed within the generation process", + "source_ids": [ + 86 + ], + "source": "Name: generation process\nType: TASK_OR_PROBLEM", + "target": "Name: analysis merging\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "retrieved information", + "tgt_entity_name": "generation process", + "relation_name": "", + "weight": 10.0, + "description": "retrieved information enters the generation process as its primary input", + "source_ids": [ + 86 + ], + "source": "Name: generation process\nType: TASK_OR_PROBLEM", + "target": "Name: retrieved information\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "generation process", + "tgt_entity_name": "coherent response", + "relation_name": "", + "weight": 10.0, + "description": "the generation process formulates the coherent response as its final output", + "source_ids": [ + 86 + ], + "source": "Name: generation process\nType: TASK_OR_PROBLEM", + "target": "Name: coherent response\nType: PRODUCT" + }, + { + "src_entity_name": "fragmented pieces of evidence", + "tgt_entity_name": "analysis merging", + "relation_name": "", + "weight": 9.0, + "description": "analysis 
merging synthesizes the fragmented pieces of evidence", + "source_ids": [ + 86 + ], + "source": "Name: analysis merging\nType: TASK_OR_PROBLEM", + "target": "Name: fragmented pieces of evidence\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "analysis merging", + "tgt_entity_name": "coherent response", + "relation_name": "", + "weight": 8.0, + "description": "analysis merging contributes to the formulation of the coherent response through final analysis", + "source_ids": [ + 86 + ], + "source": "Name: analysis merging\nType: TASK_OR_PROBLEM", + "target": "Name: coherent response\nType: PRODUCT" + }, + { + "src_entity_name": "formulator", + "tgt_entity_name": "execution pipelines", + "relation_name": "", + "weight": 9.0, + "description": "the formulator operator is combined to form tailored execution pipelines", + "source_ids": [ + 88 + ], + "source": "Name: formulator\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: execution pipelines\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag operator library", + "tgt_entity_name": "formulator", + "relation_name": "", + "weight": 10.0, + "description": "the bookrag operator library contains the formulator operator type", + "source_ids": [ + 93 + ], + "source": "Name: formulator\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: bookrag operator library\nType: SOFTWARE" + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "execution pipelines", + "relation_name": "", + "weight": 9.0, + "description": "the selector operator is combined to form tailored execution pipelines", + "source_ids": [ + 88 + ], + "source": "Name: selector\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: execution pipelines\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag operator library", + "tgt_entity_name": "selector", + "relation_name": "", + "weight": 10.0, + "description": "the bookrag operator library contains the selector operator type", + "source_ids": [ + 93 + ], + "source": "Name: 
selector\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: bookrag operator library\nType: SOFTWARE" + }, + { + "src_entity_name": "reasoner", + "tgt_entity_name": "execution pipelines", + "relation_name": "", + "weight": 9.0, + "description": "the reasoner operator is combined to form tailored execution pipelines", + "source_ids": [ + 88 + ], + "source": "Name: reasoner\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: execution pipelines\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag operator library", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 10.0, + "description": "the bookrag operator library contains the reasoner operator type", + "source_ids": [ + 93 + ], + "source": "Name: reasoner\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: bookrag operator library\nType: SOFTWARE" + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "execution pipelines", + "relation_name": "", + "weight": 9.0, + "description": "the synthesizer operator is combined to form tailored execution pipelines", + "source_ids": [ + 88 + ], + "source": "Name: synthesizer\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: execution pipelines\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag operator library", + "tgt_entity_name": "synthesizer", + "relation_name": "", + "weight": 10.0, + "description": "the bookrag operator library contains the synthesizer operator type", + "source_ids": [ + 93 + ], + "source": "Name: synthesizer\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: bookrag operator library\nType: SOFTWARE" + }, + { + "src_entity_name": "bookindex operators", + "tgt_entity_name": "agent", + "relation_name": "", + "weight": 9.0, + "description": "the agent employs the bookindex operators for diverse query categories", + "source_ids": [ + 97 + ], + "source": "Name: agent\nType: TASK_OR_PROBLEM", + "target": "Name: bookindex operators\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": 
"query categories", + "relation_name": "", + "weight": 8.0, + "description": "the agent employs operators for diverse query categories", + "source_ids": [ + 97 + ], + "source": "Name: agent\nType: TASK_OR_PROBLEM", + "target": "Name: query categories\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "execution pipelines", + "tgt_entity_name": "adjustable parameters", + "relation_name": "", + "weight": 8.0, + "description": "execution pipelines are created with adjustable parameters", + "source_ids": [ + 88 + ], + "source": "Name: execution pipelines\nType: TASK_OR_PROBLEM", + "target": "Name: adjustable parameters\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "acm computing surveys", + "tgt_entity_name": "6", + "relation_name": "", + "weight": 9.0, + "description": "the paper was published in issue 6 of acm computing surveys", + "source_ids": [ + 202 + ], + "source": "Name: 6\nType: MEASUREMENT", + "target": "Name: acm computing surveys\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "figure 4", + "tgt_entity_name": "bookrag operator library", + "relation_name": "", + "weight": 10.0, + "description": "figure 4 visually depicts the bookrag operator library", + "source_ids": [ + 93 + ], + "source": "Name: figure 4\nType: IMAGE", + "target": "Name: bookrag operator library\nType: SOFTWARE" + }, + { + "src_entity_name": "figure 4", + "tgt_entity_name": "mmlongbench dataset", + "relation_name": "", + "weight": 9.0, + "description": "figure 4 shows an execution example derived from the mmlongbench dataset", + "source_ids": [ + 93 + ], + "source": "Name: figure 4\nType: IMAGE", + "target": "Name: mmlongbench dataset\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "figure 4", + "tgt_entity_name": "single hop", + "relation_name": "", + "weight": 9.0, + "description": "figure 4 demonstrates an execution trace for a single hop query", + "source_ids": [ + 93 + ], + "source": "Name: figure 4\nType: IMAGE", + "target": "Name: single hop\nType: 
TASK_OR_PROBLEM" + }, + { + "src_entity_name": "figure 4", + "tgt_entity_name": "execution trace", + "relation_name": "", + "weight": 9.0, + "description": "figure 4 contains an execution trace for a single hop query", + "source_ids": [ + 93 + ], + "source": "Name: figure 4\nType: IMAGE", + "target": "Name: execution trace\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookindex operators", + "tgt_entity_name": "figure 4", + "relation_name": "", + "weight": 7.0, + "description": "bookindex operators are visually depicted in figure 4", + "source_ids": [ + 97 + ], + "source": "Name: figure 4\nType: IMAGE", + "target": "Name: bookindex operators\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookrag operator library", + "tgt_entity_name": "operator", + "relation_name": "", + "weight": 8.0, + "description": "the bookrag operator library is composed of specific operators", + "source_ids": [ + 93 + ], + "source": "Name: bookrag operator library\nType: SOFTWARE", + "target": "Name: operator\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "execution trace", + "relation_name": "", + "weight": 10.0, + "description": "the execution trace is specifically for a single hop query", + "source_ids": [ + 93 + ], + "source": "Name: single hop\nType: TASK_OR_PROBLEM", + "target": "Name: execution trace\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "single hop", + "relation_name": "", + "weight": 9.0, + "description": "the agent performs the single hop task by attempting to extract an entity", + "source_ids": [ + 115 + ], + "source": "Name: single hop\nType: TASK_OR_PROBLEM", + "target": "Name: agent\nType: PERSON" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "entity", + "relation_name": "", + "weight": 8.0, + "description": "the single hop task involves the extraction of an entity", + "source_ids": [ + 115 + ], + "source": "Name: single hop\nType: TASK_OR_PROBLEM", 
+ "target": "Name: entity\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "figure 4 b", + "tgt_entity_name": "single hop", + "relation_name": "", + "weight": 10.0, + "description": "figure 4 b presents the execution trace for the single hop query", + "source_ids": [ + 135 + ], + "source": "Name: single hop\nType: TASK_OR_PROBLEM", + "target": "Name: figure 4 b\nType: IMAGE" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "car", + "relation_name": "", + "weight": 9.0, + "description": "the single hop query asks about the type of car in the example", + "source_ids": [ + 135 + ], + "source": "Name: single hop\nType: TASK_OR_PROBLEM", + "target": "Name: car\nType: PRODUCT" + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "single hop", + "relation_name": "", + "weight": 9.0, + "description": "figure 7 displays the performance breakdown for the single hop query type", + "source_ids": [ + 177 + ], + "source": "Name: single hop\nType: TASK_OR_PROBLEM", + "target": "Name: figure 7\nType: IMAGE" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "multi hop", + "relation_name": "", + "weight": 5.0, + "description": "both are listed as distinct query types in the performance breakdown", + "source_ids": [ + 177 + ], + "source": "Name: single hop\nType: TASK_OR_PROBLEM", + "target": "Name: multi hop\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "global", + "relation_name": "", + "weight": 5.0, + "description": "both are listed as distinct query types in the performance breakdown", + "source_ids": [ + 177 + ], + "source": "Name: single hop\nType: TASK_OR_PROBLEM", + "target": "Name: global\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent based planning strategy", + "tgt_entity_name": "single hop", + "relation_name": "", + "weight": 8.0, + "description": "the agent based planning strategy handles single hop queries separately", + "source_ids": [ + 179 + ], + "source": "Name: 
single hop\nType: TASK_OR_PROBLEM", + "target": "Name: agent based planning strategy\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "134", + "relation_name": "", + "weight": 8.0, + "description": "the single hop case starts with a reasoning space of 134 nodes", + "source_ids": [ + 186 + ], + "source": "Name: single hop\nType: TASK_OR_PROBLEM", + "target": "Name: 134\nType: MEASUREMENT" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "24", + "relation_name": "", + "weight": 8.0, + "description": "the single hop case reduces the reasoning space to 24 nodes", + "source_ids": [ + 186 + ], + "source": "Name: single hop\nType: TASK_OR_PROBLEM", + "target": "Name: 24\nType: MEASUREMENT" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "single", + "relation_name": "", + "weight": 9.0, + "description": "the single hop task requires retrieving information from a single location", + "source_ids": [ + 243 + ], + "source": "Name: single hop\nType: TASK_OR_PROBLEM", + "target": "Name: single\nType: UNKNOWN" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "contiguous location", + "relation_name": "", + "weight": 8.0, + "description": "the single hop task involves information found in a contiguous location", + "source_ids": [ + 243 + ], + "source": "Name: single hop\nType: TASK_OR_PROBLEM", + "target": "Name: contiguous location\nType: UNKNOWN" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 7.0, + "description": "the single hop task is defined within the context of a document", + "source_ids": [ + 243 + ], + "source": "Name: single hop\nType: TASK_OR_PROBLEM", + "target": "Name: document\nType: CONCEPT" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "information", + "relation_name": "", + "weight": 10.0, + "description": "the single hop task involves retrieving information", + "source_ids": [ + 243 + ], + 
"source": "Name: single hop\nType: TASK_OR_PROBLEM", + "target": "Name: information\nType: CONCEPT" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "paragraph", + "relation_name": "", + "weight": 9.0, + "description": "a single hop question can be answered by retrieving information from a paragraph", + "source_ids": [ + 243 + ], + "source": "Name: single hop\nType: TASK_OR_PROBLEM", + "target": "Name: paragraph\nType: SECTION_TITLE" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "table", + "relation_name": "", + "weight": 9.0, + "description": "a single hop question can be answered by retrieving information from a table", + "source_ids": [ + 243 + ], + "source": "Name: single hop\nType: TASK_OR_PROBLEM", + "target": "Name: table\nType: SECTION_TITLE" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "figure", + "relation_name": "", + "weight": 9.0, + "description": "a single hop question can be answered by retrieving information from a figure", + "source_ids": [ + 243 + ], + "source": "Name: single hop\nType: TASK_OR_PROBLEM", + "target": "Name: figure\nType: SECTION_TITLE" + }, + { + "src_entity_name": "execution trace", + "tgt_entity_name": "step by step operator execution", + "relation_name": "", + "weight": 9.0, + "description": "the execution trace demonstrates step by step operator execution", + "source_ids": [ + 93 + ], + "source": "Name: execution trace\nType: TASK_OR_PROBLEM", + "target": "Name: step by step operator execution\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "operator-set", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Operator-Set", + "source_ids": [ + 94 + ], + "source": "Name: operator-set\nType: IMAGE", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "extract", + "relation_name": "", 
+ "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Extract", + "source_ids": [ + 94 + ], + "source": "Name: extract\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "decompose", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Decompose", + "source_ids": [ + 94 + ], + "source": "Name: decompose\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "formulator", + "tgt_entity_name": "decompose", + "relation_name": "", + "weight": 9.0, + "description": "decompose is included as a category within the formulator operators", + "source_ids": [ + 98 + ], + "source": "Name: decompose\nType: METHOD_OR_TECHNIQUE", + "target": "Name: formulator\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "decompose", + "tgt_entity_name": "qs", + "relation_name": "", + "weight": 10.0, + "description": "decompose generates the set of sub queries qs", + "source_ids": [ + 98 + ], + "source": "Name: decompose\nType: METHOD_OR_TECHNIQUE", + "target": "Name: qs\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "decompose", + "tgt_entity_name": "sub queries", + "relation_name": "", + "weight": 10.0, + "description": "decompose produces sub queries as its output", + "source_ids": [ + 98 + ], + "source": "Name: decompose\nType: METHOD_OR_TECHNIQUE", + "target": "Name: sub queries\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "entities", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Entities", + "source_ids": [ + 94 + ], + "source": "Name: entities\nType: DATASET_OR_CORPUS", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": 
"sub-queries", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Sub-queries", + "source_ids": [ + 94 + ], + "source": "Name: sub-queries\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "formulator", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Formulator", + "source_ids": [ + 94 + ], + "source": "Name: formulator\nType: SYSTEM_COMPONENT", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "filter", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Filter", + "source_ids": [ + 94 + ], + "source": "Name: filter\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "select", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Select", + "source_ids": [ + 94 + ], + "source": "Name: select\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "selector", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Selector", + "source_ids": [ + 94 + ], + "source": "Name: selector\nType: SYSTEM_COMPONENT", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "reason", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Reason", + "source_ids": [ + 94 + ], + "source": "Name: reason\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/98'\nType: 
UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "graph", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Graph", + "source_ids": [ + 94 + ], + "source": "Name: graph\nType: DATA_STRUCTURE", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "text", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Text", + "source_ids": [ + 94 + ], + "source": "Name: text\nType: DATA_STRUCTURE", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "s:", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to S:", + "source_ids": [ + 94 + ], + "source": "Name: s:\nType: PARAMETER_OR_VARIABLE", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "skyline", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Skyline", + "source_ids": [ + 94 + ], + "source": "Name: skyline\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Reasoner", + "source_ids": [ + 94 + ], + "source": "Name: reasoner\nType: SYSTEM_COMPONENT", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "map", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Map", + "source_ids": [ + 94 + ], + "source": "Name: map\nType: TASK_OR_PROBLEM", + "target": "Name: 
image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "map", + "relation_name": "", + "weight": 8.0, + "description": "map is a specific type of operator within the broader category of synthesizer operators responsible for content generation", + "source_ids": [ + 111 + ], + "source": "Name: map\nType: TASK_OR_PROBLEM", + "target": "Name: synthesizer\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "map", + "tgt_entity_name": "reduce", + "relation_name": "", + "weight": 9.0, + "description": "map and reduce are sequential or related steps in the process of generating a final coherent answer with map generating partial responses and reduce aggregating them", + "source_ids": [ + 111 + ], + "source": "Name: map\nType: TASK_OR_PROBLEM", + "target": "Name: reduce\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "map", + "tgt_entity_name": "analysis", + "relation_name": "", + "weight": 10.0, + "description": "the map operator performs the task of analysis on retrieved information segments", + "source_ids": [ + 111 + ], + "source": "Name: map\nType: TASK_OR_PROBLEM", + "target": "Name: analysis\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "map", + "tgt_entity_name": "partial responses", + "relation_name": "", + "weight": 10.0, + "description": "map generates partial responses as its output", + "source_ids": [ + 111 + ], + "source": "Name: map\nType: TASK_OR_PROBLEM", + "target": "Name: partial responses\nType: PRODUCT" + }, + { + "src_entity_name": "map", + "tgt_entity_name": "retrieved information segments", + "relation_name": "", + "weight": 10.0, + "description": "map performs analysis specifically on retrieved information segments", + "source_ids": [ + 111 + ], + "source": "Name: map\nType: TASK_OR_PROBLEM", + "target": "Name: retrieved information segments\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "reduce", + "relation_name": "", + 
"weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Reduce", + "source_ids": [ + 94 + ], + "source": "Name: reduce\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "reduce", + "relation_name": "", + "weight": 8.0, + "description": "reduce is a specific type of operator within the broader category of synthesizer operators responsible for content generation", + "source_ids": [ + 111 + ], + "source": "Name: reduce\nType: TASK_OR_PROBLEM", + "target": "Name: synthesizer\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "reduce", + "tgt_entity_name": "final coherent answer", + "relation_name": "", + "weight": 10.0, + "description": "reduce synthesizes a final coherent answer as its output", + "source_ids": [ + 111 + ], + "source": "Name: reduce\nType: TASK_OR_PROBLEM", + "target": "Name: final coherent answer\nType: PRODUCT" + }, + { + "src_entity_name": "reduce", + "tgt_entity_name": "multiple sources", + "relation_name": "", + "weight": 10.0, + "description": "reduce aggregates information from multiple sources to create its output", + "source_ids": [ + 111 + ], + "source": "Name: reduce\nType: TASK_OR_PROBLEM", + "target": "Name: multiple sources\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "reduce", + "tgt_entity_name": "partial answers", + "relation_name": "", + "weight": 9.0, + "description": "reduce aggregates partial answers as part of its synthesis process", + "source_ids": [ + 111 + ], + "source": "Name: reduce\nType: TASK_OR_PROBLEM", + "target": "Name: partial answers\nType: PRODUCT" + }, + { + "src_entity_name": "reduce", + "tgt_entity_name": "retrieved evidence", + "relation_name": "", + "weight": 9.0, + "description": "reduce aggregates retrieved evidence as part of its synthesis process", + "source_ids": [ + 111 + ], + "source": "Name: reduce\nType: TASK_OR_PROBLEM", + "target": "Name: retrieved evidence\nType: 
DATASET_OR_CORPUS" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "synthesizer", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Synthesizer", + "source_ids": [ + 94 + ], + "source": "Name: synthesizer\nType: SYSTEM_COMPONENT", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "execution example", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Execution example", + "source_ids": [ + 94 + ], + "source": "Name: execution example\nType: SECTION_TITLE", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "q: what is the type of car in the ranking prompt example?", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Q: What is the type of car in the Ranking Prompt example?", + "source_ids": [ + 94 + ], + "source": "Name: q: what is the type of car in the ranking prompt example?\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "simple query...", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Simple query...", + "source_ids": [ + 94 + ], + "source": "Name: simple query...\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "car", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Car", + "source_ids": [ + 94 + ], + "source": "Name: car\nType: PRODUCT", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "car", + "tgt_entity_name": 
"ranking prompt example", + "relation_name": "", + "weight": 8.0, + "description": "the car is the subject of the query within the ranking prompt example context", + "source_ids": [ + 135 + ], + "source": "Name: car\nType: PRODUCT", + "target": "Name: ranking prompt example\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "extract", + "tgt_entity_name": "car", + "relation_name": "", + "weight": 9.0, + "description": "the extract method is used to identify the entity car", + "source_ids": [ + 135 + ], + "source": "Name: car\nType: PRODUCT", + "target": "Name: extract\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "select by entity", + "tgt_entity_name": "car", + "relation_name": "", + "weight": 8.0, + "description": "the select by entity method retrieves nodes related to the identified entity car", + "source_ids": [ + 135 + ], + "source": "Name: car\nType: PRODUCT", + "target": "Name: select by entity\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "skyline filtering", + "tgt_entity_name": "car", + "relation_name": "", + "weight": 7.0, + "description": "the skyline filtering technique refines the nodes related to car", + "source_ids": [ + 135 + ], + "source": "Name: car\nType: PRODUCT", + "target": "Name: skyline filtering\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "reduce", + "tgt_entity_name": "car", + "relation_name": "", + "weight": 7.0, + "description": "the reduce method synthesizes the answer regarding the car", + "source_ids": [ + 135 + ], + "source": "Name: car\nType: PRODUCT", + "target": "Name: reduce\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "ranking prompt", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Ranking Prompt", + "source_ids": [ + 94 + ], + "source": "Name: ranking prompt\nType: BOOK", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image 
cref='#/texts/98'", + "tgt_entity_name": "method", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Method", + "source_ids": [ + 94 + ], + "source": "Name: method\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "method and its descendants", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Method and its Descendants", + "source_ids": [ + 94 + ], + "source": "Name: method and its descendants\nType: SECTION_TITLE", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "a: based on the provided information...", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to A: Based on the provided information...", + "source_ids": [ + 94 + ], + "source": "Name: a: based on the provided information...\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "mercedes-benz e-class sedan", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Mercedes-Benz E-Class Sedan", + "source_ids": [ + 94 + ], + "source": "Name: mercedes-benz e-class sedan\nType: VEHICLE", + "target": "Name: image cref='#/texts/98'\nType: UNKNOWN" + }, + { + "src_entity_name": "query classification", + "tgt_entity_name": "operator plan", + "relation_name": "", + "weight": 9.0, + "description": "query classification is performed to generate a specific operator plan", + "source_ids": [ + 95 + ], + "source": "Name: query classification\nType: TASK_OR_PROBLEM", + "target": "Name: operator plan\nType: PRODUCT" + }, + { + "src_entity_name": "query classification", + "tgt_entity_name": "single hop", + 
"relation_name": "", + "weight": 9.0, + "description": "query classification defines single hop as one of its three representative categories", + "source_ids": [ + 96 + ], + "source": "Name: query classification\nType: TASK_OR_PROBLEM", + "target": "Name: single hop\nType: EVENT" + }, + { + "src_entity_name": "query classification", + "tgt_entity_name": "multi hop", + "relation_name": "", + "weight": 9.0, + "description": "query classification defines multi hop as one of its three representative categories", + "source_ids": [ + 96 + ], + "source": "Name: query classification\nType: TASK_OR_PROBLEM", + "target": "Name: multi hop\nType: EVENT" + }, + { + "src_entity_name": "query classification", + "tgt_entity_name": "global aggregation", + "relation_name": "", + "weight": 9.0, + "description": "query classification defines global aggregation as one of its three representative categories", + "source_ids": [ + 96 + ], + "source": "Name: query classification\nType: TASK_OR_PROBLEM", + "target": "Name: global aggregation\nType: EVENT" + }, + { + "src_entity_name": "query classification", + "tgt_entity_name": "agent strategy selection", + "relation_name": "", + "weight": 9.0, + "description": "query classification enables agent strategy selection", + "source_ids": [ + 96 + ], + "source": "Name: query classification\nType: TASK_OR_PROBLEM", + "target": "Name: agent strategy selection\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "query classification", + "tgt_entity_name": "solution strategy", + "relation_name": "", + "weight": 8.0, + "description": "each category defined by query classification requires a different solution strategy", + "source_ids": [ + 96 + ], + "source": "Name: query classification\nType: TASK_OR_PROBLEM", + "target": "Name: solution strategy\nType: CONCEPT" + }, + { + "src_entity_name": "figure 10", + "tgt_entity_name": "query classification", + "relation_name": "", + "weight": 10.0, + "description": "figure 10 contains the prompt 
specifically used for query classification", + "source_ids": [ + 253 + ], + "source": "Name: query classification\nType: TASK_OR_PROBLEM", + "target": "Name: figure 10\nType: IMAGE" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "scent based retrieval", + "relation_name": "", + "weight": 8.0, + "description": "single hop queries typically require a scent based retrieval operation", + "source_ids": [ + 96 + ], + "source": "Name: single hop\nType: EVENT", + "target": "Name: scent based retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "intrinsic complexity", + "relation_name": "", + "weight": 7.0, + "description": "single hop is defined by its intrinsic complexity and operational demands", + "source_ids": [ + 96 + ], + "source": "Name: single hop\nType: EVENT", + "target": "Name: intrinsic complexity\nType: CONCEPT" + }, + { + "src_entity_name": "multi hop", + "tgt_entity_name": "intrinsic complexity", + "relation_name": "", + "weight": 7.0, + "description": "multi hop is defined by its intrinsic complexity and operational demands", + "source_ids": [ + 96 + ], + "source": "Name: multi hop\nType: EVENT", + "target": "Name: intrinsic complexity\nType: CONCEPT" + }, + { + "src_entity_name": "global aggregation", + "tgt_entity_name": "filter aggregation", + "relation_name": "", + "weight": 8.0, + "description": "global aggregation queries usually involve a sequence of filter aggregation operations", + "source_ids": [ + 96 + ], + "source": "Name: global aggregation\nType: EVENT", + "target": "Name: filter aggregation\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "global aggregation", + "tgt_entity_name": "intrinsic complexity", + "relation_name": "", + "weight": 7.0, + "description": "global aggregation is defined by its intrinsic complexity and operational demands", + "source_ids": [ + 96 + ], + "source": "Name: global aggregation\nType: EVENT", + "target": "Name: intrinsic 
complexity\nType: CONCEPT" + }, + { + "src_entity_name": "global aggregation", + "tgt_entity_name": "filtering conditions", + "relation_name": "", + "weight": 8.0, + "description": "global aggregation necessitates analyzing content under multiple filtering conditions", + "source_ids": [ + 96 + ], + "source": "Name: global aggregation\nType: EVENT", + "target": "Name: filtering conditions\nType: CONCEPT" + }, + { + "src_entity_name": "global aggregation", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 7.0, + "description": "global aggregation involves analyzing content across various parts of the document", + "source_ids": [ + 96 + ], + "source": "Name: global aggregation\nType: EVENT", + "target": "Name: document\nType: OBJECT" + }, + { + "src_entity_name": "bookindex operators", + "tgt_entity_name": "o", + "relation_name": "", + "weight": 9.0, + "description": "bookindex operators are represented by the set o tailored for the bookindex", + "source_ids": [ + 97 + ], + "source": "Name: bookindex operators\nType: TASK_OR_PROBLEM", + "target": "Name: o\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "bookindex operators", + "tgt_entity_name": "figure 4 a", + "relation_name": "", + "weight": 8.0, + "description": "bookindex operators are visually depicted in figure 4 a", + "source_ids": [ + 97 + ], + "source": "Name: bookindex operators\nType: TASK_OR_PROBLEM", + "target": "Name: figure 4 a\nType: IMAGE" + }, + { + "src_entity_name": "bookindex operators", + "tgt_entity_name": "table 3", + "relation_name": "", + "weight": 8.0, + "description": "bookindex operators are detailed in table 3", + "source_ids": [ + 97 + ], + "source": "Name: bookindex operators\nType: TASK_OR_PROBLEM", + "target": "Name: table 3\nType: TABLE" + }, + { + "src_entity_name": "bookindex operators", + "tgt_entity_name": "classification", + "relation_name": "", + "weight": 9.0, + "description": "bookindex operators are designed to execute strategies identified by 
classification", + "source_ids": [ + 97 + ], + "source": "Name: bookindex operators\nType: TASK_OR_PROBLEM", + "target": "Name: classification\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "formulator", + "tgt_entity_name": "extract", + "relation_name": "", + "weight": 9.0, + "description": "extract is included as a category within the formulator operators", + "source_ids": [ + 98 + ], + "source": "Name: formulator\nType: TASK_OR_PROBLEM", + "target": "Name: extract\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "extract", + "tgt_entity_name": "eq", + "relation_name": "", + "weight": 10.0, + "description": "extract identifies the key entities eq", + "source_ids": [ + 98 + ], + "source": "Name: extract\nType: METHOD_OR_TECHNIQUE", + "target": "Name: eq\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "extract", + "tgt_entity_name": "query text", + "relation_name": "", + "weight": 10.0, + "description": "extract analyzes the query text to find key entities", + "source_ids": [ + 98 + ], + "source": "Name: extract\nType: METHOD_OR_TECHNIQUE", + "target": "Name: query text\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "extract", + "tgt_entity_name": "entities", + "relation_name": "", + "weight": 10.0, + "description": "extract identifies entities from the query text", + "source_ids": [ + 98 + ], + "source": "Name: extract\nType: METHOD_OR_TECHNIQUE", + "target": "Name: entities\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "extract", + "relation_name": "", + "weight": 9.0, + "description": "the agent uses the extract method to identify key entities", + "source_ids": [ + 135 + ], + "source": "Name: extract\nType: METHOD_OR_TECHNIQUE", + "target": "Name: agent\nType: PERSON" + }, + { + "src_entity_name": "extract", + "tgt_entity_name": "select by entity", + "relation_name": "", + "weight": 7.0, + "description": "the extract method precedes the select by entity method in the workflow", + 
"source_ids": [ + 135 + ], + "source": "Name: extract\nType: METHOD_OR_TECHNIQUE", + "target": "Name: select by entity\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "qs", + "tgt_entity_name": "pdec", + "relation_name": "", + "weight": 7.0, + "description": "qs is generated using the parameter pdec in the llm function", + "source_ids": [ + 98 + ], + "source": "Name: qs\nType: TASK_OR_PROBLEM", + "target": "Name: pdec\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "eq", + "tgt_entity_name": "pext", + "relation_name": "", + "weight": 7.0, + "description": "eq is generated using the parameter pext in the llm function", + "source_ids": [ + 98 + ], + "source": "Name: eq\nType: TASK_OR_PROBLEM", + "target": "Name: pext\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "p dec", + "tgt_entity_name": "decomposition", + "relation_name": "", + "weight": 10.0, + "description": "p dec is the specific prompt used to guide the llm for the decomposition task", + "source_ids": [ + 101 + ], + "source": "Name: p dec\nType: SOFTWARE", + "target": "Name: decomposition\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "p ext", + "tgt_entity_name": "extraction", + "relation_name": "", + "weight": 10.0, + "description": "p ext is the specific prompt used to guide the llm for the extraction task", + "source_ids": [ + 101 + ], + "source": "Name: p ext\nType: SOFTWARE", + "target": "Name: extraction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "figure 13", + "tgt_entity_name": "prompt", + "relation_name": "", + "weight": 10.0, + "description": "figure 13 displays the prompt for entity resolution judgement", + "source_ids": [ + 284 + ], + "source": "Name: prompt\nType: SOFTWARE", + "target": "Name: figure 13\nType: IMAGE" + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "n f", + "relation_name": "", + "weight": 9.0, + "description": "the selector operators produce the filtered subset n f", + "source_ids": [ + 102 + ], + "source": 
"Name: selector\nType: TECHNOLOGY", + "target": "Name: n f\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "reasoners", + "relation_name": "", + "weight": 9.0, + "description": "the removal of selector operators forces reasoners to score all candidate nodes indicating a direct operational dependency", + "source_ids": [ + 167 + ], + "source": "Name: selector\nType: TECHNOLOGY", + "target": "Name: reasoners\nType: TECHNOLOGY" + }, + { + "src_entity_name": "filter modal", + "tgt_entity_name": "n f", + "relation_name": "", + "weight": 8.0, + "description": "filter modal contributes to the production of the filtered subset n f", + "source_ids": [ + 102 + ], + "source": "Name: filter modal\nType: TECHNOLOGY", + "target": "Name: n f\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "filter range", + "tgt_entity_name": "n f", + "relation_name": "", + "weight": 8.0, + "description": "filter range contributes to the production of the filtered subset n f", + "source_ids": [ + 102 + ], + "source": "Name: filter range\nType: TECHNOLOGY", + "target": "Name: n f\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "n f", + "tgt_entity_name": "c n", + "relation_name": "", + "weight": 9.0, + "description": "the filtered subset n f consists of nodes where the predicate c n holds true", + "source_ids": [ + 102 + ], + "source": "Name: n f\nType: PARAMETER_OR_VARIABLE", + "target": "Name: c n\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "n f", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 9.0, + "description": "the filtered subset n f is a subset of the nodes n", + "source_ids": [ + 102 + ], + "source": "Name: n f\nType: PARAMETER_OR_VARIABLE", + "target": "Name: nodes\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "c n", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 9.0, + "description": "the predicate c n is evaluated for each node in the set", + 
"source_ids": [ + 102 + ], + "source": "Name: c n\nType: PARAMETER_OR_VARIABLE", + "target": "Name: nodes\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "plan error", + "tgt_entity_name": "plan", + "relation_name": "", + "weight": 10.0, + "description": "plan error is a specific failure pattern occurring within the plan task", + "source_ids": [ + 185 + ], + "source": "Name: plan\nType: TASK_OR_PROBLEM", + "target": "Name: plan error\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "select by entity", + "tgt_entity_name": "s target", + "relation_name": "", + "weight": 9.0, + "description": "select by entity identifies a set of target section nodes s target as part of its process", + "source_ids": [ + 104 + ], + "source": "Name: select by entity\nType: TECHNOLOGY", + "target": "Name: s target\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "select by entity", + "tgt_entity_name": "subtree", + "relation_name": "", + "weight": 9.0, + "description": "select by entity retrieves subtrees rooted at specific section nodes", + "source_ids": [ + 104 + ], + "source": "Name: select by entity\nType: TECHNOLOGY", + "target": "Name: subtree\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "select by section", + "tgt_entity_name": "s target", + "relation_name": "", + "weight": 9.0, + "description": "select by section identifies a set of target section nodes s target as part of its process", + "source_ids": [ + 104 + ], + "source": "Name: select by section\nType: TECHNOLOGY", + "target": "Name: s target\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "select by section", + "tgt_entity_name": "subtree", + "relation_name": "", + "weight": 9.0, + "description": "select by section retrieves subtrees rooted at specific section nodes", + "source_ids": [ + 104 + ], + "source": "Name: select by section\nType: TECHNOLOGY", + "target": "Name: subtree\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "s target", + "tgt_entity_name": "e q", + 
"relation_name": "", + "weight": 8.0, + "description": "s target consists of sections linked to entities e q via gt link", + "source_ids": [ + 104 + ], + "source": "Name: s target\nType: TASK_OR_PROBLEM", + "target": "Name: e q\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "s target", + "tgt_entity_name": "n s", + "relation_name": "", + "weight": 9.0, + "description": "n s is formed by retrieving all descendants of the target section nodes s target", + "source_ids": [ + 104 + ], + "source": "Name: s target\nType: TASK_OR_PROBLEM", + "target": "Name: n s\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "s target", + "tgt_entity_name": "section node", + "relation_name": "", + "weight": 10.0, + "description": "s target consists of specific section nodes", + "source_ids": [ + 104 + ], + "source": "Name: s target\nType: TASK_OR_PROBLEM", + "target": "Name: section node\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "s target", + "tgt_entity_name": "depth", + "relation_name": "", + "weight": 8.0, + "description": "s target is defined at a specified depth", + "source_ids": [ + 104 + ], + "source": "Name: s target\nType: TASK_OR_PROBLEM", + "target": "Name: depth\nType: MEASUREMENT" + }, + { + "src_entity_name": "n s", + "tgt_entity_name": "descendant", + "relation_name": "", + "weight": 9.0, + "description": "n s is formed by retrieving all descendants of the target sections", + "source_ids": [ + 104 + ], + "source": "Name: n s\nType: TASK_OR_PROBLEM", + "target": "Name: descendant\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "reasoner", + "tgt_entity_name": "selected tree nodes", + "relation_name": "", + "weight": 9.0, + "description": "reasoner analyzes and refines selected tree nodes", + "source_ids": [ + 106 + ], + "source": "Name: reasoner\nType: TASK_OR_PROBLEM", + "target": "Name: selected tree nodes\nType: UNKNOWN" + }, + { + "src_entity_name": "graph reasoning", + "tgt_entity_name": "subgraph", + "relation_name": "", + "weight": 10.0, 
+ "description": "graph reasoning performs multi hop inference on a subgraph", + "source_ids": [ + 106 + ], + "source": "Name: graph reasoning\nType: METHOD_OR_TECHNIQUE", + "target": "Name: subgraph\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "graph reasoning", + "tgt_entity_name": "entity", + "relation_name": "", + "weight": 9.0, + "description": "graph reasoning starts its inference process from an entity", + "source_ids": [ + 106 + ], + "source": "Name: graph reasoning\nType: METHOD_OR_TECHNIQUE", + "target": "Name: entity\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "graph reasoning", + "tgt_entity_name": "pagerank algorithm", + "relation_name": "", + "weight": 10.0, + "description": "graph reasoning uses the pagerank algorithm to compute the entity importance vector", + "source_ids": [ + 106 + ], + "source": "Name: graph reasoning\nType: METHOD_OR_TECHNIQUE", + "target": "Name: pagerank algorithm\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "pagerank algorithm", + "tgt_entity_name": "entity importance vector", + "relation_name": "", + "weight": 10.0, + "description": "the pagerank algorithm computes the entity importance vector", + "source_ids": [ + 106 + ], + "source": "Name: pagerank algorithm\nType: METHOD_OR_TECHNIQUE", + "target": "Name: entity importance vector\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "entity importance vector", + "tgt_entity_name": "gt link matrix", + "relation_name": "", + "weight": 9.0, + "description": "the entity importance vector is mapped to tree nodes via the gt link matrix", + "source_ids": [ + 106 + ], + "source": "Name: gt link matrix\nType: SOFTWARE", + "target": "Name: entity importance vector\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "gt link matrix", + "tgt_entity_name": "tree node importance scores vector", + "relation_name": "", + "weight": 9.0, + "description": "the gt link matrix is used to derive the tree node importance scores vector", + 
"source_ids": [ + 106 + ], + "source": "Name: gt link matrix\nType: SOFTWARE", + "target": "Name: tree node importance scores vector\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "subgraph", + "tgt_entity_name": "selected nodes", + "relation_name": "", + "weight": 10.0, + "description": "the subgraph is extracted from selected nodes", + "source_ids": [ + 106 + ], + "source": "Name: subgraph\nType: TASK_OR_PROBLEM", + "target": "Name: selected nodes\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "entity", + "relation_name": "", + "weight": 9.0, + "description": "the agent attempts to extract the entity as the first step of the single hop process", + "source_ids": [ + 115 + ], + "source": "Name: entity\nType: TASK_OR_PROBLEM", + "target": "Name: agent\nType: PERSON" + }, + { + "src_entity_name": "text ranker", + "tgt_entity_name": "query", + "relation_name": "", + "weight": 9.0, + "description": "text ranker evaluates the relevance of content specifically to the query", + "source_ids": [ + 109 + ], + "source": "Name: text ranker\nType: SOFTWARE", + "target": "Name: query\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "text ranker", + "tgt_entity_name": "relevance score", + "relation_name": "", + "weight": 10.0, + "description": "text ranker assigns a relevance score to each tree node", + "source_ids": [ + 109 + ], + "source": "Name: text ranker\nType: SOFTWARE", + "target": "Name: relevance score\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "text ranker", + "tgt_entity_name": "tree node", + "relation_name": "", + "weight": 9.0, + "description": "text ranker evaluates the content of the tree node", + "source_ids": [ + 109 + ], + "source": "Name: text ranker\nType: SOFTWARE", + "target": "Name: tree node\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "text ranker", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 9.0, + "description": "text ranker evaluates the content of the 
nodes", + "source_ids": [ + 109 + ], + "source": "Name: text ranker\nType: SOFTWARE", + "target": "Name: nodes\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "skyline operator", + "relation_name": "", + "weight": 10.0, + "description": "skyline ranker employs the skyline operator to perform its filtering function", + "source_ids": [ + 109 + ], + "source": "Name: skyline ranker\nType: SOFTWARE", + "target": "Name: skyline operator\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "tree node", + "relation_name": "", + "weight": 9.0, + "description": "skyline ranker filters tree nodes based on scoring dimensions", + "source_ids": [ + 109 + ], + "source": "Name: skyline ranker\nType: SOFTWARE", + "target": "Name: tree node\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "relevance score", + "relation_name": "", + "weight": 8.0, + "description": "skyline ranker uses relevance scores along with others to filter nodes", + "source_ids": [ + 109 + ], + "source": "Name: skyline ranker\nType: SOFTWARE", + "target": "Name: relevance score\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 10.0, + "description": "skyline ranker filters the nodes based on the specified scoring dimensions", + "source_ids": [ + 109 + ], + "source": "Name: skyline ranker\nType: SOFTWARE", + "target": "Name: nodes\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "9 87", + "relation_name": "", + "weight": 8.0, + "description": "the skyline ranker process results in an average of 9 87 retained nodes on one dataset", + "source_ids": [ + 157 + ], + "source": "Name: skyline ranker\nType: SOFTWARE", + "target": "Name: 9 87\nType: MEASUREMENT" + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "6 86", + "relation_name": "", 
+ "weight": 8.0, + "description": "the skyline ranker process results in an average of 6 86 retained nodes on another dataset", + "source_ids": [ + 157 + ], + "source": "Name: skyline ranker\nType: SOFTWARE", + "target": "Name: 6 86\nType: MEASUREMENT" + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "8 6", + "relation_name": "", + "weight": 8.0, + "description": "the skyline ranker process results in an average of 8 6 retained nodes on the third dataset", + "source_ids": [ + 157 + ], + "source": "Name: skyline ranker\nType: SOFTWARE", + "target": "Name: 8 6\nType: MEASUREMENT" + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "10", + "relation_name": "", + "weight": 7.0, + "description": "the number of retained nodes by skyline ranker is comparable to the standard top k setting where k 10", + "source_ids": [ + 157 + ], + "source": "Name: skyline ranker\nType: SOFTWARE", + "target": "Name: 10\nType: MEASUREMENT" + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "candidate size", + "relation_name": "", + "weight": 8.0, + "description": "the skyline ranker process ensures high quality retrieval without inflating the candidate size", + "source_ids": [ + 157 + ], + "source": "Name: skyline ranker\nType: SOFTWARE", + "target": "Name: candidate size\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "three datasets", + "relation_name": "", + "weight": 9.0, + "description": "the average number of retained nodes by skyline ranker is measured across three datasets", + "source_ids": [ + 157 + ], + "source": "Name: skyline ranker\nType: SOFTWARE", + "target": "Name: three datasets\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "standard top k setting", + "relation_name": "", + "weight": 8.0, + "description": "the results of the skyline ranker process are compared to the standard top k setting", + "source_ids": [ + 157 + ], + 
"source": "Name: skyline ranker\nType: SOFTWARE", + "target": "Name: standard top k setting\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "graph reasoning", + "tgt_entity_name": "skyline ranker", + "relation_name": "", + "weight": 10.0, + "description": "the graph reasoning operator enables the skyline ranker removing it disables the skyline ranker", + "source_ids": [ + 168 + ], + "source": "Name: skyline ranker\nType: SOFTWARE", + "target": "Name: graph reasoning\nType: TECHNOLOGY" + }, + { + "src_entity_name": "text reasoning", + "tgt_entity_name": "skyline ranker", + "relation_name": "", + "weight": 9.0, + "description": "the removal of the text reasoning operator causes the skyline ranker to be disabled", + "source_ids": [ + 169 + ], + "source": "Name: skyline ranker\nType: SOFTWARE", + "target": "Name: text reasoning\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "skyline operator", + "tgt_entity_name": "tree node", + "relation_name": "", + "weight": 8.0, + "description": "the skyline operator is used to filter tree nodes", + "source_ids": [ + 109 + ], + "source": "Name: skyline operator\nType: METHOD_OR_TECHNIQUE", + "target": "Name: tree node\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "skyline operator", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 9.0, + "description": "the skyline operator filters the nodes", + "source_ids": [ + 109 + ], + "source": "Name: skyline operator\nType: METHOD_OR_TECHNIQUE", + "target": "Name: nodes\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "query", + "relation_name": "", + "weight": 9.0, + "description": "the agent classifies the query into a category", + "source_ids": [ + 112 + ], + "source": "Name: query\nType: TASK_OR_PROBLEM", + "target": "Name: agent\nType: PERSON" + }, + { + "src_entity_name": "query", + "tgt_entity_name": "parameters", + "relation_name": "", + "weight": 8.0, + "description": "parameters are dynamically instantiated 
based on the query", + "source_ids": [ + 112 + ], + "source": "Name: query\nType: TASK_OR_PROBLEM", + "target": "Name: parameters\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "ift inspired selector reasoner workflow", + "tgt_entity_name": "query", + "relation_name": "", + "weight": 8.0, + "description": "the workflow s agent based planning component classifies the query", + "source_ids": [ + 157 + ], + "source": "Name: query\nType: TASK_OR_PROBLEM", + "target": "Name: ift inspired selector reasoner workflow\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "content generation", + "relation_name": "", + "weight": 10.0, + "description": "synthesizer operators are responsible for the task of content generation", + "source_ids": [ + 111 + ], + "source": "Name: synthesizer\nType: TASK_OR_PROBLEM", + "target": "Name: content generation\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "partial responses", + "tgt_entity_name": "final coherent answer", + "relation_name": "", + "weight": 9.0, + "description": "partial responses generated by map are aggregated by reduce to form the final coherent answer", + "source_ids": [ + 111 + ], + "source": "Name: partial responses\nType: PRODUCT", + "target": "Name: final coherent answer\nType: PRODUCT" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "category", + "relation_name": "", + "weight": 9.0, + "description": "the agent uses the category derived from the query to generate the plan", + "source_ids": [ + 112 + ], + "source": "Name: agent\nType: PERSON", + "target": "Name: category\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "library", + "relation_name": "", + "weight": 8.0, + "description": "the agent selects operators from the library to form the plan", + "source_ids": [ + 112 + ], + "source": "Name: agent\nType: PERSON", + "target": "Name: library\nType: ORGANIZATION" + }, + { + "src_entity_name": "agent", + 
"tgt_entity_name": "operators", + "relation_name": "", + "weight": 9.0, + "description": "the agent selects a specific sequence of operators to create the plan", + "source_ids": [ + 112 + ], + "source": "Name: agent\nType: PERSON", + "target": "Name: operators\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "scent based", + "relation_name": "", + "weight": 8.0, + "description": "the agent executes the scent based selection strategy if entity extraction is successful", + "source_ids": [ + 115 + ], + "source": "Name: agent\nType: PERSON", + "target": "Name: scent based\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "section based", + "relation_name": "", + "weight": 8.0, + "description": "the agent falls back to the section based strategy if entity extraction fails", + "source_ids": [ + 115 + ], + "source": "Name: agent\nType: PERSON", + "target": "Name: section based\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "complex", + "relation_name": "", + "weight": 9.0, + "description": "the agent decomposes the complex problem into sub problems", + "source_ids": [ + 118 + ], + "source": "Name: agent\nType: PERSON", + "target": "Name: complex\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "single hop workflow", + "relation_name": "", + "weight": 10.0, + "description": "the agent applies the single hop workflow to each sub problem", + "source_ids": [ + 118 + ], + "source": "Name: agent\nType: PERSON", + "target": "Name: single hop workflow\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "planning phase", + "tgt_entity_name": "agent", + "relation_name": "", + "weight": 9.0, + "description": "the planning phase is conducted by the agent", + "source_ids": [ + 135 + ], + "source": "Name: agent\nType: PERSON", + "target": "Name: planning phase\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "agent", + 
"tgt_entity_name": "select by entity", + "relation_name": "", + "weight": 9.0, + "description": "the agent uses the select by entity method to retrieve relevant nodes", + "source_ids": [ + 135 + ], + "source": "Name: agent\nType: PERSON", + "target": "Name: select by entity\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "reasoning", + "relation_name": "", + "weight": 8.0, + "description": "the agent applies reasoning to refine nodes", + "source_ids": [ + 135 + ], + "source": "Name: agent\nType: PERSON", + "target": "Name: reasoning\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "skyline filtering", + "relation_name": "", + "weight": 8.0, + "description": "the agent applies skyline filtering to refine nodes", + "source_ids": [ + 135 + ], + "source": "Name: agent\nType: PERSON", + "target": "Name: skyline filtering\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "reduce", + "relation_name": "", + "weight": 9.0, + "description": "the agent uses the reduce method to synthesize the answer", + "source_ids": [ + 135 + ], + "source": "Name: agent\nType: PERSON", + "target": "Name: reduce\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "parameters", + "tgt_entity_name": "operators", + "relation_name": "", + "weight": 8.0, + "description": "parameters are dynamically instantiated for the operators", + "source_ids": [ + 112 + ], + "source": "Name: operators\nType: TASK_OR_PROBLEM", + "target": "Name: parameters\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "equation 8", + "tgt_entity_name": "agent plan", + "relation_name": "", + "weight": 10.0, + "description": "equation 8 utilizes the agent plan function", + "source_ids": [ + 112 + ], + "source": "Name: agent plan\nType: METHOD_OR_TECHNIQUE", + "target": "Name: equation 8\nType: EQUATION_OR_FORMULA" + }, + { + "src_entity_name": "the plan", + "tgt_entity_name": "workflow", + 
"relation_name": "", + "weight": 9.0, + "description": "the plan follows a structured workflow", + "source_ids": [ + 114 + ], + "source": "Name: the plan\nType: TASK_OR_PROBLEM", + "target": "Name: workflow\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "the plan", + "tgt_entity_name": "category", + "relation_name": "", + "weight": 8.0, + "description": "the plan s workflow is tailored to each category", + "source_ids": [ + 114 + ], + "source": "Name: the plan\nType: TASK_OR_PROBLEM", + "target": "Name: category\nType: CONCEPT" + }, + { + "src_entity_name": "scent based", + "tgt_entity_name": "standard reasoning", + "relation_name": "", + "weight": 7.0, + "description": "the scent based path proceeds to standard reasoning and generation", + "source_ids": [ + 115 + ], + "source": "Name: scent based\nType: METHOD_OR_TECHNIQUE", + "target": "Name: standard reasoning\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "section based", + "tgt_entity_name": "standard reasoning", + "relation_name": "", + "weight": 7.0, + "description": "the section based path proceeds to standard reasoning and generation", + "source_ids": [ + 115 + ], + "source": "Name: section based\nType: METHOD_OR_TECHNIQUE", + "target": "Name: standard reasoning\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "single hop workflow", + "tgt_entity_name": "ps", + "relation_name": "", + "weight": 10.0, + "description": "the single hop workflow is identified by the notation ps in the text", + "source_ids": [ + 118 + ], + "source": "Name: single hop workflow\nType: METHOD_OR_TECHNIQUE", + "target": "Name: ps\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "expert query analyzer", + "tgt_entity_name": "complex", + "relation_name": "", + "weight": 9.0, + "description": "the expert query analyzer classifies questions into the complex category", + "source_ids": [ + 241 + ], + "source": "Name: complex\nType: TASK_OR_PROBLEM", + "target": "Name: expert query analyzer\nType: PERSON" 
+ }, + { + "src_entity_name": "agent based planning strategy", + "tgt_entity_name": "global aggregation", + "relation_name": "", + "weight": 8.0, + "description": "the agent based planning strategy handles global aggregation queries separately", + "source_ids": [ + 179 + ], + "source": "Name: global aggregation\nType: TASK_OR_PROBLEM", + "target": "Name: agent based planning strategy\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "modal filter", + "tgt_entity_name": "nested composition", + "relation_name": "", + "weight": 8.0, + "description": "modal filters are applied as part of the nested composition process", + "source_ids": [ + 122 + ], + "source": "Name: modal filter\nType: TECHNOLOGY", + "target": "Name: nested composition\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "range filter", + "tgt_entity_name": "nested composition", + "relation_name": "", + "weight": 8.0, + "description": "range filters are applied as part of the nested composition process", + "source_ids": [ + 122 + ], + "source": "Name: range filter\nType: TECHNOLOGY", + "target": "Name: nested composition\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "ift principles", + "tgt_entity_name": "5.3 structured execution", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'IFT Principles' is a primary topic of section 5.3.", + "source_ids": [ + 123 + ], + "source": "Name: 5.3 structured execution\nType: SECTION_TITLE", + "target": "Name: ift principles\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "processed evidence", + "relation_name": "", + "weight": 10.0, + "description": "the synthesizer generates the answer based on the processed evidence", + "source_ids": [ + 124 + ], + "source": "Name: synthesizer\nType: SOFTWARE", + "target": "Name: processed evidence\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "analysis merging generation", + "relation_name": "", + 
"weight": 9.0, + "description": "the synthesizer operator is the key component used in the final stage of analysis merging generation", + "source_ids": [ + 129 + ], + "source": "Name: synthesizer\nType: SOFTWARE", + "target": "Name: analysis merging generation\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "abstract textual queries", + "tgt_entity_name": "concrete operations", + "relation_name": "", + "weight": 8.0, + "description": "abstract textual queries are translated into concrete operations", + "source_ids": [ + 124 + ], + "source": "Name: abstract textual queries\nType: TASK_OR_PROBLEM", + "target": "Name: concrete operations\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "information patches", + "tgt_entity_name": "sensemaking", + "relation_name": "", + "weight": 8.0, + "description": "sensemaking is performed within the information patches", + "source_ids": [ + 124 + ], + "source": "Name: information patches\nType: TASK_OR_PROBLEM", + "target": "Name: sensemaking\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "computational resources", + "tgt_entity_name": "high value data patches", + "relation_name": "", + "weight": 8.0, + "description": "computational resources are focused on high value data patches", + "source_ids": [ + 124 + ], + "source": "Name: computational resources\nType: TASK_OR_PROBLEM", + "target": "Name: high value data patches\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "scent filter based retrieval", + "tgt_entity_name": "selector operators", + "relation_name": "", + "weight": 10.0, + "description": "selector operators are the mechanism used within the scent filter based retrieval process to identify relevant patches", + "source_ids": [ + 125 + ], + "source": "Name: scent filter based retrieval\nType: TASK_OR_PROBLEM", + "target": "Name: selector operators\nType: SOFTWARE" + }, + { + "src_entity_name": "scent filter based retrieval", + "tgt_entity_name": "node set n", + "relation_name": "", + "weight": 10.0, + 
"description": "the process reduces the full node set n to a focused subset", + "source_ids": [ + 125 + ], + "source": "Name: scent filter based retrieval\nType: TASK_OR_PROBLEM", + "target": "Name: node set n\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "scent filter based retrieval", + "tgt_entity_name": "focused node subset ns", + "relation_name": "", + "weight": 10.0, + "description": "the process results in the creation of the focused node subset ns", + "source_ids": [ + 125 + ], + "source": "Name: scent filter based retrieval\nType: TASK_OR_PROBLEM", + "target": "Name: focused node subset ns\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "scent filter based retrieval", + "tgt_entity_name": "equation 13", + "relation_name": "", + "weight": 10.0, + "description": "equation 13 describes the execution of the scent filter based retrieval process", + "source_ids": [ + 125 + ], + "source": "Name: scent filter based retrieval\nType: TASK_OR_PROBLEM", + "target": "Name: equation 13\nType: EQUATION_OR_FORMULA" + }, + { + "src_entity_name": "selector operators", + "tgt_entity_name": "params sel", + "relation_name": "", + "weight": 8.0, + "description": "selector operators utilize params sel in their function to reduce the node set", + "source_ids": [ + 125 + ], + "source": "Name: selector operators\nType: SOFTWARE", + "target": "Name: params sel\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "selector operators", + "tgt_entity_name": "patches", + "relation_name": "", + "weight": 9.0, + "description": "selector operators identify relevant patches", + "source_ids": [ + 125 + ], + "source": "Name: selector operators\nType: SOFTWARE", + "target": "Name: patches\nType: PRODUCT" + }, + { + "src_entity_name": "selector operators", + "tgt_entity_name": "explicit filter constraints", + "relation_name": "", + "weight": 9.0, + "description": "selector operators apply explicit filter constraints to identify patches", + "source_ids": [ + 125 + ], + 
"source": "Name: selector operators\nType: SOFTWARE", + "target": "Name: explicit filter constraints\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "reasoner operators", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 9.0, + "description": "reasoner operators evaluate nodes using multiple dimensions like graph topology and semantic relevance", + "source_ids": [ + 127 + ], + "source": "Name: reasoner operators\nType: TASK_OR_PROBLEM", + "target": "Name: nodes\nType: UNKNOWN" + }, + { + "src_entity_name": "reasoner operators", + "tgt_entity_name": "graph topology", + "relation_name": "", + "weight": 9.0, + "description": "reasoner operators use graph topology as a dimension for evaluation", + "source_ids": [ + 127 + ], + "source": "Name: reasoner operators\nType: TASK_OR_PROBLEM", + "target": "Name: graph topology\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "reasoner operators", + "tgt_entity_name": "semantic relevance", + "relation_name": "", + "weight": 9.0, + "description": "reasoner operators use semantic relevance as a dimension for evaluation", + "source_ids": [ + 127 + ], + "source": "Name: reasoner operators\nType: TASK_OR_PROBLEM", + "target": "Name: semantic relevance\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "final retrieval set", + "relation_name": "", + "weight": 10.0, + "description": "the skyline ranker is employed to generate the final retrieval set", + "source_ids": [ + 127 + ], + "source": "Name: skyline ranker\nType: TASK_OR_PROBLEM", + "target": "Name: final retrieval set\nType: UNKNOWN" + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "equation 14", + "relation_name": "", + "weight": 10.0, + "description": "equation 14 mathematically defines the operation of the skyline ranker", + "source_ids": [ + 127 + ], + "source": "Name: skyline ranker\nType: TASK_OR_PROBLEM", + "target": "Name: equation 14\nType: EQUATION_OR_FORMULA" + }, 
+ { + "src_entity_name": "n r", + "tgt_entity_name": "skyline ranker", + "relation_name": "", + "weight": 10.0, + "description": "n r is the output variable resulting from the skyline ranker operation", + "source_ids": [ + 127 + ], + "source": "Name: skyline ranker\nType: TASK_OR_PROBLEM", + "target": "Name: n r\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "s g n s", + "tgt_entity_name": "skyline ranker", + "relation_name": "", + "weight": 8.0, + "description": "s g n s is an input component used within the skyline ranker equation", + "source_ids": [ + 127 + ], + "source": "Name: skyline ranker\nType: TASK_OR_PROBLEM", + "target": "Name: s g n s\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "t n", + "tgt_entity_name": "skyline ranker", + "relation_name": "", + "weight": 8.0, + "description": "t n is an input component used within the skyline ranker equation", + "source_ids": [ + 127 + ], + "source": "Name: skyline ranker\nType: TASK_OR_PROBLEM", + "target": "Name: t n\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "n s", + "tgt_entity_name": "skyline ranker", + "relation_name": "", + "weight": 8.0, + "description": "n s is the set of nodes provided as input to the skyline ranker equation", + "source_ids": [ + 127 + ], + "source": "Name: skyline ranker\nType: TASK_OR_PROBLEM", + "target": "Name: n s\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "skyline operator", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 9.0, + "description": "the skyline operator retains nodes that are valuable in at least one dimension and discards dominated ones", + "source_ids": [ + 127 + ], + "source": "Name: skyline operator\nType: TASK_OR_PROBLEM", + "target": "Name: nodes\nType: UNKNOWN" + }, + { + "src_entity_name": "skyline operator", + "tgt_entity_name": "pareto frontier", + "relation_name": "", + "weight": 10.0, + "description": "the skyline operator retains the pareto frontier of nodes", + "source_ids": [ + 
127 + ], + "source": "Name: skyline operator\nType: TASK_OR_PROBLEM", + "target": "Name: pareto frontier\nType: CONCEPT" + }, + { + "src_entity_name": "skyline operator", + "tgt_entity_name": "fixed top retrieval", + "relation_name": "", + "weight": 7.0, + "description": "the skyline operator is contrasted with fixed top retrieval in the text", + "source_ids": [ + 127 + ], + "source": "Name: skyline operator\nType: TASK_OR_PROBLEM", + "target": "Name: fixed top retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "n r", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 9.0, + "description": "n r represents the set of retained nodes", + "source_ids": [ + 127 + ], + "source": "Name: n r\nType: PARAMETER_OR_VARIABLE", + "target": "Name: nodes\nType: UNKNOWN" + }, + { + "src_entity_name": "pre selection", + "tgt_entity_name": "noise", + "relation_name": "", + "weight": 9.0, + "description": "pre selection minimizes noise", + "source_ids": [ + 127 + ], + "source": "Name: noise\nType: CONCEPT", + "target": "Name: pre selection\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "pre selection", + "tgt_entity_name": "foraging cost", + "relation_name": "", + "weight": 8.0, + "description": "pre selection optimizes the foraging cost", + "source_ids": [ + 127 + ], + "source": "Name: foraging cost\nType: MEASUREMENT", + "target": "Name: pre selection\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "table: cref='#/texts/136'...", + "tgt_entity_name": "cref", + "relation_name": "", + "weight": 9.0, + "description": "Table 'Table: cref='#/texts/136'...' 
contains data about 'cref'.", + "source_ids": [ + 132 + ], + "source": "Name: table: cref='#/texts/136'...\nType: TABLE", + "target": "Name: cref\nType: EQUATION_OR_FORMULA" + }, + { + "src_entity_name": "map operator", + "tgt_entity_name": "decompose", + "relation_name": "", + "weight": 9.0, + "description": "the map operator analyzes sub problems generated from the decompose process", + "source_ids": [ + 134 + ], + "source": "Name: map operator\nType: TASK_OR_PROBLEM", + "target": "Name: decompose\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "reduce operator", + "tgt_entity_name": "map operator", + "relation_name": "", + "weight": 9.0, + "description": "the reduce operator aggregates the partial results generated by the map operator", + "source_ids": [ + 134 + ], + "source": "Name: map operator\nType: TASK_OR_PROBLEM", + "target": "Name: reduce operator\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "map operator", + "tgt_entity_name": "evidence blocks", + "relation_name": "", + "weight": 9.0, + "description": "the map operator performs analysis on individual evidence blocks", + "source_ids": [ + 134 + ], + "source": "Name: map operator\nType: TASK_OR_PROBLEM", + "target": "Name: evidence blocks\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "map operator", + "tgt_entity_name": "sub problems", + "relation_name": "", + "weight": 9.0, + "description": "the map operator performs analysis on sub problems", + "source_ids": [ + 134 + ], + "source": "Name: map operator\nType: TASK_OR_PROBLEM", + "target": "Name: sub problems\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "map operator", + "tgt_entity_name": "intermediate insights", + "relation_name": "", + "weight": 9.0, + "description": "the map operator generates intermediate insights as its output", + "source_ids": [ + 134 + ], + "source": "Name: map operator\nType: TASK_OR_PROBLEM", + "target": "Name: intermediate insights\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "map 
operator", + "tgt_entity_name": "detailed content extraction", + "relation_name": "", + "weight": 8.0, + "description": "the map operator is responsible for detailed content extraction", + "source_ids": [ + 134 + ], + "source": "Name: map operator\nType: TASK_OR_PROBLEM", + "target": "Name: detailed content extraction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "reduce operator", + "tgt_entity_name": "global filter", + "relation_name": "", + "weight": 8.0, + "description": "the reduce operator aggregates statistical counts derived from the global filter", + "source_ids": [ + 134 + ], + "source": "Name: reduce operator\nType: TASK_OR_PROBLEM", + "target": "Name: global filter\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "reduce operator", + "tgt_entity_name": "partial results", + "relation_name": "", + "weight": 9.0, + "description": "the reduce operator aggregates partial results", + "source_ids": [ + 134 + ], + "source": "Name: reduce operator\nType: TASK_OR_PROBLEM", + "target": "Name: partial results\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "reduce operator", + "tgt_entity_name": "answers to decomposed sub queries", + "relation_name": "", + "weight": 8.0, + "description": "the reduce operator aggregates answers to decomposed sub queries", + "source_ids": [ + 134 + ], + "source": "Name: reduce operator\nType: TASK_OR_PROBLEM", + "target": "Name: answers to decomposed sub queries\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "reduce operator", + "tgt_entity_name": "statistical counts", + "relation_name": "", + "weight": 8.0, + "description": "the reduce operator aggregates statistical counts", + "source_ids": [ + 134 + ], + "source": "Name: reduce operator\nType: TASK_OR_PROBLEM", + "target": "Name: statistical counts\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "reduce operator", + "tgt_entity_name": "final response", + "relation_name": "", + "weight": 9.0, + "description": "the reduce operator constructs the final 
response", + "source_ids": [ + 134 + ], + "source": "Name: reduce operator\nType: TASK_OR_PROBLEM", + "target": "Name: final response\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "reduce operator", + "tgt_entity_name": "high level reasoning synthesis", + "relation_name": "", + "weight": 8.0, + "description": "the reduce operator is responsible for high level reasoning synthesis", + "source_ids": [ + 134 + ], + "source": "Name: reduce operator\nType: TASK_OR_PROBLEM", + "target": "Name: high level reasoning synthesis\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "select by entity", + "tgt_entity_name": "reasoning", + "relation_name": "", + "weight": 7.0, + "description": "the select by entity method is followed by reasoning in the workflow", + "source_ids": [ + 135 + ], + "source": "Name: select by entity\nType: METHOD_OR_TECHNIQUE", + "target": "Name: reasoning\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "reasoning", + "tgt_entity_name": "skyline filtering", + "relation_name": "", + "weight": 7.0, + "description": "reasoning is followed by skyline filtering in the workflow", + "source_ids": [ + 135 + ], + "source": "Name: skyline filtering\nType: METHOD_OR_TECHNIQUE", + "target": "Name: reasoning\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "skyline filtering", + "tgt_entity_name": "reduce", + "relation_name": "", + "weight": 7.0, + "description": "skyline filtering is followed by the reduce method in the workflow", + "source_ids": [ + 135 + ], + "source": "Name: skyline filtering\nType: METHOD_OR_TECHNIQUE", + "target": "Name: reduce\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "reasoning", + "tgt_entity_name": "disjoint pieces of evidence", + "relation_name": "", + "weight": 8.0, + "description": "reasoning is the action performed on disjoint pieces of evidence", + "source_ids": [ + 179 + ], + "source": "Name: reasoning\nType: METHOD_OR_TECHNIQUE", + "target": "Name: disjoint pieces of evidence\nType: 
DATASET_OR_CORPUS" + }, + { + "src_entity_name": "experiments", + "tgt_entity_name": "6 experiments", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Experiments' is the primary topic covered in section 6.", + "source_ids": [ + 136 + ], + "source": "Name: 6 experiments\nType: SECTION_TITLE", + "target": "Name: experiments\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "our", + "tgt_entity_name": "experiments", + "relation_name": "", + "weight": 8.0, + "description": "our group conducted the experiments referenced in the text", + "source_ids": [ + 139 + ], + "source": "Name: experiments\nType: TASK_OR_PROBLEM", + "target": "Name: our\nType: ORGANIZATION" + }, + { + "src_entity_name": "datasets", + "tgt_entity_name": "experiments", + "relation_name": "", + "weight": 9.0, + "description": "the datasets listed in table 4 were utilized in the experiments", + "source_ids": [ + 139 + ], + "source": "Name: experiments\nType: TASK_OR_PROBLEM", + "target": "Name: datasets\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "baseline methods", + "tgt_entity_name": "accuracy", + "relation_name": "", + "weight": 8.0, + "description": "the accuracy of baseline methods is evaluated and compared in the experiments", + "source_ids": [ + 137 + ], + "source": "Name: baseline methods\nType: METHOD_OR_TECHNIQUE", + "target": "Name: accuracy\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "exact match", + "tgt_entity_name": "accuracy", + "relation_name": "", + "weight": 9.0, + "description": "exact match is described as being stricter than accuracy", + "source_ids": [ + 229 + ], + "source": "Name: accuracy\nType: EVALUATION_METRIC", + "target": "Name: exact match\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "accuracy", + "tgt_entity_name": "token based f1 score", + "relation_name": "", + "weight": 9.0, + "description": "both are primary evaluation metrics used together in the assessment process", + "source_ids": [ + 144 + ], + 
"source": "Name: accuracy\nType: EVALUATION_METRIC", + "target": "Name: token based f1 score\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "accuracy", + "relation_name": "", + "weight": 8.0, + "description": "figure 7 uses accuracy as a metric represented by blue bars for qasper", + "source_ids": [ + 177 + ], + "source": "Name: accuracy\nType: EVALUATION_METRIC", + "target": "Name: figure 7\nType: IMAGE" + }, + { + "src_entity_name": "accuracy", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "accuracy is the specific metric used to evaluate performance on the qasper dataset in the figure", + "source_ids": [ + 177 + ], + "source": "Name: accuracy\nType: EVALUATION_METRIC", + "target": "Name: qasper\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "accuracy", + "tgt_entity_name": "blue bars", + "relation_name": "", + "weight": 9.0, + "description": "accuracy is visually represented by the blue bars in the figure", + "source_ids": [ + 177 + ], + "source": "Name: accuracy\nType: EVALUATION_METRIC", + "target": "Name: blue bars\nType: IMAGE" + }, + { + "src_entity_name": "accuracy", + "tgt_entity_name": "a.1 evaluation metrics", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Accuracy' is a specific evaluation metric detailed as a topic within section A.1.", + "source_ids": [ + 221 + ], + "source": "Name: accuracy\nType: EVALUATION_METRIC", + "target": "Name: a.1 evaluation metrics\nType: SECTION_TITLE" + }, + { + "src_entity_name": "accuracy", + "tgt_entity_name": "a.1.2 qa performance metrics", + "relation_name": "", + "weight": 10.0, + "description": "The metric 'Accuracy' is explicitly defined and detailed within section A.1.2.", + "source_ids": [ + 226 + ], + "source": "Name: accuracy\nType: EVALUATION_METRIC", + "target": "Name: a.1.2 qa performance metrics\nType: SECTION_TITLE" + }, + { + "src_entity_name": "table 4", + "tgt_entity_name": 
"datasets", + "relation_name": "", + "weight": 10.0, + "description": "table 4 lists the datasets used in the experiments", + "source_ids": [ + 139 + ], + "source": "Name: table 4\nType: TABLE", + "target": "Name: datasets\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "human annotators", + "tgt_entity_name": "table 4", + "relation_name": "", + "weight": 6.0, + "description": "the work of human annotators contributes to the statistics presented in table 4", + "source_ids": [ + 141 + ], + "source": "Name: table 4\nType: TABLE", + "target": "Name: human annotators\nType: PERSON" + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "table 4", + "relation_name": "", + "weight": 7.0, + "description": "statistics for mmlongbench are presented in table 4", + "source_ids": [ + 141 + ], + "source": "Name: table 4\nType: TABLE", + "target": "Name: mmlongbench\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "m3docvqa", + "tgt_entity_name": "table 4", + "relation_name": "", + "weight": 7.0, + "description": "statistics for m3docvqa are presented in table 4", + "source_ids": [ + 141 + ], + "source": "Name: table 4\nType: TABLE", + "target": "Name: m3docvqa\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "qasper", + "tgt_entity_name": "table 4", + "relation_name": "", + "weight": 7.0, + "description": "statistics for qasper are presented in table 4", + "source_ids": [ + 141 + ], + "source": "Name: table 4\nType: TABLE", + "target": "Name: qasper\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "exact match", + "tgt_entity_name": "em", + "relation_name": "", + "weight": 10.0, + "description": "exact match is the definition of the abbreviation em", + "source_ids": [ + 170 + ], + "source": "Name: em\nType: EVALUATION_METRIC", + "target": "Name: exact match\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "table 7", + "tgt_entity_name": "em", + "relation_name": "", + "weight": 9.0, + "description": "table 7 uses em exact match as a 
metric to evaluate qa performance", + "source_ids": [ + 170 + ], + "source": "Name: em\nType: EVALUATION_METRIC", + "target": "Name: table 7\nType: TABLE" + }, + { + "src_entity_name": "em", + "tgt_entity_name": "f1", + "relation_name": "", + "weight": 8.0, + "description": "em and f1 are both evaluation metrics used together to compare qa performance in table 7", + "source_ids": [ + 170 + ], + "source": "Name: em\nType: EVALUATION_METRIC", + "target": "Name: f1\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "f1 score", + "tgt_entity_name": "f1", + "relation_name": "", + "weight": 10.0, + "description": "f1 score is the definition of the abbreviation f1", + "source_ids": [ + 170 + ], + "source": "Name: f1\nType: EVALUATION_METRIC", + "target": "Name: f1 score\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "table 7", + "tgt_entity_name": "f1", + "relation_name": "", + "weight": 9.0, + "description": "table 7 uses f1 f1 score as a metric to evaluate qa performance", + "source_ids": [ + 170 + ], + "source": "Name: f1\nType: EVALUATION_METRIC", + "target": "Name: table 7\nType: TABLE" + }, + { + "src_entity_name": "exact match", + "tgt_entity_name": "token based f1 score", + "relation_name": "", + "weight": 9.0, + "description": "both are primary evaluation metrics used together in the assessment process", + "source_ids": [ + 144 + ], + "source": "Name: exact match\nType: EVALUATION_METRIC", + "target": "Name: token based f1 score\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "exact match", + "relation_name": "", + "weight": 8.0, + "description": "figure 7 uses exact match as a metric represented by blue bars for mmlongbench", + "source_ids": [ + 177 + ], + "source": "Name: exact match\nType: EVALUATION_METRIC", + "target": "Name: figure 7\nType: IMAGE" + }, + { + "src_entity_name": "exact match", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "exact match is the 
specific metric used to evaluate performance on the mmlongbench dataset in the figure", + "source_ids": [ + 177 + ], + "source": "Name: exact match\nType: EVALUATION_METRIC", + "target": "Name: mmlongbench\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "exact match", + "tgt_entity_name": "blue bars", + "relation_name": "", + "weight": 9.0, + "description": "exact match is visually represented by the blue bars in the figure", + "source_ids": [ + 177 + ], + "source": "Name: exact match\nType: EVALUATION_METRIC", + "target": "Name: blue bars\nType: IMAGE" + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "f1 score", + "relation_name": "", + "weight": 8.0, + "description": "figure 7 uses f1 score as a metric represented by red bars", + "source_ids": [ + 177 + ], + "source": "Name: f1 score\nType: EVALUATION_METRIC", + "target": "Name: figure 7\nType: IMAGE" + }, + { + "src_entity_name": "f1 score", + "tgt_entity_name": "red bars", + "relation_name": "", + "weight": 9.0, + "description": "f1 score is visually represented by the red bars in the figure", + "source_ids": [ + 177 + ], + "source": "Name: f1 score\nType: EVALUATION_METRIC", + "target": "Name: red bars\nType: IMAGE" + }, + { + "src_entity_name": "f1 score", + "tgt_entity_name": "token level f1 score", + "relation_name": "", + "weight": 9.0, + "description": "the token level f1 score is a specific application of the f1 score for text span answers", + "source_ids": [ + 231 + ], + "source": "Name: f1 score\nType: EVALUATION_METRIC", + "target": "Name: token level f1 score\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "f1 score", + "tgt_entity_name": "equation 19", + "relation_name": "", + "weight": 10.0, + "description": "equation 19 provides the mathematical formula for calculating the f1 score", + "source_ids": [ + 231 + ], + "source": "Name: f1 score\nType: EVALUATION_METRIC", + "target": "Name: equation 19\nType: EQUATION_OR_FORMULA" + }, + { + "src_entity_name": "table 5", + 
"tgt_entity_name": "datasets", + "relation_name": "", + "weight": 10.0, + "description": "table 5 evaluates methods across various datasets", + "source_ids": [ + 153 + ], + "source": "Name: datasets\nType: DATASET_OR_CORPUS", + "target": "Name: table 5\nType: TABLE" + }, + { + "src_entity_name": "performance comparison", + "tgt_entity_name": "datasets", + "relation_name": "", + "weight": 8.0, + "description": "the performance comparison is conducted across various datasets", + "source_ids": [ + 153 + ], + "source": "Name: datasets\nType: DATASET_OR_CORPUS", + "target": "Name: performance comparison\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "datasets", + "relation_name": "", + "weight": 8.0, + "description": "the gradient based er method s performance is evaluated across multiple datasets", + "source_ids": [ + 176 + ], + "source": "Name: datasets\nType: DATASET_OR_CORPUS", + "target": "Name: gradient based entity resolution\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "table: cref='#/texts/143'...", + "tgt_entity_name": "texts/143", + "relation_name": "", + "weight": 9.0, + "description": "Table 'Table: cref='#/texts/143'...' 
contains data about 'texts/143'.", + "source_ids": [ + 140 + ], + "source": "Name: table: cref='#/texts/143'...\nType: TABLE", + "target": "Name: texts/143\nType: SECTION_TITLE" + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "m3docvqa", + "relation_name": "", + "weight": 9.0, + "description": "both are widely adopted benchmarking datasets used for complex document qa tasks", + "source_ids": [ + 141 + ], + "source": "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "target": "Name: m3docvqa\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "both are widely adopted benchmarking datasets used for complex document qa tasks", + "source_ids": [ + 141 + ], + "source": "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "target": "Name: qasper\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "guidebooks", + "relation_name": "", + "weight": 8.0, + "description": "mmlongbench covers guidebooks as a category of documents", + "source_ids": [ + 141 + ], + "source": "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "target": "Name: guidebooks\nType: PRODUCT" + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "financial reports", + "relation_name": "", + "weight": 8.0, + "description": "mmlongbench covers financial reports as a category of documents", + "source_ids": [ + 141 + ], + "source": "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "target": "Name: financial reports\nType: PRODUCT" + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "industry files", + "relation_name": "", + "weight": 8.0, + "description": "mmlongbench covers industry files as a category of documents", + "source_ids": [ + 141 + ], + "source": "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "target": "Name: industry files\nType: PRODUCT" + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "page numbers", 
+ "relation_name": "", + "weight": 7.0, + "description": "mmlongbench provides page numbers used to filter candidate blocks", + "source_ids": [ + 144 + ], + "source": "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "target": "Name: page numbers\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to MMLongBench", + "source_ids": [ + 159 + ], + "source": "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "docetl consumes over 53 million tokens on the mmlongbench dataset", + "source_ids": [ + 160 + ], + "source": "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "target": "Name: docetl\nType: PRODUCT" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to MMLongBench", + "source_ids": [ + 175 + ], + "source": "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "target": "Name: cref='#/texts/224'\nType: IMAGE" + }, + { + "src_entity_name": "figure 8", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "figure 8 presents a case study involving responses from mmlongbench", + "source_ids": [ + 181 + ], + "source": "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "target": "Name: figure 8\nType: IMAGE" + }, + { + "src_entity_name": "case study", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "the case study uses responses from mmlongbench", + "source_ids": [ + 181 + ], + "source": "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "target": "Name: case study\nType: EVENT" + }, + { + "src_entity_name": "image 
cref='#/texts/282'", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to MMLongBench", + "source_ids": [ + 182 + ], + "source": "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "figure 9", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "figure 9 presents an error analysis on queries sampled from the mmlongbench dataset", + "source_ids": [ + 183 + ], + "source": "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "target": "Name: figure 9\nType: IMAGE" + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "200", + "relation_name": "", + "weight": 8.0, + "description": "200 sampled queries were taken from the mmlongbench dataset", + "source_ids": [ + 183 + ], + "source": "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "target": "Name: 200\nType: MEASUREMENT" + }, + { + "src_entity_name": "m3docvqa", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "both are widely adopted benchmarking datasets used for complex document qa tasks", + "source_ids": [ + 141 + ], + "source": "Name: m3docvqa\nType: DATASET_OR_CORPUS", + "target": "Name: qasper\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "m3docvqa", + "tgt_entity_name": "html type documents", + "relation_name": "", + "weight": 9.0, + "description": "m3docvqa tests rag systems on a collection of html type documents", + "source_ids": [ + 141 + ], + "source": "Name: m3docvqa\nType: DATASET_OR_CORPUS", + "target": "Name: html type documents\nType: PRODUCT" + }, + { + "src_entity_name": "m3docvqa", + "tgt_entity_name": "rag systems", + "relation_name": "", + "weight": 10.0, + "description": "m3docvqa is designed to test rag systems", + "source_ids": [ + 141 + ], + "source": "Name: m3docvqa\nType: DATASET_OR_CORPUS", + "target": "Name: rag 
systems\nType: SOFTWARE" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "m3docvqa", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to M3DocVQA", + "source_ids": [ + 159 + ], + "source": "Name: m3docvqa\nType: DATASET_OR_CORPUS", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "qasper", + "tgt_entity_name": "scientific papers", + "relation_name": "", + "weight": 10.0, + "description": "qasper is focused on scientific papers", + "source_ids": [ + 141 + ], + "source": "Name: qasper\nType: DATASET_OR_CORPUS", + "target": "Name: scientific papers\nType: PRODUCT" + }, + { + "src_entity_name": "qasper", + "tgt_entity_name": "evidence statements", + "relation_name": "", + "weight": 7.0, + "description": "qasper provides evidence statements used to filter candidate blocks", + "source_ids": [ + 144 + ], + "source": "Name: qasper\nType: DATASET_OR_CORPUS", + "target": "Name: evidence statements\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to Qasper", + "source_ids": [ + 159 + ], + "source": "Name: qasper\nType: DATASET_OR_CORPUS", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "w o selector variant", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 7.0, + "description": "the w o selector variant incurs a computational cost measured in tokens on the qasper dataset", + "source_ids": [ + 172 + ], + "source": "Name: qasper\nType: DATASET_OR_CORPUS", + "target": "Name: w o selector variant\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "ift inspired selection mechanism", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 7.0, + "description": "the ift inspired selection mechanism s efficiency is validated using 
the qasper dataset", + "source_ids": [ + 172 + ], + "source": "Name: qasper\nType: DATASET_OR_CORPUS", + "target": "Name: ift inspired selection mechanism\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to Qasper", + "source_ids": [ + 175 + ], + "source": "Name: qasper\nType: DATASET_OR_CORPUS", + "target": "Name: cref='#/texts/224'\nType: IMAGE" + }, + { + "src_entity_name": "figure 8", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "figure 8 presents a case study involving responses from qasper", + "source_ids": [ + 181 + ], + "source": "Name: qasper\nType: DATASET_OR_CORPUS", + "target": "Name: figure 8\nType: IMAGE" + }, + { + "src_entity_name": "case study", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "the case study uses responses from qasper", + "source_ids": [ + 181 + ], + "source": "Name: qasper\nType: DATASET_OR_CORPUS", + "target": "Name: case study\nType: EVENT" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Qasper", + "source_ids": [ + 182 + ], + "source": "Name: qasper\nType: DATASET_OR_CORPUS", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "figure 9", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "figure 9 presents an error analysis on queries sampled from the qasper dataset", + "source_ids": [ + 183 + ], + "source": "Name: qasper\nType: DATASET_OR_CORPUS", + "target": "Name: figure 9\nType: IMAGE" + }, + { + "src_entity_name": "qasper", + "tgt_entity_name": "200", + "relation_name": "", + "weight": 8.0, + "description": "200 sampled queries were taken from the qasper dataset", 
+ "source_ids": [ + 183 + ], + "source": "Name: qasper\nType: DATASET_OR_CORPUS", + "target": "Name: 200\nType: MEASUREMENT" + }, + { + "src_entity_name": "human annotators", + "tgt_entity_name": "qa pairs", + "relation_name": "", + "weight": 9.0, + "description": "human annotators answer and refine qa pairs", + "source_ids": [ + 141 + ], + "source": "Name: human annotators\nType: PERSON", + "target": "Name: qa pairs\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "20", + "relation_name": "", + "weight": 10.0, + "description": "the gradient based er method achieves a boost of over 20 in graph density", + "source_ids": [ + 176 + ], + "source": "Name: 20\nType: PERCENTAGE", + "target": "Name: gradient based entity resolution\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "html type documents", + "tgt_entity_name": "wikipedia pages", + "relation_name": "", + "weight": 10.0, + "description": "the html type documents are sourced from wikipedia pages", + "source_ids": [ + 141 + ], + "source": "Name: html type documents\nType: PRODUCT", + "target": "Name: wikipedia pages\nType: LOCATION" + }, + { + "src_entity_name": "paper", + "tgt_entity_name": "figures", + "relation_name": "", + "weight": 6.0, + "description": "the example query asks to count figures in the paper", + "source_ids": [ + 258 + ], + "source": "Name: figures\nType: IMAGE", + "target": "Name: paper\nType: BOOK" + }, + { + "src_entity_name": "wikipedia", + "tgt_entity_name": "https www wikipedia org", + "relation_name": "", + "weight": 10.0, + "description": "wikipedia is the organization represented by the url https www wikipedia org", + "source_ids": [ + 142 + ], + "source": "Name: wikipedia\nType: ORGANIZATION", + "target": "Name: https www wikipedia org\nType: LOCATION" + }, + { + "src_entity_name": "time cost", + "tgt_entity_name": "token usage", + "relation_name": "", + "weight": 8.0, + "description": "both are metrics used to assess 
efficiency during the response phase", + "source_ids": [ + 144 + ], + "source": "Name: time cost\nType: EVALUATION_METRIC", + "target": "Name: token usage\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "time cost", + "tgt_entity_name": "response phase", + "relation_name": "", + "weight": 8.0, + "description": "time cost is measured during the response phase", + "source_ids": [ + 144 + ], + "source": "Name: time cost\nType: EVALUATION_METRIC", + "target": "Name: response phase\nType: TIME" + }, + { + "src_entity_name": "token usage", + "tgt_entity_name": "response phase", + "relation_name": "", + "weight": 8.0, + "description": "token usage is measured during the response phase", + "source_ids": [ + 144 + ], + "source": "Name: token usage\nType: EVALUATION_METRIC", + "target": "Name: response phase\nType: TIME" + }, + { + "src_entity_name": "pdf parsing", + "tgt_entity_name": "pdf blocks", + "relation_name": "", + "weight": 7.0, + "description": "pdf parsing errors affect the availability of items within pdf blocks", + "source_ids": [ + 144 + ], + "source": "Name: pdf parsing\nType: METHOD_OR_TECHNIQUE", + "target": "Name: pdf blocks\nType: TABLE" + }, + { + "src_entity_name": "texts", + "tgt_entity_name": "formulas", + "relation_name": "", + "weight": 6.0, + "description": "both are types of pdf blocks manually labeled to establish ground truth", + "source_ids": [ + 144 + ], + "source": "Name: texts\nType: TABLE", + "target": "Name: formulas\nType: TABLE" + }, + { + "src_entity_name": "ground truth", + "tgt_entity_name": "pdf blocks", + "relation_name": "", + "weight": 10.0, + "description": "pdf blocks are manually labeled to establish the ground truth", + "source_ids": [ + 144 + ], + "source": "Name: ground truth\nType: CONCEPT", + "target": "Name: pdf blocks\nType: TABLE" + }, + { + "src_entity_name": "metadata", + "tgt_entity_name": "ground truth", + "relation_name": "", + "weight": 8.0, + "description": "metadata provides the ground truth evidence 
used to guide the labeling process", + "source_ids": [ + 144 + ], + "source": "Name: ground truth\nType: CONCEPT", + "target": "Name: metadata\nType: CONCEPT" + }, + { + "src_entity_name": "llm based extraction step", + "tgt_entity_name": "ground truth", + "relation_name": "", + "weight": 9.0, + "description": "the llm based extraction step aligns the model output with the ground truth format", + "source_ids": [ + 224 + ], + "source": "Name: ground truth\nType: CONCEPT", + "target": "Name: llm based extraction step\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "candidate blocks", + "tgt_entity_name": "modality", + "relation_name": "", + "weight": 8.0, + "description": "candidate blocks are filtered using the given modality", + "source_ids": [ + 144 + ], + "source": "Name: modality\nType: CONCEPT", + "target": "Name: candidate blocks\nType: TABLE" + }, + { + "src_entity_name": "candidate blocks", + "tgt_entity_name": "pdf blocks", + "relation_name": "", + "weight": 7.0, + "description": "candidate blocks are filtered from the set of pdf blocks", + "source_ids": [ + 144 + ], + "source": "Name: pdf blocks\nType: TABLE", + "target": "Name: candidate blocks\nType: TABLE" + }, + { + "src_entity_name": "candidate blocks", + "tgt_entity_name": "page numbers", + "relation_name": "", + "weight": 8.0, + "description": "candidate blocks are filtered using page numbers from mmlongbench", + "source_ids": [ + 144 + ], + "source": "Name: candidate blocks\nType: TABLE", + "target": "Name: page numbers\nType: UNKNOWN" + }, + { + "src_entity_name": "candidate blocks", + "tgt_entity_name": "evidence statements", + "relation_name": "", + "weight": 8.0, + "description": "candidate blocks are filtered using evidence statements from qasper", + "source_ids": [ + 144 + ], + "source": "Name: candidate blocks\nType: TABLE", + "target": "Name: evidence statements\nType: UNKNOWN" + }, + { + "src_entity_name": "baselines", + "tgt_entity_name": "three model configurations", + 
"relation_name": "", + "weight": 9.0, + "description": "the baselines consist of or are defined by the three model configurations used in the experiments", + "source_ids": [ + 145 + ], + "source": "Name: baselines\nType: TASK_OR_PROBLEM", + "target": "Name: three model configurations\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "our experiments", + "tgt_entity_name": "baselines", + "relation_name": "", + "weight": 9.0, + "description": "the experiments consider the baselines as part of their evaluation process", + "source_ids": [ + 145 + ], + "source": "Name: baselines\nType: TASK_OR_PROBLEM", + "target": "Name: our experiments\nType: EVENT" + }, + { + "src_entity_name": "our experiments", + "tgt_entity_name": "three model configurations", + "relation_name": "", + "weight": 10.0, + "description": "the experiments explicitly consider three model configurations as their primary focus", + "source_ids": [ + 145 + ], + "source": "Name: three model configurations\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: our experiments\nType: EVENT" + }, + { + "src_entity_name": "conventional rag", + "tgt_entity_name": "bm25", + "relation_name": "", + "weight": 9.0, + "description": "conventional rag is the pipeline where bm25 is selected as a retrieval model", + "source_ids": [ + 146 + ], + "source": "Name: conventional rag\nType: TASK_OR_PROBLEM", + "target": "Name: bm25\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "conventional rag", + "tgt_entity_name": "vanilla rag", + "relation_name": "", + "weight": 9.0, + "description": "conventional rag is the pipeline where vanilla rag is selected as a retrieval model", + "source_ids": [ + 146 + ], + "source": "Name: conventional rag\nType: TASK_OR_PROBLEM", + "target": "Name: vanilla rag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "layout vanilla", + "tgt_entity_name": "conventional rag", + "relation_name": "", + "weight": 8.0, + "description": "layout vanilla is implemented as part of the 
conventional rag pipeline described in the text", + "source_ids": [ + 146 + ], + "source": "Name: conventional rag\nType: TASK_OR_PROBLEM", + "target": "Name: layout vanilla\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "layout vanilla", + "tgt_entity_name": "vanilla rag", + "relation_name": "", + "weight": 10.0, + "description": "layout vanilla is a variant that builds upon vanilla rag by adding document layout analysis", + "source_ids": [ + 146 + ], + "source": "Name: vanilla rag\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: layout vanilla\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "graphrag", + "tgt_entity_name": "graphrag global", + "relation_name": "", + "weight": 10.0, + "description": "graphrag global is a version of graphrag that uses global search methods", + "source_ids": [ + 147 + ], + "source": "Name: graphrag\nType: TECHNOLOGY", + "target": "Name: graphrag global\nType: TECHNOLOGY" + }, + { + "src_entity_name": "graphrag", + "tgt_entity_name": "graphrag local", + "relation_name": "", + "weight": 10.0, + "description": "graphrag local is a version of graphrag that uses local search methods", + "source_ids": [ + 147 + ], + "source": "Name: graphrag\nType: TECHNOLOGY", + "target": "Name: graphrag local\nType: TECHNOLOGY" + }, + { + "src_entity_name": "graphrag global", + "tgt_entity_name": "global search methods", + "relation_name": "", + "weight": 10.0, + "description": "graphrag global employs global search methods", + "source_ids": [ + 147 + ], + "source": "Name: graphrag global\nType: TECHNOLOGY", + "target": "Name: global search methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "graphrag local", + "tgt_entity_name": "local search methods", + "relation_name": "", + "weight": 10.0, + "description": "graphrag local employs local search methods", + "source_ids": [ + 147 + ], + "source": "Name: graphrag local\nType: TECHNOLOGY", + "target": "Name: local search methods\nType: METHOD_OR_TECHNIQUE" + }, + { 
+ "src_entity_name": "layoutsegmentedrag", + "tgt_entity_name": "mm vanilla", + "relation_name": "", + "weight": 9.0, + "description": "mm vanilla is included as a method within the layoutsegmentedrag category", + "source_ids": [ + 148 + ], + "source": "Name: layoutsegmentedrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: mm vanilla\nType: PRODUCT" + }, + { + "src_entity_name": "layoutsegmentedrag", + "tgt_entity_name": "treetraverse", + "relation_name": "", + "weight": 9.0, + "description": "treetraverse is included as a method within the layoutsegmentedrag category", + "source_ids": [ + 148 + ], + "source": "Name: layoutsegmentedrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: treetraverse\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "layoutsegmentedrag", + "tgt_entity_name": "graphranker", + "relation_name": "", + "weight": 9.0, + "description": "graphranker is included as a method within the layoutsegmentedrag category", + "source_ids": [ + 148 + ], + "source": "Name: layoutsegmentedrag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: graphranker\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "treetraverse", + "tgt_entity_name": "pageindex", + "relation_name": "", + "weight": 8.0, + "description": "treetraverse is inspired by pageindex", + "source_ids": [ + 148 + ], + "source": "Name: pageindex\nType: PRODUCT", + "target": "Name: treetraverse\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "pageindex", + "tgt_entity_name": "page 39", + "relation_name": "", + "weight": 5.0, + "description": "pageindex is referenced in citation page 39", + "source_ids": [ + 148 + ], + "source": "Name: pageindex\nType: PRODUCT", + "target": "Name: page 39\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "graphranker", + "tgt_entity_name": "hipporag", + "relation_name": "", + "weight": 9.0, + "description": "graphranker is extended from hipporag", + "source_ids": [ + 148 + ], + "source": "Name: graphranker\nType: METHOD_OR_TECHNIQUE", + 
"target": "Name: hipporag\nType: METHOD_OR_ARCHITECTURE" + }, + { + "src_entity_name": "graphranker", + "tgt_entity_name": "personalized pagerank", + "relation_name": "", + "weight": 9.0, + "description": "graphranker applies personalized pagerank to rank relevant nodes", + "source_ids": [ + 148 + ], + "source": "Name: graphranker\nType: METHOD_OR_TECHNIQUE", + "target": "Name: personalized pagerank\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "graphranker", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to GraphRanker", + "source_ids": [ + 159 + ], + "source": "Name: graphranker\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "hipporag", + "tgt_entity_name": "page 19", + "relation_name": "", + "weight": 5.0, + "description": "hipporag is referenced in citation page 19", + "source_ids": [ + 148 + ], + "source": "Name: hipporag\nType: METHOD_OR_ARCHITECTURE", + "target": "Name: page 19\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "personalized pagerank", + "tgt_entity_name": "page 20", + "relation_name": "", + "weight": 5.0, + "description": "personalized pagerank is referenced in citation page 20", + "source_ids": [ + 148 + ], + "source": "Name: personalized pagerank\nType: METHOD_OR_TECHNIQUE", + "target": "Name: page 20\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "baseline methods", + "tgt_entity_name": "qwen family", + "relation_name": "", + "weight": 9.0, + "description": "baseline methods are also powered by backbone models from the qwen family", + "source_ids": [ + 149 + ], + "source": "Name: qwen family\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: baseline methods\nType: UNKNOWN" + }, + { + "src_entity_name": "qwen family", + "tgt_entity_name": "vlm", + "relation_name": "", + "weight": 9.0, + "description": "the qwen family includes vlms used 
in the experiments", + "source_ids": [ + 238 + ], + "source": "Name: qwen family\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: vlm\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "qwen family", + "tgt_entity_name": "embedding models", + "relation_name": "", + "weight": 9.0, + "description": "the qwen family includes embedding models used in the experiments", + "source_ids": [ + 238 + ], + "source": "Name: qwen family\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: embedding models\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "qwen family", + "tgt_entity_name": "qwen3 8b", + "relation_name": "", + "weight": 10.0, + "description": "qwen3 8b is a specific model from the qwen family used as the default llm", + "source_ids": [ + 238 + ], + "source": "Name: qwen family\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: qwen3 8b\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "qwen family", + "tgt_entity_name": "qwen2 5vl 30b", + "relation_name": "", + "weight": 10.0, + "description": "qwen2 5vl 30b is a specific model from the qwen family used as the vlm", + "source_ids": [ + 238 + ], + "source": "Name: qwen family\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: qwen2 5vl 30b\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "qwen family", + "tgt_entity_name": "qwen3 embedding 0 6b", + "relation_name": "", + "weight": 10.0, + "description": "qwen3 embedding 0 6b is a specific model from the qwen family used for text embedding", + "source_ids": [ + 238 + ], + "source": "Name: qwen family\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: qwen3 embedding 0 6b\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "qwen family", + "tgt_entity_name": "gme qwen2 vl 2b instruct", + "relation_name": "", + "weight": 10.0, + "description": "gme qwen2 vl 2b instruct is a specific model from the qwen family used for multi modal embedding", + "source_ids": [ + 238 + ], + "source": "Name: qwen family\nType: 
MODEL_OR_ARCHITECTURE", + "target": "Name: gme qwen2 vl 2b instruct\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "qwen family", + "tgt_entity_name": "qwen3 reranker 4b", + "relation_name": "", + "weight": 10.0, + "description": "qwen3 reranker 4b is a specific model from the qwen family used for reranking", + "source_ids": [ + 238 + ], + "source": "Name: qwen family\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: qwen3 reranker 4b\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "mineru", + "tgt_entity_name": "robust document layout parsing", + "relation_name": "", + "weight": 10.0, + "description": "mineru is utilized for robust document layout parsing", + "source_ids": [ + 238 + ], + "source": "Name: mineru\nType: SOFTWARE", + "target": "Name: robust document layout parsing\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "mineru", + "tgt_entity_name": "reference 52", + "relation_name": "", + "weight": 10.0, + "description": "mineru is cited in reference 52", + "source_ids": [ + 238 + ], + "source": "Name: mineru\nType: SOFTWARE", + "target": "Name: reference 52\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "github com sam234990 bookrag", + "tgt_entity_name": "prompts", + "relation_name": "", + "weight": 10.0, + "description": "prompts are available at the specified github location", + "source_ids": [ + 149 + ], + "source": "Name: github com sam234990 bookrag\nType: LOCATION", + "target": "Name: prompts\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "github com sam234990 bookrag", + "tgt_entity_name": "detailed configurations", + "relation_name": "", + "weight": 10.0, + "description": "detailed configurations are available at the specified github location", + "source_ids": [ + 149 + ], + "source": "Name: github com sam234990 bookrag\nType: LOCATION", + "target": "Name: detailed configurations\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "implementation details", + "tgt_entity_name": "technical report", + 
"relation_name": "", + "weight": 7.0, + "description": "more details about the implementation are provided in the technical report", + "source_ids": [ + 149 + ], + "source": "Name: technical report\nType: PUBLICATION_VENUE", + "target": "Name: implementation details\nType: UNKNOWN" + }, + { + "src_entity_name": "technical report", + "tgt_entity_name": "appendix", + "relation_name": "", + "weight": 9.0, + "description": "the appendix is a section within the technical report containing more details", + "source_ids": [ + 149 + ], + "source": "Name: technical report\nType: PUBLICATION_VENUE", + "target": "Name: appendix\nType: SECTION_TITLE" + }, + { + "src_entity_name": "qwen2 5 vl technical report", + "tgt_entity_name": "technical report", + "relation_name": "", + "weight": 9.0, + "description": "the document is identified as a technical report", + "source_ids": [ + 194 + ], + "source": "Name: technical report\nType: PUBLICATION_VENUE", + "target": "Name: qwen2 5 vl technical report\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "table 5", + "tgt_entity_name": "performance comparison", + "relation_name": "", + "weight": 10.0, + "description": "table 5 presents the performance comparison of different methods", + "source_ids": [ + 153 + ], + "source": "Name: table 5\nType: TABLE", + "target": "Name: performance comparison\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "table 5", + "tgt_entity_name": "different methods", + "relation_name": "", + "weight": 10.0, + "description": "table 5 compares the performance of different methods", + "source_ids": [ + 153 + ], + "source": "Name: table 5\nType: TABLE", + "target": "Name: different methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "table 5", + "tgt_entity_name": "best results", + "relation_name": "", + "weight": 9.0, + "description": "table 5 marks the best results in bold", + "source_ids": [ + 153 + ], + "source": "Name: table 5\nType: TABLE", + "target": "Name: best results\nType: 
EVALUATION_METRIC" + }, + { + "src_entity_name": "table 5", + "tgt_entity_name": "second best results", + "relation_name": "", + "weight": 9.0, + "description": "table 5 marks the second best results in underlined", + "source_ids": [ + 153 + ], + "source": "Name: table 5\nType: TABLE", + "target": "Name: second best results\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "table 5", + "tgt_entity_name": "bold", + "relation_name": "", + "weight": 9.0, + "description": "table 5 uses bold formatting to highlight specific results", + "source_ids": [ + 153 + ], + "source": "Name: table 5\nType: TABLE", + "target": "Name: bold\nType: COLOR" + }, + { + "src_entity_name": "table 5", + "tgt_entity_name": "underlined", + "relation_name": "", + "weight": 9.0, + "description": "table 5 uses underlined formatting to highlight specific results", + "source_ids": [ + 153 + ], + "source": "Name: table 5\nType: TABLE", + "target": "Name: underlined\nType: SHAPE" + }, + { + "src_entity_name": "layout vanilla", + "tgt_entity_name": "vanilla rag", + "relation_name": "", + "weight": 9.0, + "description": "layout vanilla consistently outperforms vanilla rag in the comparison", + "source_ids": [ + 152 + ], + "source": "Name: layout vanilla\nType: PRODUCT", + "target": "Name: vanilla rag\nType: PRODUCT" + }, + { + "src_entity_name": "tree traverse", + "tgt_entity_name": "graphranker", + "relation_name": "", + "weight": 7.0, + "description": "both tree traverse and graphranker are highlighted for having suboptimal results due to similar limitations", + "source_ids": [ + 152 + ], + "source": "Name: tree traverse\nType: PRODUCT", + "target": "Name: graphranker\nType: PRODUCT" + }, + { + "src_entity_name": "tree traverse", + "tgt_entity_name": "hierarchical navigation", + "relation_name": "", + "weight": 9.0, + "description": "tree traverse relies on hierarchical navigation which leads to suboptimal results", + "source_ids": [ + 152 + ], + "source": "Name: tree traverse\nType: PRODUCT", 
+ "target": "Name: hierarchical navigation\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "tree traverse", + "tgt_entity_name": "cross sectional context", + "relation_name": "", + "weight": 8.0, + "description": "tree traverse often misses cross sectional context due to its reliance on hierarchical navigation", + "source_ids": [ + 152 + ], + "source": "Name: tree traverse\nType: PRODUCT", + "target": "Name: cross sectional context\nType: CONCEPT" + }, + { + "src_entity_name": "graphranker", + "tgt_entity_name": "graph based reasoning", + "relation_name": "", + "weight": 9.0, + "description": "graphranker relies on graph based reasoning which leads to suboptimal results", + "source_ids": [ + 152 + ], + "source": "Name: graphranker\nType: PRODUCT", + "target": "Name: graph based reasoning\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "graphranker", + "tgt_entity_name": "irrelevant scopes", + "relation_name": "", + "weight": 8.0, + "description": "graphranker often drifts into irrelevant scopes due to its reliance on graph based reasoning", + "source_ids": [ + 152 + ], + "source": "Name: graphranker\nType: PRODUCT", + "target": "Name: irrelevant scopes\nType: CONCEPT" + }, + { + "src_entity_name": "qa performance", + "tgt_entity_name": "query types", + "relation_name": "", + "weight": 9.0, + "description": "qa performance is measured under different query types", + "source_ids": [ + 179 + ], + "source": "Name: qa performance\nType: TASK_OR_PROBLEM", + "target": "Name: query types\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "performance comparison", + "tgt_entity_name": "different methods", + "relation_name": "", + "weight": 8.0, + "description": "the performance comparison involves evaluating different methods", + "source_ids": [ + 153 + ], + "source": "Name: performance comparison\nType: TASK_OR_PROBLEM", + "target": "Name: different methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "best results", + "tgt_entity_name": 
"bold", + "relation_name": "", + "weight": 10.0, + "description": "the best results are identified by being marked in bold", + "source_ids": [ + 153 + ], + "source": "Name: best results\nType: EVALUATION_METRIC", + "target": "Name: bold\nType: COLOR" + }, + { + "src_entity_name": "second best results", + "tgt_entity_name": "underlined", + "relation_name": "", + "weight": 10.0, + "description": "the second best results are identified by being marked as underlined", + "source_ids": [ + 153 + ], + "source": "Name: second best results\nType: EVALUATION_METRIC", + "target": "Name: underlined\nType: SHAPE" + }, + { + "src_entity_name": "table: cref='#/texts/156'...", + "tgt_entity_name": "cref", + "relation_name": "", + "weight": 9.0, + "description": "Table 'Table: cref='#/texts/156'...' contains data about 'cref'.", + "source_ids": [ + 154 + ], + "source": "Name: table: cref='#/texts/156'...\nType: TABLE", + "target": "Name: cref\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "table: cref='#/texts/220'...", + "tgt_entity_name": "cref", + "relation_name": "", + "weight": 9.0, + "description": "Table 'Table: cref='#/texts/220'...' 
contains data about 'cref'.", + "source_ids": [ + 171 + ], + "source": "Name: cref\nType: PARAMETER_OR_VARIABLE", + "target": "Name: table: cref='#/texts/220'...\nType: TABLE" + }, + { + "src_entity_name": "table 6", + "tgt_entity_name": "layout based methods", + "relation_name": "", + "weight": 10.0, + "description": "table 6 compares the performance of various layout based methods", + "source_ids": [ + 155 + ], + "source": "Name: table 6\nType: TABLE", + "target": "Name: layout based methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "proc vldb endow", + "tgt_entity_name": "10", + "relation_name": "", + "weight": 8.0, + "description": "the paper was published in issue 10 of proc vldb endow", + "source_ids": [ + 196 + ], + "source": "Name: 10\nType: MEASUREMENT", + "target": "Name: proc vldb endow\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "page", + "tgt_entity_name": "10", + "relation_name": "", + "weight": 7.0, + "description": "10 is part of the page range", + "source_ids": [ + 258 + ], + "source": "Name: 10\nType: MEASUREMENT", + "target": "Name: page\nType: MEASUREMENT" + }, + { + "src_entity_name": "figure 5", + "tgt_entity_name": "query efficiency", + "relation_name": "", + "weight": 9.0, + "description": "figure 5 displays a comparison of the query efficiency metric", + "source_ids": [ + 158 + ], + "source": "Name: figure 5\nType: IMAGE", + "target": "Name: query efficiency\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "figure 5", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to Figure 5", + "source_ids": [ + 159 + ], + "source": "Name: figure 5\nType: IMAGE", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "bm25", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to 
BM25", + "source_ids": [ + 159 + ], + "source": "Name: bm25\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "vanilla rag", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to Vanilla RAG", + "source_ids": [ + 159 + ], + "source": "Name: vanilla rag\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "layout + vanilla", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to Layout + Vanilla", + "source_ids": [ + 159 + ], + "source": "Name: layout + vanilla\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "raptor", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to RAPTOR", + "source_ids": [ + 159 + ], + "source": "Name: raptor\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "graphrag-local", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to GraphRAG-Local", + "source_ids": [ + 159 + ], + "source": "Name: graphrag-local\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "graphrag-global", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to GraphRAG-Global", + "source_ids": [ + 159 + ], + "source": "Name: graphrag-global\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { 
+ "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "mm-vanilla", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to MM-Vanilla", + "source_ids": [ + 159 + ], + "source": "Name: mm-vanilla\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "tree-traverse", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to Tree-Traverse", + "source_ids": [ + 159 + ], + "source": "Name: tree-traverse\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "query time", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to Query Time", + "source_ids": [ + 159 + ], + "source": "Name: query time\nType: EVALUATION_METRIC", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "token cost", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to Token cost", + "source_ids": [ + 159 + ], + "source": "Name: token cost\nType: EVALUATION_METRIC", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "time (s)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to time (s)", + "source_ids": [ + 159 + ], + "source": "Name: time (s)\nType: MEASUREMENT", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "token (m)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to token (M)", + 
"source_ids": [ + 159 + ], + "source": "Name: token (m)\nType: MEASUREMENT", + "target": "Name: image cref='#/texts/161'\nType: UNKNOWN" + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "53 million tokens", + "relation_name": "", + "weight": 10.0, + "description": "docetl consumes 53 million tokens on the mmlongbench dataset", + "source_ids": [ + 160 + ], + "source": "Name: docetl\nType: PRODUCT", + "target": "Name: 53 million tokens\nType: MEASUREMENT" + }, + { + "src_entity_name": "gradient based er", + "tgt_entity_name": "qa performance", + "relation_name": "", + "weight": 9.0, + "description": "gradient based er is evaluated for its impact on qa performance", + "source_ids": [ + 163 + ], + "source": "Name: gradient based er\nType: METHOD_OR_TECHNIQUE", + "target": "Name: qa performance\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "query types", + "relation_name": "", + "weight": 9.0, + "description": "figure 7 breaks down performance by different query types", + "source_ids": [ + 177 + ], + "source": "Name: query types\nType: TASK_OR_PROBLEM", + "target": "Name: figure 7\nType: IMAGE" + }, + { + "src_entity_name": "agent based planning strategy", + "tgt_entity_name": "query types", + "relation_name": "", + "weight": 9.0, + "description": "the agent based planning strategy is designed to handle different query types separately", + "source_ids": [ + 179 + ], + "source": "Name: query types\nType: TASK_OR_PROBLEM", + "target": "Name: agent based planning strategy\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "case study", + "tgt_entity_name": "query types", + "relation_name": "", + "weight": 9.0, + "description": "the case study analyzes responses across different query types", + "source_ids": [ + 181 + ], + "source": "Name: query types\nType: TASK_OR_PROBLEM", + "target": "Name: case study\nType: EVENT" + }, + { + "src_entity_name": "gradient er", + "tgt_entity_name": "basic er", + 
"relation_name": "", + "weight": 10.0, + "description": "basic er replaces gradient er by merging same name entities", + "source_ids": [ + 165 + ], + "source": "Name: gradient er\nType: METHOD_OR_TECHNIQUE", + "target": "Name: basic er\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "w o gradient er", + "tgt_entity_name": "gradient er", + "relation_name": "", + "weight": 9.0, + "description": "the w o gradient er scenario involves the replacement of gradient er", + "source_ids": [ + 165 + ], + "source": "Name: gradient er\nType: METHOD_OR_TECHNIQUE", + "target": "Name: w o gradient er\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "w o gradient er", + "tgt_entity_name": "basic er", + "relation_name": "", + "weight": 9.0, + "description": "the w o gradient er scenario involves the use of basic er as the replacement method", + "source_ids": [ + 165 + ], + "source": "Name: basic er\nType: METHOD_OR_TECHNIQUE", + "target": "Name: w o gradient er\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "basic er", + "tgt_entity_name": "same name entities", + "relation_name": "", + "weight": 10.0, + "description": "basic er is the method used to merge same name entities", + "source_ids": [ + 165 + ], + "source": "Name: basic er\nType: METHOD_OR_TECHNIQUE", + "target": "Name: same name entities\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "selector operators", + "tgt_entity_name": "reasoners", + "relation_name": "", + "weight": 9.0, + "description": "removing selector operators directly changes how reasoners operate by forcing them to score all candidate nodes", + "source_ids": [ + 167 + ], + "source": "Name: reasoners\nType: TECHNOLOGY", + "target": "Name: selector operators\nType: TECHNOLOGY" + }, + { + "src_entity_name": "reasoners", + "tgt_entity_name": "candidate nodes", + "relation_name": "", + "weight": 8.0, + "description": "reasoners perform the action of scoring candidate nodes especially when selector operators are absent", + 
"source_ids": [ + 167 + ], + "source": "Name: reasoners\nType: TECHNOLOGY", + "target": "Name: candidate nodes\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "w o gradient er variant", + "tgt_entity_name": "kg", + "relation_name": "", + "weight": 9.0, + "description": "the w o gradient er variant highlights the critical role of the kg in the system", + "source_ids": [ + 172 + ], + "source": "Name: kg\nType: DATASET_OR_CORPUS", + "target": "Name: w o gradient er variant\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "w o selector variant", + "tgt_entity_name": "ift inspired selection mechanism", + "relation_name": "", + "weight": 8.0, + "description": "the w o selector variant validates the efficiency of the ift inspired selection mechanism", + "source_ids": [ + 172 + ], + "source": "Name: ift inspired selection mechanism\nType: METHOD_OR_TECHNIQUE", + "target": "Name: w o selector variant\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "figure 6", + "tgt_entity_name": "basic setting", + "relation_name": "", + "weight": 9.0, + "description": "figure 6 compares graph statistics by normalizing values to the basic setting", + "source_ids": [ + 174 + ], + "source": "Name: figure 6\nType: IMAGE", + "target": "Name: basic setting\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "figure 6", + "tgt_entity_name": "3 6e 3", + "relation_name": "", + "weight": 8.0, + "description": "figure 6 contains the density value 3 6e 3 as an example of abbreviated notation", + "source_ids": [ + 174 + ], + "source": "Name: figure 6\nType: IMAGE", + "target": "Name: 3 6e 3\nType: MEASUREMENT" + }, + { + "src_entity_name": "figure 6", + "tgt_entity_name": "graph statistics", + "relation_name": "", + "weight": 10.0, + "description": "figure 6 is a comparison of graph statistics", + "source_ids": [ + 174 + ], + "source": "Name: figure 6\nType: IMAGE", + "target": "Name: graph statistics\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "figure 6", + 
"tgt_entity_name": "absolute values", + "relation_name": "", + "weight": 8.0, + "description": "figure 6 includes annotations of absolute values for the basic setting", + "source_ids": [ + 174 + ], + "source": "Name: figure 6\nType: IMAGE", + "target": "Name: absolute values\nType: MEASUREMENT" + }, + { + "src_entity_name": "figure 6", + "tgt_entity_name": "density values", + "relation_name": "", + "weight": 9.0, + "description": "figure 6 illustrates how density values are abbreviated using 3 6e 3 as an example", + "source_ids": [ + 174 + ], + "source": "Name: figure 6\nType: IMAGE", + "target": "Name: density values\nType: MEASUREMENT" + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "figure 6", + "relation_name": "", + "weight": 8.0, + "description": "figure 6 presents the results of the comparison involving gradient based entity resolution", + "source_ids": [ + 176 + ], + "source": "Name: figure 6\nType: IMAGE", + "target": "Name: gradient based entity resolution\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "basic setting", + "tgt_entity_name": "absolute values", + "relation_name": "", + "weight": 7.0, + "description": "absolute values are specifically annotated for the basic setting", + "source_ids": [ + 174 + ], + "source": "Name: basic setting\nType: TASK_OR_PROBLEM", + "target": "Name: absolute values\nType: MEASUREMENT" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "basic", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to Basic", + "source_ids": [ + 175 + ], + "source": "Name: cref='#/texts/224'\nType: IMAGE", + "target": "Name: basic\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "gradient-based er", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to Gradient-based ER", + "source_ids": [ + 175 + ], + "source": "Name: 
cref='#/texts/224'\nType: IMAGE", + "target": "Name: gradient-based er\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "ratio", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to Ratio", + "source_ids": [ + 175 + ], + "source": "Name: cref='#/texts/224'\nType: IMAGE", + "target": "Name: ratio\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "# entity", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to # Entity", + "source_ids": [ + 175 + ], + "source": "Name: cref='#/texts/224'\nType: IMAGE", + "target": "Name: # entity\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "density", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to Density", + "source_ids": [ + 175 + ], + "source": "Name: cref='#/texts/224'\nType: IMAGE", + "target": "Name: density\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "diameter", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to Diameter", + "source_ids": [ + 175 + ], + "source": "Name: cref='#/texts/224'\nType: IMAGE", + "target": "Name: diameter\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "# cc", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to # CC", + "source_ids": [ + 175 + ], + "source": "Name: cref='#/texts/224'\nType: IMAGE", + "target": "Name: # cc\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "figure (a)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to Figure (a)", + "source_ids": [ + 175 + ], + "source": 
"Name: cref='#/texts/224'\nType: IMAGE", + "target": "Name: figure (a)\nType: SECTION_TITLE" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "figure (b)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to Figure (b)", + "source_ids": [ + 175 + ], + "source": "Name: cref='#/texts/224'\nType: IMAGE", + "target": "Name: figure (b)\nType: SECTION_TITLE" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "1327", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to 1327", + "source_ids": [ + 175 + ], + "source": "Name: cref='#/texts/224'\nType: IMAGE", + "target": "Name: 1327\nType: MEASUREMENT" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "3.6e-3", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to 3.6E-3", + "source_ids": [ + 175 + ], + "source": "Name: cref='#/texts/224'\nType: IMAGE", + "target": "Name: 3.6e-3\nType: MEASUREMENT" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "14.8", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to 14.8", + "source_ids": [ + 175 + ], + "source": "Name: cref='#/texts/224'\nType: IMAGE", + "target": "Name: 14.8\nType: MEASUREMENT" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "169", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to 169", + "source_ids": [ + 175 + ], + "source": "Name: cref='#/texts/224'\nType: IMAGE", + "target": "Name: 169\nType: MEASUREMENT" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "531", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to 531", + "source_ids": [ + 175 + ], + "source": "Name: cref='#/texts/224'\nType: IMAGE", + "target": "Name: 531\nType: 
MEASUREMENT" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "5.4e-3", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to 5.4e-3", + "source_ids": [ + 175 + ], + "source": "Name: cref='#/texts/224'\nType: IMAGE", + "target": "Name: 5.4e-3\nType: MEASUREMENT" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "15.0", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to 15.0", + "source_ids": [ + 175 + ], + "source": "Name: cref='#/texts/224'\nType: IMAGE", + "target": "Name: 15.0\nType: MEASUREMENT" + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "106", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to 106", + "source_ids": [ + 175 + ], + "source": "Name: cref='#/texts/224'\nType: IMAGE", + "target": "Name: 106\nType: MEASUREMENT" + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "basic kg construction", + "relation_name": "", + "weight": 9.0, + "description": "gradient based entity resolution is compared against basic kg construction to evaluate quality", + "source_ids": [ + 176 + ], + "source": "Name: gradient based entity resolution\nType: TASK_OR_PROBLEM", + "target": "Name: basic kg construction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "entity count", + "relation_name": "", + "weight": 9.0, + "description": "gradient based entity resolution reduces the entity count by 12 compared to the baseline", + "source_ids": [ + 176 + ], + "source": "Name: gradient based entity resolution\nType: TASK_OR_PROBLEM", + "target": "Name: entity count\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "density", + "relation_name": "", + "weight": 9.0, + "description": "gradient based entity 
resolution boosts graph density by over 20 across datasets", + "source_ids": [ + 176 + ], + "source": "Name: gradient based entity resolution\nType: TASK_OR_PROBLEM", + "target": "Name: density\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "diameter of the largest connected component", + "relation_name": "", + "weight": 8.0, + "description": "gradient based entity resolution reduces the diameter of the largest connected component indicating a more compact graph", + "source_ids": [ + 176 + ], + "source": "Name: gradient based entity resolution\nType: TASK_OR_PROBLEM", + "target": "Name: diameter of the largest connected component\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "number of connected components", + "relation_name": "", + "weight": 8.0, + "description": "gradient based entity resolution reduces the number of connected components mitigating fragmentation", + "source_ids": [ + 176 + ], + "source": "Name: gradient based entity resolution\nType: TASK_OR_PROBLEM", + "target": "Name: number of connected components\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "er module", + "relation_name": "", + "weight": 10.0, + "description": "the er module is the specific component of gradient based entity resolution that identifies entities", + "source_ids": [ + 176 + ], + "source": "Name: gradient based entity resolution\nType: TASK_OR_PROBLEM", + "target": "Name: er module\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "basic baseline", + "relation_name": "", + "weight": 9.0, + "description": "gradient based entity resolution is evaluated against the basic baseline to demonstrate optimization", + "source_ids": [ + 176 + ], + "source": "Name: gradient based entity resolution\nType: TASK_OR_PROBLEM", + 
"target": "Name: basic baseline\nType: BENCHMARK" + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "12", + "relation_name": "", + "weight": 10.0, + "description": "the gradient based er method achieves a 12 reduction in entity count", + "source_ids": [ + 176 + ], + "source": "Name: gradient based entity resolution\nType: TASK_OR_PROBLEM", + "target": "Name: 12\nType: PERCENTAGE" + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "graph reasoning", + "relation_name": "", + "weight": 9.0, + "description": "the structural improvements from gradient based entity resolution facilitate better graph reasoning", + "source_ids": [ + 176 + ], + "source": "Name: gradient based entity resolution\nType: TASK_OR_PROBLEM", + "target": "Name: graph reasoning\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "basic kg construction", + "tgt_entity_name": "many graph based methods", + "relation_name": "", + "weight": 9.0, + "description": "basic kg construction is standard practice in many graph based methods", + "source_ids": [ + 176 + ], + "source": "Name: basic kg construction\nType: TASK_OR_PROBLEM", + "target": "Name: many graph based methods\nType: ORGANIZATION" + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "multi hop", + "relation_name": "", + "weight": 9.0, + "description": "figure 7 displays the performance breakdown for the multi hop query type", + "source_ids": [ + 177 + ], + "source": "Name: figure 7\nType: IMAGE", + "target": "Name: multi hop\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "global", + "relation_name": "", + "weight": 9.0, + "description": "figure 7 displays the performance breakdown for the global query type", + "source_ids": [ + 177 + ], + "source": "Name: figure 7\nType: IMAGE", + "target": "Name: global\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "blue bars", + "relation_name": 
"", + "weight": 8.0, + "description": "figure 7 contains blue bars to represent specific metrics", + "source_ids": [ + 177 + ], + "source": "Name: figure 7\nType: IMAGE", + "target": "Name: blue bars\nType: IMAGE" + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "red bars", + "relation_name": "", + "weight": 8.0, + "description": "figure 7 contains red bars to represent specific metrics", + "source_ids": [ + 177 + ], + "source": "Name: figure 7\nType: IMAGE", + "target": "Name: red bars\nType: IMAGE" + }, + { + "src_entity_name": "multi hop", + "tgt_entity_name": "global", + "relation_name": "", + "weight": 5.0, + "description": "both are listed as distinct query types in the performance breakdown", + "source_ids": [ + 177 + ], + "source": "Name: multi hop\nType: TASK_OR_PROBLEM", + "target": "Name: global\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "cref='#/texts/259'", + "tgt_entity_name": "global", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/259' related to Global", + "source_ids": [ + 178 + ], + "source": "Name: global\nType: TASK_OR_PROBLEM", + "target": "Name: cref='#/texts/259'\nType: IMAGE" + }, + { + "src_entity_name": "expert query analyzer", + "tgt_entity_name": "global", + "relation_name": "", + "weight": 9.0, + "description": "the expert query analyzer classifies questions into the global category", + "source_ids": [ + 241 + ], + "source": "Name: global\nType: TASK_OR_PROBLEM", + "target": "Name: expert query analyzer\nType: PERSON" + }, + { + "src_entity_name": "global", + "tgt_entity_name": "aggregation operation", + "relation_name": "", + "weight": 8.0, + "description": "the global task requires an aggregation operation such as counting listing or summarizing", + "source_ids": [ + 250 + ], + "source": "Name: global\nType: TASK_OR_PROBLEM", + "target": "Name: aggregation operation\nType: UNKNOWN" + }, + { + "src_entity_name": "global", + "tgt_entity_name": "counting", + "relation_name": 
"", + "weight": 9.0, + "description": "the global task includes counting as a possible aggregation operation", + "source_ids": [ + 250 + ], + "source": "Name: global\nType: TASK_OR_PROBLEM", + "target": "Name: counting\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "global", + "tgt_entity_name": "listing", + "relation_name": "", + "weight": 9.0, + "description": "the global task includes listing as a possible aggregation operation", + "source_ids": [ + 250 + ], + "source": "Name: global\nType: TASK_OR_PROBLEM", + "target": "Name: listing\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "global", + "tgt_entity_name": "summarizing", + "relation_name": "", + "weight": 9.0, + "description": "the global task includes summarizing as a possible aggregation operation", + "source_ids": [ + 250 + ], + "source": "Name: global\nType: TASK_OR_PROBLEM", + "target": "Name: summarizing\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "global", + "tgt_entity_name": "structural filter", + "relation_name": "", + "weight": 9.0, + "description": "the global task identifies items using a clear structural filter", + "source_ids": [ + 250 + ], + "source": "Name: global\nType: TASK_OR_PROBLEM", + "target": "Name: structural filter\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "cref='#/texts/259'", + "tgt_entity_name": "em / accuracy", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/259' related to EM / Accuracy", + "source_ids": [ + 178 + ], + "source": "Name: cref='#/texts/259'\nType: IMAGE", + "target": "Name: em / accuracy\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "cref='#/texts/259'", + "tgt_entity_name": "f1-score", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/259' related to F1-score", + "source_ids": [ + 178 + ], + "source": "Name: cref='#/texts/259'\nType: IMAGE", + "target": "Name: f1-score\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": 
"cref='#/texts/259'", + "tgt_entity_name": "single", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/259' related to Single", + "source_ids": [ + 178 + ], + "source": "Name: cref='#/texts/259'\nType: IMAGE", + "target": "Name: single\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "cref='#/texts/259'", + "tgt_entity_name": "multi", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/259' related to Multi", + "source_ids": [ + 178 + ], + "source": "Name: cref='#/texts/259'\nType: IMAGE", + "target": "Name: multi\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "cref='#/texts/259'", + "tgt_entity_name": "(a) mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/259' related to (a) MMLongBench", + "source_ids": [ + 178 + ], + "source": "Name: cref='#/texts/259'\nType: IMAGE", + "target": "Name: (a) mmlongbench\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "cref='#/texts/259'", + "tgt_entity_name": "(b) qasper", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/259' related to (b) Qasper", + "source_ids": [ + 178 + ], + "source": "Name: cref='#/texts/259'\nType: IMAGE", + "target": "Name: (b) qasper\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "(a) mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to (a) MMLongBench", + "source_ids": [ + 184 + ], + "source": "Name: (a) mmlongbench\nType: DATASET_OR_CORPUS", + "target": "Name: cref='#/texts/348'\nType: IMAGE" + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "(b) qasper", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to (b) Qasper", + "source_ids": [ + 184 + ], + "source": "Name: (b) qasper\nType: DATASET_OR_CORPUS", + "target": "Name: cref='#/texts/348'\nType: 
IMAGE" + }, + { + "src_entity_name": "multihop", + "tgt_entity_name": "agent based planning strategy", + "relation_name": "", + "weight": 8.0, + "description": "the agent based planning strategy is validated by its ability to handle multihop queries", + "source_ids": [ + 179 + ], + "source": "Name: multihop\nType: TASK_OR_PROBLEM", + "target": "Name: agent based planning strategy\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "multihop", + "tgt_entity_name": "disjoint pieces of evidence", + "relation_name": "", + "weight": 9.0, + "description": "multihop queries are challenging because they require retrieving and reasoning over disjoint pieces of evidence", + "source_ids": [ + 179 + ], + "source": "Name: multihop\nType: TASK_OR_PROBLEM", + "target": "Name: disjoint pieces of evidence\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "retrieving", + "tgt_entity_name": "disjoint pieces of evidence", + "relation_name": "", + "weight": 8.0, + "description": "retrieving is the action performed on disjoint pieces of evidence", + "source_ids": [ + 179 + ], + "source": "Name: disjoint pieces of evidence\nType: DATASET_OR_CORPUS", + "target": "Name: retrieving\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "error response analysis", + "tgt_entity_name": "figure 9", + "relation_name": "", + "weight": 9.0, + "description": "the analysis traces error propagation as shown in figure 9", + "source_ids": [ + 180 + ], + "source": "Name: figure 9\nType: IMAGE", + "target": "Name: error response analysis\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "figure 9", + "tgt_entity_name": "error analysis", + "relation_name": "", + "weight": 10.0, + "description": "figure 9 displays the results of the error analysis", + "source_ids": [ + 183 + ], + "source": "Name: figure 9\nType: IMAGE", + "target": "Name: error analysis\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "error response analysis", + "tgt_entity_name": "200 sampled queries", + 
"relation_name": "", + "weight": 9.0, + "description": "the analysis is conducted on 200 sampled queries from each dataset", + "source_ids": [ + 180 + ], + "source": "Name: error response analysis\nType: TASK_OR_PROBLEM", + "target": "Name: 200 sampled queries\nType: MEASUREMENT" + }, + { + "src_entity_name": "error response analysis", + "tgt_entity_name": "four types", + "relation_name": "", + "weight": 8.0, + "description": "the analysis categorizes failures into four types", + "source_ids": [ + 180 + ], + "source": "Name: error response analysis\nType: TASK_OR_PROBLEM", + "target": "Name: four types\nType: MEASUREMENT" + }, + { + "src_entity_name": "figure 8", + "tgt_entity_name": "case study", + "relation_name": "", + "weight": 10.0, + "description": "figure 8 presents the case study", + "source_ids": [ + 181 + ], + "source": "Name: figure 8\nType: IMAGE", + "target": "Name: case study\nType: EVENT" + }, + { + "src_entity_name": "gray text", + "tgt_entity_name": "internal process", + "relation_name": "", + "weight": 10.0, + "description": "gray text describes the internal process", + "source_ids": [ + 181 + ], + "source": "Name: gray text\nType: COLOR", + "target": "Name: internal process\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "bookrag response of different query types", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to BookRAG response of different query types", + "source_ids": [ + 182 + ], + "source": "Name: bookrag response of different query types\nType: IMAGE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "single-hop case from qasper", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Single-hop Case from Qasper", + "source_ids": [ + 182 + ], + "source": "Name: single-hop case from 
qasper\nType: SECTION_TITLE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "select_by_entity operator", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Select_by_Entity operator", + "source_ids": [ + 182 + ], + "source": "Name: select_by_entity operator\nType: SOFTWARE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "graph_reasoning", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Graph_Reasoning", + "source_ids": [ + 182 + ], + "source": "Name: graph_reasoning\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "text_reasoning", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Text_Reasoning", + "source_ids": [ + 182 + ], + "source": "Name: text_reasoning\nType: TASK_OR_PROBLEM", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "skyline_ranker", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Skyline_Ranker", + "source_ids": [ + 182 + ], + "source": "Name: skyline_ranker\nType: SOFTWARE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "binary reward system", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to binary reward system", + "source_ids": [ + 182 + ], + "source": "Name: binary reward system\nType: TECHNOLOGY", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": 
"image cref='#/texts/282'", + "tgt_entity_name": "discount factor", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to discount factor", + "source_ids": [ + 182 + ], + "source": "Name: discount factor\nType: PARAMETER_OR_VARIABLE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "multi-hop case from qasper", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Multi-hop Case from Qasper", + "source_ids": [ + 182 + ], + "source": "Name: multi-hop case from qasper\nType: SECTION_TITLE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "interpretable system", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Interpretable system", + "source_ids": [ + 182 + ], + "source": "Name: interpretable system\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "lstm with elmo system", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to LSTM with ELMo system", + "source_ids": [ + 182 + ], + "source": "Name: lstm with elmo system\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "lstm-elmo net", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to LSTM-ELMo net", + "source_ids": [ + 182 + ], + "source": "Name: lstm-elmo net\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": 
"diacritic swapping", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Diacritic swapping", + "source_ids": [ + 182 + ], + "source": "Name: diacritic swapping\nType: METHOD_OR_TECHNIQUE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "cross-entropy", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to cross-entropy", + "source_ids": [ + 182 + ], + "source": "Name: cross-entropy\nType: EVALUATION_METRIC", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "decompose operator", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Decompose operator", + "source_ids": [ + 182 + ], + "source": "Name: decompose operator\nType: SOFTWARE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "global aggregation case from mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Global Aggregation Case from MMLongBench", + "source_ids": [ + 182 + ], + "source": "Name: global aggregation case from mmlongbench\nType: SECTION_TITLE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "filter operators", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Filter operators", + "source_ids": [ + 182 + ], + "source": "Name: filter operators\nType: SOFTWARE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "filter_range", + "relation_name": "", + "weight": 9.0, + 
"description": "Image entity Image cref='#/texts/282' related to Filter_Range", + "source_ids": [ + 182 + ], + "source": "Name: filter_range\nType: SOFTWARE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "filter_modal", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Filter_Modal", + "source_ids": [ + 182 + ], + "source": "Name: filter_modal\nType: SOFTWARE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "reduce", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Reduce", + "source_ids": [ + 182 + ], + "source": "Name: reduce\nType: SOFTWARE", + "target": "Name: image cref='#/texts/282'\nType: UNKNOWN" + }, + { + "src_entity_name": "error analysis", + "tgt_entity_name": "200", + "relation_name": "", + "weight": 9.0, + "description": "the error analysis was conducted on 200 sampled queries", + "source_ids": [ + 183 + ], + "source": "Name: 200\nType: MEASUREMENT", + "target": "Name: error analysis\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "all queries (200)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to All Queries (200)", + "source_ids": [ + 184 + ], + "source": "Name: cref='#/texts/348'\nType: IMAGE", + "target": "Name: all queries (200)\nType: MEASUREMENT" + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "successful parsing (194)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Successful Parsing (194)", + "source_ids": [ + 184 + ], + "source": "Name: cref='#/texts/348'\nType: IMAGE", + "target": "Name: successful parsing (194)\nType: MEASUREMENT" + }, + { + "src_entity_name": 
"cref='#/texts/348'", + "tgt_entity_name": "retrieval error (52)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Retrieval Error (52)", + "source_ids": [ + 184 + ], + "source": "Name: cref='#/texts/348'\nType: IMAGE", + "target": "Name: retrieval error (52)\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "generation error (36)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Generation Error (36)", + "source_ids": [ + 184 + ], + "source": "Name: cref='#/texts/348'\nType: IMAGE", + "target": "Name: generation error (36)\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "plan error (27)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Plan Error (27)", + "source_ids": [ + 184 + ], + "source": "Name: cref='#/texts/348'\nType: IMAGE", + "target": "Name: plan error (27)\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "parsing error (6)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Parsing Error (6)", + "source_ids": [ + 184 + ], + "source": "Name: cref='#/texts/348'\nType: IMAGE", + "target": "Name: parsing error (6)\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "correct (79)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Correct (79)", + "source_ids": [ + 184 + ], + "source": "Name: cref='#/texts/348'\nType: IMAGE", + "target": "Name: correct (79)\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "successful parsing (193)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Successful Parsing (193)", + 
"source_ids": [ + 184 + ], + "source": "Name: cref='#/texts/348'\nType: IMAGE", + "target": "Name: successful parsing (193)\nType: MEASUREMENT" + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "generation error (30)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Generation Error (30)", + "source_ids": [ + 184 + ], + "source": "Name: cref='#/texts/348'\nType: IMAGE", + "target": "Name: generation error (30)\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "retrieval error (26)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Retrieval Error (26)", + "source_ids": [ + 184 + ], + "source": "Name: cref='#/texts/348'\nType: IMAGE", + "target": "Name: retrieval error (26)\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "plan error (20)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Plan Error (20)", + "source_ids": [ + 184 + ], + "source": "Name: cref='#/texts/348'\nType: IMAGE", + "target": "Name: plan error (20)\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "parsing error (7)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Parsing Error (7)", + "source_ids": [ + 184 + ], + "source": "Name: cref='#/texts/348'\nType: IMAGE", + "target": "Name: parsing error (7)\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "correct (117)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Correct (117)", + "source_ids": [ + 184 + ], + "source": "Name: cref='#/texts/348'\nType: IMAGE", + "target": "Name: correct (117)\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "retrieval error", + 
"tgt_entity_name": "multimodal evidence", + "relation_name": "", + "weight": 8.0, + "description": "retrieval error reflects the challenge of locating multimodal evidence", + "source_ids": [ + 185 + ], + "source": "Name: retrieval error\nType: TASK_OR_PROBLEM", + "target": "Name: multimodal evidence\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "results", + "tgt_entity_name": "retrieval error", + "relation_name": "", + "weight": 10.0, + "description": "the results identify retrieval error as the dominant failure mode", + "source_ids": [ + 185 + ], + "source": "Name: retrieval error\nType: TASK_OR_PROBLEM", + "target": "Name: results\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "retrieval error", + "tgt_entity_name": "generation error", + "relation_name": "", + "weight": 7.0, + "description": "retrieval error is the dominant failure mode followed by generation error", + "source_ids": [ + 185 + ], + "source": "Name: retrieval error\nType: TASK_OR_PROBLEM", + "target": "Name: generation error\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "generation error", + "tgt_entity_name": "multimodal evidence", + "relation_name": "", + "weight": 8.0, + "description": "generation error reflects the challenge of synthesizing multimodal evidence", + "source_ids": [ + 185 + ], + "source": "Name: generation error\nType: TASK_OR_PROBLEM", + "target": "Name: multimodal evidence\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "results", + "tgt_entity_name": "generation error", + "relation_name": "", + "weight": 10.0, + "description": "the results identify generation error as the second most common failure mode", + "source_ids": [ + 185 + ], + "source": "Name: generation error\nType: TASK_OR_PROBLEM", + "target": "Name: results\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "plan error", + "tgt_entity_name": "single hop queries", + "relation_name": "", + "weight": 9.0, + "description": "plan error involves the over decomposition of single hop 
queries", + "source_ids": [ + 185 + ], + "source": "Name: plan error\nType: TASK_OR_PROBLEM", + "target": "Name: single hop queries\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "plan error", + "tgt_entity_name": "multi hop sub tasks", + "relation_name": "", + "weight": 9.0, + "description": "plan error leads to the creation of unnecessary multi hop sub tasks", + "source_ids": [ + 185 + ], + "source": "Name: plan error\nType: TASK_OR_PROBLEM", + "target": "Name: multi hop sub tasks\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "plan error", + "tgt_entity_name": "disjointed retrieval paths", + "relation_name": "", + "weight": 9.0, + "description": "plan error causes fragmentation leading to disjointed retrieval paths", + "source_ids": [ + 185 + ], + "source": "Name: plan error\nType: TASK_OR_PROBLEM", + "target": "Name: disjointed retrieval paths\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "planner", + "tgt_entity_name": "plan error", + "relation_name": "", + "weight": 10.0, + "description": "the planner is the agent responsible for the plan error failure pattern", + "source_ids": [ + 185 + ], + "source": "Name: plan error\nType: TASK_OR_PROBLEM", + "target": "Name: planner\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "qualitative analysis", + "tgt_entity_name": "plan error", + "relation_name": "", + "weight": 9.0, + "description": "qualitative analysis reveals the specific failure pattern of plan error", + "source_ids": [ + 185 + ], + "source": "Name: plan error\nType: TASK_OR_PROBLEM", + "target": "Name: qualitative analysis\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "planner", + "tgt_entity_name": "single hop queries", + "relation_name": "", + "weight": 9.0, + "description": "the planner acts upon single hop queries by over decomposing them", + "source_ids": [ + 185 + ], + "source": "Name: single hop queries\nType: TASK_OR_PROBLEM", + "target": "Name: planner\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": 
"disjointed retrieval paths", + "tgt_entity_name": "cohesive final answer", + "relation_name": "", + "weight": 9.0, + "description": "disjointed retrieval paths prevent the model from synthesizing a cohesive final answer", + "source_ids": [ + 185 + ], + "source": "Name: disjointed retrieval paths\nType: TASK_OR_PROBLEM", + "target": "Name: cohesive final answer\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "fragmentation", + "tgt_entity_name": "disjointed retrieval paths", + "relation_name": "", + "weight": 9.0, + "description": "fragmentation leads directly to disjointed retrieval paths", + "source_ids": [ + 185 + ], + "source": "Name: disjointed retrieval paths\nType: TASK_OR_PROBLEM", + "target": "Name: fragmentation\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "model", + "tgt_entity_name": "cohesive final answer", + "relation_name": "", + "weight": 9.0, + "description": "the model attempts to synthesize a cohesive final answer but is prevented from doing so", + "source_ids": [ + 185 + ], + "source": "Name: cohesive final answer\nType: TASK_OR_PROBLEM", + "target": "Name: model\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "model", + "tgt_entity_name": "scattered sub responses", + "relation_name": "", + "weight": 8.0, + "description": "the model receives scattered sub responses which it fails to synthesize", + "source_ids": [ + 185 + ], + "source": "Name: model\nType: TASK_OR_PROBLEM", + "target": "Name: scattered sub responses\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "book index", + "tgt_entity_name": "tree graph index", + "relation_name": "", + "weight": 9.0, + "description": "book index is specifically a structured tree graph index", + "source_ids": [ + 188 + ], + "source": "Name: book index\nType: PRODUCT", + "target": "Name: tree graph index\nType: TECHNOLOGY" + }, + { + "src_entity_name": "agent based method", + "tgt_entity_name": "retrieval precision", + "relation_name": "", + "weight": 6.0, + "description": "the 
agent based method is used to configure operators that affect retrieval precision", + "source_ids": [ + 188 + ], + "source": "Name: agent based method\nType: METHOD_OR_TECHNIQUE", + "target": "Name: retrieval precision\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "agent based method", + "tgt_entity_name": "answer accuracy", + "relation_name": "", + "weight": 6.0, + "description": "the agent based method is used to configure operators that affect answer accuracy", + "source_ids": [ + 188 + ], + "source": "Name: agent based method\nType: METHOD_OR_TECHNIQUE", + "target": "Name: answer accuracy\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "agent based method", + "tgt_entity_name": "retrieval operators", + "relation_name": "", + "weight": 9.0, + "description": "the agent based method dynamically configures retrieval operators", + "source_ids": [ + 188 + ], + "source": "Name: agent based method\nType: METHOD_OR_TECHNIQUE", + "target": "Name: retrieval operators\nType: SOFTWARE" + }, + { + "src_entity_name": "agent based method", + "tgt_entity_name": "reasoning operators", + "relation_name": "", + "weight": 9.0, + "description": "the agent based method dynamically configures reasoning operators", + "source_ids": [ + 188 + ], + "source": "Name: agent based method\nType: METHOD_OR_TECHNIQUE", + "target": "Name: reasoning operators\nType: SOFTWARE" + }, + { + "src_entity_name": "document native database system", + "tgt_entity_name": "data formatting", + "relation_name": "", + "weight": 8.0, + "description": "the future database system supports data formatting", + "source_ids": [ + 188 + ], + "source": "Name: document native database system\nType: PRODUCT", + "target": "Name: data formatting\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "document native database system", + "tgt_entity_name": "knowledge extraction", + "relation_name": "", + "weight": 8.0, + "description": "the future database system supports knowledge extraction", + "source_ids": 
[ + 188 + ], + "source": "Name: document native database system\nType: PRODUCT", + "target": "Name: knowledge extraction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "document native database system", + "tgt_entity_name": "intelligent querying", + "relation_name": "", + "weight": 8.0, + "description": "the future database system supports intelligent querying", + "source_ids": [ + 188 + ], + "source": "Name: document native database system\nType: PRODUCT", + "target": "Name: intelligent querying\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "brandon yang", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: simran arora\nType: PERSON", + "target": "Name: brandon yang\nType: PERSON" + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "sabri eyuboglu", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: simran arora\nType: PERSON", + "target": "Name: sabri eyuboglu\nType: PERSON" + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "avanika narayan", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: simran arora\nType: PERSON", + "target": "Name: avanika narayan\nType: PERSON" + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "andrew hojel", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: simran arora\nType: PERSON", + "target": "Name: andrew hojel\nType: PERSON" + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "immanuel trummer", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": 
"Name: simran arora\nType: PERSON", + "target": "Name: immanuel trummer\nType: PERSON" + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "christopher r", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: simran arora\nType: PERSON", + "target": "Name: christopher r\nType: PERSON" + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "2023", + "relation_name": "", + "weight": 8.0, + "description": "simran arora is an author of a paper published in 2023", + "source_ids": [ + 191 + ], + "source": "Name: simran arora\nType: PERSON", + "target": "Name: 2023\nType: DATE" + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "simple systems", + "relation_name": "", + "weight": 8.0, + "description": "simran arora is an author of the paper describing simple systems", + "source_ids": [ + 191 + ], + "source": "Name: simran arora\nType: PERSON", + "target": "Name: simple systems\nType: PRODUCT" + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "structured views", + "relation_name": "", + "weight": 8.0, + "description": "simran arora is an author of the paper describing structured views", + "source_ids": [ + 191 + ], + "source": "Name: simran arora\nType: PERSON", + "target": "Name: structured views\nType: PRODUCT" + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "heterogeneous data lakes", + "relation_name": "", + "weight": 8.0, + "description": "simran arora is an author of the paper describing heterogeneous data lakes", + "source_ids": [ + 191 + ], + "source": "Name: simran arora\nType: PERSON", + "target": "Name: heterogeneous data lakes\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "brandon yang", + "tgt_entity_name": "sabri eyuboglu", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: brandon 
yang\nType: PERSON", + "target": "Name: sabri eyuboglu\nType: PERSON" + }, + { + "src_entity_name": "brandon yang", + "tgt_entity_name": "avanika narayan", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: brandon yang\nType: PERSON", + "target": "Name: avanika narayan\nType: PERSON" + }, + { + "src_entity_name": "brandon yang", + "tgt_entity_name": "andrew hojel", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: brandon yang\nType: PERSON", + "target": "Name: andrew hojel\nType: PERSON" + }, + { + "src_entity_name": "brandon yang", + "tgt_entity_name": "immanuel trummer", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: brandon yang\nType: PERSON", + "target": "Name: immanuel trummer\nType: PERSON" + }, + { + "src_entity_name": "brandon yang", + "tgt_entity_name": "christopher r", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: brandon yang\nType: PERSON", + "target": "Name: christopher r\nType: PERSON" + }, + { + "src_entity_name": "sabri eyuboglu", + "tgt_entity_name": "avanika narayan", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: sabri eyuboglu\nType: PERSON", + "target": "Name: avanika narayan\nType: PERSON" + }, + { + "src_entity_name": "sabri eyuboglu", + "tgt_entity_name": "andrew hojel", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: sabri eyuboglu\nType: PERSON", + "target": "Name: andrew hojel\nType: PERSON" + }, + { + "src_entity_name": "sabri eyuboglu", + 
"tgt_entity_name": "immanuel trummer", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: sabri eyuboglu\nType: PERSON", + "target": "Name: immanuel trummer\nType: PERSON" + }, + { + "src_entity_name": "sabri eyuboglu", + "tgt_entity_name": "christopher r", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: sabri eyuboglu\nType: PERSON", + "target": "Name: christopher r\nType: PERSON" + }, + { + "src_entity_name": "avanika narayan", + "tgt_entity_name": "andrew hojel", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: avanika narayan\nType: PERSON", + "target": "Name: andrew hojel\nType: PERSON" + }, + { + "src_entity_name": "avanika narayan", + "tgt_entity_name": "immanuel trummer", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: avanika narayan\nType: PERSON", + "target": "Name: immanuel trummer\nType: PERSON" + }, + { + "src_entity_name": "avanika narayan", + "tgt_entity_name": "christopher r", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: avanika narayan\nType: PERSON", + "target": "Name: christopher r\nType: PERSON" + }, + { + "src_entity_name": "andrew hojel", + "tgt_entity_name": "immanuel trummer", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: andrew hojel\nType: PERSON", + "target": "Name: immanuel trummer\nType: PERSON" + }, + { + "src_entity_name": "andrew hojel", + "tgt_entity_name": "christopher r", + "relation_name": "", + "weight": 9.0, + "description": "listed as co 
authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: andrew hojel\nType: PERSON", + "target": "Name: christopher r\nType: PERSON" + }, + { + "src_entity_name": "immanuel trummer", + "tgt_entity_name": "christopher r", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ], + "source": "Name: immanuel trummer\nType: PERSON", + "target": "Name: christopher r\nType: PERSON" + }, + { + "src_entity_name": "language models", + "tgt_entity_name": "heterogeneous data lakes", + "relation_name": "", + "weight": 10.0, + "description": "language models enable the generation of structured views of heterogeneous data lakes", + "source_ids": [ + 191 + ], + "source": "Name: language models\nType: TECHNOLOGY", + "target": "Name: heterogeneous data lakes\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "language models", + "tgt_entity_name": "simple systems", + "relation_name": "", + "weight": 10.0, + "description": "language models enable the creation of simple systems", + "source_ids": [ + 191 + ], + "source": "Name: language models\nType: TECHNOLOGY", + "target": "Name: simple systems\nType: PRODUCT" + }, + { + "src_entity_name": "simple systems", + "tgt_entity_name": "heterogeneous data lakes", + "relation_name": "", + "weight": 9.0, + "description": "simple systems generate views of heterogeneous data lakes", + "source_ids": [ + 191 + ], + "source": "Name: heterogeneous data lakes\nType: DATASET_OR_CORPUS", + "target": "Name: simple systems\nType: PRODUCT" + }, + { + "src_entity_name": "simple systems", + "tgt_entity_name": "structured views", + "relation_name": "", + "weight": 9.0, + "description": "simple systems are used for generating structured views", + "source_ids": [ + 191 + ], + "source": "Name: simple systems\nType: PRODUCT", + "target": "Name: structured views\nType: PRODUCT" + }, + { + "src_entity_name": "self rag", + "tgt_entity_name": "2023", + "relation_name": 
"", + "weight": 10.0, + "description": "the self rag paper was published in the year 2023", + "source_ids": [ + 193 + ], + "source": "Name: 2023\nType: DATE", + "target": "Name: self rag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "haipipe", + "tgt_entity_name": "2023", + "relation_name": "", + "weight": 9.0, + "description": "haipipe was published in the year 2023", + "source_ids": [ + 200 + ], + "source": "Name: 2023\nType: DATE", + "target": "Name: haipipe\nType: PRODUCT" + }, + { + "src_entity_name": "xavier daull", + "tgt_entity_name": "2023", + "relation_name": "", + "weight": 8.0, + "description": "xavier daull is an author of the work published in 2023", + "source_ids": [ + 205 + ], + "source": "Name: 2023\nType: DATE", + "target": "Name: xavier daull\nType: PERSON" + }, + { + "src_entity_name": "retrieval augmented generation for large language models a survey", + "tgt_entity_name": "2023", + "relation_name": "", + "weight": 9.0, + "description": "the survey was published in the year 2023", + "source_ids": [ + 207 + ], + "source": "Name: 2023\nType: DATE", + "target": "Name: retrieval augmented generation for large language models a survey\nType: BOOK" + }, + { + "src_entity_name": "akari asai", + "tgt_entity_name": "self rag", + "relation_name": "", + "weight": 9.0, + "description": "akari asai is an author of the paper describing the self rag model", + "source_ids": [ + 193 + ], + "source": "Name: akari asai\nType: PERSON", + "target": "Name: self rag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "akari asai", + "tgt_entity_name": "international conference on learning representations", + "relation_name": "", + "weight": 8.0, + "description": "akari asai s paper was published at the international conference on learning representations", + "source_ids": [ + 192 + ], + "source": "Name: akari asai\nType: PERSON", + "target": "Name: international conference on learning representations\nType: PUBLICATION_VENUE" + }, + { + 
"src_entity_name": "akari asai", + "tgt_entity_name": "zeqiu wu", + "relation_name": "", + "weight": 8.0, + "description": "akari asai and zeqiu wu are co authors on the same paper", + "source_ids": [ + 193 + ], + "source": "Name: akari asai\nType: PERSON", + "target": "Name: zeqiu wu\nType: PERSON" + }, + { + "src_entity_name": "akari asai", + "tgt_entity_name": "yizhong wang", + "relation_name": "", + "weight": 8.0, + "description": "akari asai and yizhong wang are co authors on the same paper", + "source_ids": [ + 193 + ], + "source": "Name: akari asai\nType: PERSON", + "target": "Name: yizhong wang\nType: PERSON" + }, + { + "src_entity_name": "akari asai", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "akari asai is listed alongside other authors et al on the same paper", + "source_ids": [ + 192 + ], + "source": "Name: akari asai\nType: PERSON", + "target": "Name: et al\nType: PERSON" + }, + { + "src_entity_name": "akari asai", + "tgt_entity_name": "avirup sil", + "relation_name": "", + "weight": 8.0, + "description": "akari asai and avirup sil are co authors on the same paper", + "source_ids": [ + 193 + ], + "source": "Name: akari asai\nType: PERSON", + "target": "Name: avirup sil\nType: PERSON" + }, + { + "src_entity_name": "akari asai", + "tgt_entity_name": "hannaneh hajishirzi", + "relation_name": "", + "weight": 8.0, + "description": "akari asai and hannaneh hajishirzi are co authors on the same paper", + "source_ids": [ + 193 + ], + "source": "Name: akari asai\nType: PERSON", + "target": "Name: hannaneh hajishirzi\nType: PERSON" + }, + { + "src_entity_name": "zeqiu wu", + "tgt_entity_name": "self rag", + "relation_name": "", + "weight": 9.0, + "description": "zeqiu wu is an author of the paper describing the self rag model", + "source_ids": [ + 193 + ], + "source": "Name: zeqiu wu\nType: PERSON", + "target": "Name: self rag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "zeqiu wu", + "tgt_entity_name": 
"international conference on learning representations", + "relation_name": "", + "weight": 8.0, + "description": "zeqiu wu s paper was published at the international conference on learning representations", + "source_ids": [ + 192 + ], + "source": "Name: zeqiu wu\nType: PERSON", + "target": "Name: international conference on learning representations\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "zeqiu wu", + "tgt_entity_name": "yizhong wang", + "relation_name": "", + "weight": 8.0, + "description": "zeqiu wu and yizhong wang are co authors on the same paper", + "source_ids": [ + 193 + ], + "source": "Name: zeqiu wu\nType: PERSON", + "target": "Name: yizhong wang\nType: PERSON" + }, + { + "src_entity_name": "zeqiu wu", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "zeqiu wu is listed alongside other authors et al on the same paper", + "source_ids": [ + 192 + ], + "source": "Name: zeqiu wu\nType: PERSON", + "target": "Name: et al\nType: PERSON" + }, + { + "src_entity_name": "zeqiu wu", + "tgt_entity_name": "avirup sil", + "relation_name": "", + "weight": 8.0, + "description": "zeqiu wu and avirup sil are co authors on the same paper", + "source_ids": [ + 193 + ], + "source": "Name: zeqiu wu\nType: PERSON", + "target": "Name: avirup sil\nType: PERSON" + }, + { + "src_entity_name": "zeqiu wu", + "tgt_entity_name": "hannaneh hajishirzi", + "relation_name": "", + "weight": 8.0, + "description": "zeqiu wu and hannaneh hajishirzi are co authors on the same paper", + "source_ids": [ + 193 + ], + "source": "Name: zeqiu wu\nType: PERSON", + "target": "Name: hannaneh hajishirzi\nType: PERSON" + }, + { + "src_entity_name": "yizhong wang", + "tgt_entity_name": "self rag", + "relation_name": "", + "weight": 9.0, + "description": "yizhong wang is an author of the paper describing the self rag model", + "source_ids": [ + 193 + ], + "source": "Name: yizhong wang\nType: PERSON", + "target": "Name: self rag\nType: 
MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "yizhong wang", + "tgt_entity_name": "international conference on learning representations", + "relation_name": "", + "weight": 8.0, + "description": "yizhong wang s paper was published at the international conference on learning representations", + "source_ids": [ + 192 + ], + "source": "Name: yizhong wang\nType: PERSON", + "target": "Name: international conference on learning representations\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "yizhong wang", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "yizhong wang is listed alongside other authors et al on the same paper", + "source_ids": [ + 192 + ], + "source": "Name: yizhong wang\nType: PERSON", + "target": "Name: et al\nType: PERSON" + }, + { + "src_entity_name": "yizhong wang", + "tgt_entity_name": "avirup sil", + "relation_name": "", + "weight": 8.0, + "description": "yizhong wang and avirup sil are co authors on the same paper", + "source_ids": [ + 193 + ], + "source": "Name: yizhong wang\nType: PERSON", + "target": "Name: avirup sil\nType: PERSON" + }, + { + "src_entity_name": "yizhong wang", + "tgt_entity_name": "hannaneh hajishirzi", + "relation_name": "", + "weight": 8.0, + "description": "yizhong wang and hannaneh hajishirzi are co authors on the same paper", + "source_ids": [ + 193 + ], + "source": "Name: yizhong wang\nType: PERSON", + "target": "Name: hannaneh hajishirzi\nType: PERSON" + }, + { + "src_entity_name": "et al", + "tgt_entity_name": "self rag", + "relation_name": "", + "weight": 8.0, + "description": "et al refers to co authors of the paper describing the self rag model", + "source_ids": [ + 192 + ], + "source": "Name: self rag\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: et al\nType: PERSON" + }, + { + "src_entity_name": "self rag", + "tgt_entity_name": "international conference on learning representations", + "relation_name": "", + "weight": 10.0, + "description": "the self rag 
paper was published at the international conference on learning representations", + "source_ids": [ + 192 + ], + "source": "Name: self rag\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: international conference on learning representations\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "avirup sil", + "tgt_entity_name": "self rag", + "relation_name": "", + "weight": 9.0, + "description": "avirup sil is an author of the paper describing the self rag model", + "source_ids": [ + 193 + ], + "source": "Name: self rag\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: avirup sil\nType: PERSON" + }, + { + "src_entity_name": "hannaneh hajishirzi", + "tgt_entity_name": "self rag", + "relation_name": "", + "weight": 9.0, + "description": "hannaneh hajishirzi is an author of the paper describing the self rag model", + "source_ids": [ + 193 + ], + "source": "Name: self rag\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: hannaneh hajishirzi\nType: PERSON" + }, + { + "src_entity_name": "self rag", + "tgt_entity_name": "arxiv preprint arxiv 2310 11511", + "relation_name": "", + "weight": 10.0, + "description": "the self rag paper is identified by the arxiv preprint number arxiv 2310 11511", + "source_ids": [ + 193 + ], + "source": "Name: self rag\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: arxiv preprint arxiv 2310 11511\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "self rag", + "tgt_entity_name": "learning to retrieve generate and critique through self reflection", + "relation_name": "", + "weight": 10.0, + "description": "self rag is the model that implements the method of learning to retrieve generate and critique through self reflection", + "source_ids": [ + 193 + ], + "source": "Name: self rag\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: learning to retrieve generate and critique through self reflection\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "et al", + "tgt_entity_name": "international conference on learning 
representations", + "relation_name": "", + "weight": 7.0, + "description": "the co authors referred to as et al published their paper at the international conference on learning representations", + "source_ids": [ + 192 + ], + "source": "Name: international conference on learning representations\nType: PUBLICATION_VENUE", + "target": "Name: et al\nType: PERSON" + }, + { + "src_entity_name": "international conference on learning representations", + "tgt_entity_name": "iclr", + "relation_name": "", + "weight": 10.0, + "description": "iclr is the abbreviation used for the international conference on learning representations in the text", + "source_ids": [ + 192 + ], + "source": "Name: international conference on learning representations\nType: PUBLICATION_VENUE", + "target": "Name: iclr\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "auto formula", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "the paper about auto formula was published in 2024", + "source_ids": [ + 199 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: auto formula\nType: PRODUCT" + }, + { + "src_entity_name": "proceedings of the acm on management of data", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 10.0, + "description": "the publication year is 2024", + "source_ids": [ + 199 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: proceedings of the acm on management of data\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "m3docrag", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "m3docrag was published in the year 2024", + "source_ids": [ + 201 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: m3docrag\nType: PRODUCT" + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "darren edge is an author of a document published in 2024", + "source_ids": [ + 206 + ], + "source": "Name: 
2024\nType: DATE", + "target": "Name: darren edge\nType: PERSON" + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "ha trinh is an author of a document published in 2024", + "source_ids": [ + 206 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: ha trinh\nType: PERSON" + }, + { + "src_entity_name": "newman cheng", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "newman cheng is an author of a document published in 2024", + "source_ids": [ + 206 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: newman cheng\nType: PERSON" + }, + { + "src_entity_name": "joshua bradley", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "joshua bradley is an author of a document published in 2024", + "source_ids": [ + 206 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: joshua bradley\nType: PERSON" + }, + { + "src_entity_name": "alex chao", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "alex chao is an author of a document published in 2024", + "source_ids": [ + 206 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: alex chao\nType: PERSON" + }, + { + "src_entity_name": "apurva mody", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "apurva mody is an author of a document published in 2024", + "source_ids": [ + 206 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: apurva mody\nType: PERSON" + }, + { + "src_entity_name": "steven truitt", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "steven truitt is an author of a document published in 2024", + "source_ids": [ + 206 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: steven truitt\nType: PERSON" + }, + { + "src_entity_name": "jonathan larson", + "tgt_entity_name": "2024", + "relation_name": "", + 
"weight": 8.0, + "description": "jonathan larson is an author of a document published in 2024", + "source_ids": [ + 206 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: jonathan larson\nType: PERSON" + }, + { + "src_entity_name": "2024", + "tgt_entity_name": "arxiv 2404 16130", + "relation_name": "", + "weight": 9.0, + "description": "the preprint arxiv 2404 16130 was published in 2024", + "source_ids": [ + 206 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: arxiv 2404 16130\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "lightrag", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "lightrag was published in the year 2024", + "source_ids": [ + 208 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: lightrag\nType: PRODUCT" + }, + { + "src_entity_name": "hipporag", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 10.0, + "description": "the paper describing hipporag was published in the year 2024", + "source_ids": [ + 209 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: hipporag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "g retriever", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "g retriever was published in the year 2024", + "source_ids": [ + 211 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: g retriever\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "rag and rau a survey on retrieval augmented language model in natural language processing", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "the survey paper was published in the year 2024", + "source_ids": [ + 212 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: rag and rau a survey on retrieval augmented language model in natural language processing\nType: BOOK" + }, + { + "src_entity_name": "timo schick", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 
9.0, + "description": "timo schick is an author of the work published in 2024", + "source_ids": [ + 216 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: timo schick\nType: PERSON" + }, + { + "src_entity_name": "jane dwivedi yu", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "jane dwivedi yu is an author of the work published in 2024", + "source_ids": [ + 216 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: jane dwivedi yu\nType: PERSON" + }, + { + "src_entity_name": "roberto dess", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "roberto dess is an author of the work published in 2024", + "source_ids": [ + 216 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: roberto dess\nType: PERSON" + }, + { + "src_entity_name": "roberta raileanu", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "roberta raileanu is an author of the work published in 2024", + "source_ids": [ + 216 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: roberta raileanu\nType: PERSON" + }, + { + "src_entity_name": "maria lomeli", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "maria lomeli is an author of the work published in 2024", + "source_ids": [ + 216 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: maria lomeli\nType: PERSON" + }, + { + "src_entity_name": "eric hambro", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "eric hambro is an author of the work published in 2024", + "source_ids": [ + 216 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: eric hambro\nType: PERSON" + }, + { + "src_entity_name": "luke zettlemoyer", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "luke zettlemoyer is an author of the work published in 2024", + "source_ids": [ + 216 + ], + "source": "Name: 2024\nType: 
DATE", + "target": "Name: luke zettlemoyer\nType: PERSON" + }, + { + "src_entity_name": "nicola cancedda", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "nicola cancedda is an author of the work published in 2024", + "source_ids": [ + 216 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: nicola cancedda\nType: PERSON" + }, + { + "src_entity_name": "thomas scialom", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "thomas scialom is an author of the work published in 2024", + "source_ids": [ + 216 + ], + "source": "Name: 2024\nType: DATE", + "target": "Name: thomas scialom\nType: PERSON" + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "shuai bai is the first author listed before et al", + "source_ids": [ + 194 + ], + "source": "Name: et al\nType: PERSON", + "target": "Name: shuai bai\nType: PERSON" + }, + { + "src_entity_name": "keqin chen", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "keqin chen is listed as an author before et al", + "source_ids": [ + 194 + ], + "source": "Name: et al\nType: PERSON", + "target": "Name: keqin chen\nType: PERSON" + }, + { + "src_entity_name": "xuejing liu", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "xuejing liu is listed as an author before et al", + "source_ids": [ + 194 + ], + "source": "Name: et al\nType: PERSON", + "target": "Name: xuejing liu\nType: PERSON" + }, + { + "src_entity_name": "jialin wang", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "jialin wang is listed as an author before et al", + "source_ids": [ + 194 + ], + "source": "Name: et al\nType: PERSON", + "target": "Name: jialin wang\nType: PERSON" + }, + { + "src_entity_name": "wenbin ge", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + 
"description": "wenbin ge is listed as an author before et al", + "source_ids": [ + 194 + ], + "source": "Name: et al\nType: PERSON", + "target": "Name: wenbin ge\nType: PERSON" + }, + { + "src_entity_name": "sibo song", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "sibo song is listed as an author before et al", + "source_ids": [ + 194 + ], + "source": "Name: et al\nType: PERSON", + "target": "Name: sibo song\nType: PERSON" + }, + { + "src_entity_name": "kai dang", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "kai dang is listed as an author before et al", + "source_ids": [ + 194 + ], + "source": "Name: et al\nType: PERSON", + "target": "Name: kai dang\nType: PERSON" + }, + { + "src_entity_name": "peng wang", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "peng wang is listed as an author before et al", + "source_ids": [ + 194 + ], + "source": "Name: et al\nType: PERSON", + "target": "Name: peng wang\nType: PERSON" + }, + { + "src_entity_name": "shijie wang", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "shijie wang is listed as an author before et al", + "source_ids": [ + 194 + ], + "source": "Name: et al\nType: PERSON", + "target": "Name: shijie wang\nType: PERSON" + }, + { + "src_entity_name": "jun tang", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "jun tang is listed as an author before et al", + "source_ids": [ + 194 + ], + "source": "Name: et al\nType: PERSON", + "target": "Name: jun tang\nType: PERSON" + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 9.0, + "description": "gheorghe comanici is listed before et al indicating they are among the authors represented by the abbreviation", + "source_ids": [ + 203 + ], + "source": "Name: et al\nType: PERSON", + "target": "Name: gheorghe 
comanici\nType: PERSON" + }, + { + "src_entity_name": "soyeong jeong", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 9.0, + "description": "soyeong jeong is listed alongside et al as authors of the paper", + "source_ids": [ + 213 + ], + "source": "Name: et al\nType: PERSON", + "target": "Name: soyeong jeong\nType: PERSON" + }, + { + "src_entity_name": "jinheon baek", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 9.0, + "description": "jinheon baek is listed alongside et al as authors of the paper", + "source_ids": [ + 213 + ], + "source": "Name: et al\nType: PERSON", + "target": "Name: jinheon baek\nType: PERSON" + }, + { + "src_entity_name": "avirup sil", + "tgt_entity_name": "hannaneh hajishirzi", + "relation_name": "", + "weight": 8.0, + "description": "avirup sil and hannaneh hajishirzi are co authors on the same paper", + "source_ids": [ + 193 + ], + "source": "Name: avirup sil\nType: PERSON", + "target": "Name: hannaneh hajishirzi\nType: PERSON" + }, + { + "src_entity_name": "arxiv preprint arxiv 2310 11511", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "the preprint is hosted by the arxiv organization", + "source_ids": [ + 193 + ], + "source": "Name: arxiv preprint arxiv 2310 11511\nType: PUBLICATION_VENUE", + "target": "Name: arxiv\nType: ORGANIZATION" + }, + { + "src_entity_name": "arxiv preprint arxiv 2302 09051", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "the preprint is hosted by the arxiv organization", + "source_ids": [ + 205 + ], + "source": "Name: arxiv\nType: ORGANIZATION", + "target": "Name: arxiv preprint arxiv 2302 09051\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "arxiv preprint arxiv 2312 10997", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 10.0, + "description": "the preprint is hosted by arxiv", + "source_ids": [ + 207 + ], + "source": "Name: arxiv\nType: ORGANIZATION", + "target": 
"Name: arxiv preprint arxiv 2312 10997\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "shuai bai is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "source": "Name: shuai bai\nType: PERSON", + "target": "Name: qwen2 5 vl technical report\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "keqin chen", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and keqin chen are co authors of the same document", + "source_ids": [ + 194 + ], + "source": "Name: shuai bai\nType: PERSON", + "target": "Name: keqin chen\nType: PERSON" + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "xuejing liu", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and xuejing liu are co authors of the same document", + "source_ids": [ + 194 + ], + "source": "Name: shuai bai\nType: PERSON", + "target": "Name: xuejing liu\nType: PERSON" + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "jialin wang", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and jialin wang are co authors of the same document", + "source_ids": [ + 194 + ], + "source": "Name: shuai bai\nType: PERSON", + "target": "Name: jialin wang\nType: PERSON" + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "wenbin ge", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and wenbin ge are co authors of the same document", + "source_ids": [ + 194 + ], + "source": "Name: shuai bai\nType: PERSON", + "target": "Name: wenbin ge\nType: PERSON" + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "sibo song", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and sibo song are co authors of the same document", + "source_ids": [ + 194 + ], + "source": "Name: shuai bai\nType: PERSON", + "target": "Name: sibo 
song\nType: PERSON" + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "kai dang", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and kai dang are co authors of the same document", + "source_ids": [ + 194 + ], + "source": "Name: shuai bai\nType: PERSON", + "target": "Name: kai dang\nType: PERSON" + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "peng wang", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and peng wang are co authors of the same document", + "source_ids": [ + 194 + ], + "source": "Name: shuai bai\nType: PERSON", + "target": "Name: peng wang\nType: PERSON" + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "shijie wang", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and shijie wang are co authors of the same document", + "source_ids": [ + 194 + ], + "source": "Name: shuai bai\nType: PERSON", + "target": "Name: shijie wang\nType: PERSON" + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "jun tang", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and jun tang are co authors of the same document", + "source_ids": [ + 194 + ], + "source": "Name: shuai bai\nType: PERSON", + "target": "Name: jun tang\nType: PERSON" + }, + { + "src_entity_name": "keqin chen", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "keqin chen is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "source": "Name: keqin chen\nType: PERSON", + "target": "Name: qwen2 5 vl technical report\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "xuejing liu", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "xuejing liu is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "source": "Name: xuejing liu\nType: PERSON", + "target": "Name: qwen2 5 vl technical report\nType: 
PUBLICATION_VENUE" + }, + { + "src_entity_name": "jialin wang", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "jialin wang is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "source": "Name: jialin wang\nType: PERSON", + "target": "Name: qwen2 5 vl technical report\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "wenbin ge", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "wenbin ge is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "source": "Name: wenbin ge\nType: PERSON", + "target": "Name: qwen2 5 vl technical report\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "sibo song", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "sibo song is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "source": "Name: sibo song\nType: PERSON", + "target": "Name: qwen2 5 vl technical report\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "kai dang", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "kai dang is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "source": "Name: kai dang\nType: PERSON", + "target": "Name: qwen2 5 vl technical report\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "peng wang", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "peng wang is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "source": "Name: peng wang\nType: PERSON", + "target": "Name: qwen2 5 vl technical report\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "shijie wang", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "shijie wang is 
an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "source": "Name: shijie wang\nType: PERSON", + "target": "Name: qwen2 5 vl technical report\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "jun tang", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "jun tang is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ], + "source": "Name: jun tang\nType: PERSON", + "target": "Name: qwen2 5 vl technical report\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "qwen2 5 vl technical report", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "the qwen2 5 vl technical report was published as a preprint on arxiv", + "source_ids": [ + 194 + ], + "source": "Name: qwen2 5 vl technical report\nType: PUBLICATION_VENUE", + "target": "Name: arxiv\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "qwen2 5 vl technical report", + "tgt_entity_name": "arxiv 2502 13923", + "relation_name": "", + "weight": 9.0, + "description": "the qwen2 5 vl technical report is identified by the preprint number arxiv 2502 13923", + "source_ids": [ + 194 + ], + "source": "Name: qwen2 5 vl technical report\nType: PUBLICATION_VENUE", + "target": "Name: arxiv 2502 13923\nType: FILE_TYPE" + }, + { + "src_entity_name": "qwen2 5 vl technical report", + "tgt_entity_name": "qwen2 5 vl", + "relation_name": "", + "weight": 10.0, + "description": "the report is about the qwen2 5 vl model architecture", + "source_ids": [ + 194 + ], + "source": "Name: qwen2 5 vl technical report\nType: PUBLICATION_VENUE", + "target": "Name: qwen2 5 vl\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "arxiv", + "tgt_entity_name": "preprint", + "relation_name": "", + "weight": 8.0, + "description": "arxiv is a platform for preprints", + "source_ids": [ + 194 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: preprint\nType: FILE_TYPE" 
+ }, + { + "src_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "the survey paper was published as a preprint on arxiv", + "source_ids": [ + 195 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: survey on question answering over visually rich documents methods challenges and trends\nType: BOOK" + }, + { + "src_entity_name": "m3docrag", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "m3docrag was published as a preprint on arxiv", + "source_ids": [ + 201 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: m3docrag\nType: PRODUCT" + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici is an author of the paper published on arxiv", + "source_ids": [ + 203 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: gheorghe comanici\nType: PERSON" + }, + { + "src_entity_name": "arxiv 2507 06261", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 10.0, + "description": "arxiv 2507 06261 is the specific identifier for the paper on arxiv", + "source_ids": [ + 203 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: arxiv 2507 06261\nType: FILE_TYPE" + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "darren edge is an author of a document published in arxiv", + "source_ids": [ + 206 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: darren edge\nType: PERSON" + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "ha trinh is an author of a document published in arxiv", + "source_ids": [ + 206 + ], + "source": 
"Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: ha trinh\nType: PERSON" + }, + { + "src_entity_name": "newman cheng", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "newman cheng is an author of a document published in arxiv", + "source_ids": [ + 206 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: newman cheng\nType: PERSON" + }, + { + "src_entity_name": "joshua bradley", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "joshua bradley is an author of a document published in arxiv", + "source_ids": [ + 206 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: joshua bradley\nType: PERSON" + }, + { + "src_entity_name": "alex chao", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "alex chao is an author of a document published in arxiv", + "source_ids": [ + 206 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: alex chao\nType: PERSON" + }, + { + "src_entity_name": "apurva mody", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "apurva mody is an author of a document published in arxiv", + "source_ids": [ + 206 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: apurva mody\nType: PERSON" + }, + { + "src_entity_name": "steven truitt", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "steven truitt is an author of a document published in arxiv", + "source_ids": [ + 206 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: steven truitt\nType: PERSON" + }, + { + "src_entity_name": "jonathan larson", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "jonathan larson is an author of a document published in arxiv", + "source_ids": [ + 206 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: 
jonathan larson\nType: PERSON" + }, + { + "src_entity_name": "from local to global a graph rag approach to query focused summarization", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 10.0, + "description": "the document from local to global a graph rag approach to query focused summarization is published in arxiv", + "source_ids": [ + 206 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: from local to global a graph rag approach to query focused summarization\nType: BOOK" + }, + { + "src_entity_name": "hipporag", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 10.0, + "description": "the paper describing hipporag was published as a preprint on arxiv", + "source_ids": [ + 209 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: hipporag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "g retriever", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "g retriever is published on the arxiv platform", + "source_ids": [ + 211 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: g retriever\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "rag and rau a survey on retrieval augmented language model in natural language processing", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "the survey paper was published on the arxiv preprint server", + "source_ids": [ + 212 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: rag and rau a survey on retrieval augmented language model in natural language processing\nType: BOOK" + }, + { + "src_entity_name": "soyeong jeong", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "soyeong jeong s work was published on arxiv", + "source_ids": [ + 213 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: soyeong jeong\nType: PERSON" + }, + { + 
"src_entity_name": "jinheon baek", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "jinheon baek s work was published on arxiv", + "source_ids": [ + 213 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: jinheon baek\nType: PERSON" + }, + { + "src_entity_name": "adaptive rag", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "the adaptive rag preprint is hosted on arxiv", + "source_ids": [ + 213 + ], + "source": "Name: arxiv\nType: PUBLICATION_VENUE", + "target": "Name: adaptive rag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "arxiv 2501 02235", + "tgt_entity_name": "preprint", + "relation_name": "", + "weight": 9.0, + "description": "arxiv 2501 02235 is identified as a preprint document", + "source_ids": [ + 195 + ], + "source": "Name: preprint\nType: FILE_TYPE", + "target": "Name: arxiv 2501 02235\nType: FILE_TYPE" + }, + { + "src_entity_name": "camille barboule", + "tgt_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "relation_name": "", + "weight": 10.0, + "description": "camille barboule is an author of the survey paper", + "source_ids": [ + 195 + ], + "source": "Name: camille barboule\nType: PERSON", + "target": "Name: survey on question answering over visually rich documents methods challenges and trends\nType: BOOK" + }, + { + "src_entity_name": "camille barboule", + "tgt_entity_name": "benjamin piwowarski", + "relation_name": "", + "weight": 8.0, + "description": "camille barboule and benjamin piwowarski are co authors of the same survey paper", + "source_ids": [ + 195 + ], + "source": "Name: camille barboule\nType: PERSON", + "target": "Name: benjamin piwowarski\nType: PERSON" + }, + { + "src_entity_name": "camille barboule", + "tgt_entity_name": "yoan chabot", + "relation_name": "", + "weight": 8.0, + "description": "camille barboule and yoan chabot are co authors of the same 
survey paper", + "source_ids": [ + 195 + ], + "source": "Name: camille barboule\nType: PERSON", + "target": "Name: yoan chabot\nType: PERSON" + }, + { + "src_entity_name": "benjamin piwowarski", + "tgt_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "relation_name": "", + "weight": 10.0, + "description": "benjamin piwowarski is an author of the survey paper", + "source_ids": [ + 195 + ], + "source": "Name: benjamin piwowarski\nType: PERSON", + "target": "Name: survey on question answering over visually rich documents methods challenges and trends\nType: BOOK" + }, + { + "src_entity_name": "benjamin piwowarski", + "tgt_entity_name": "yoan chabot", + "relation_name": "", + "weight": 8.0, + "description": "benjamin piwowarski and yoan chabot are co authors of the same survey paper", + "source_ids": [ + 195 + ], + "source": "Name: benjamin piwowarski\nType: PERSON", + "target": "Name: yoan chabot\nType: PERSON" + }, + { + "src_entity_name": "yoan chabot", + "tgt_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "relation_name": "", + "weight": 10.0, + "description": "yoan chabot is an author of the survey paper", + "source_ids": [ + 195 + ], + "source": "Name: yoan chabot\nType: PERSON", + "target": "Name: survey on question answering over visually rich documents methods challenges and trends\nType: BOOK" + }, + { + "src_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "tgt_entity_name": "arxiv 2501 02235", + "relation_name": "", + "weight": 9.0, + "description": "the survey paper is identified by the preprint number arxiv 2501 02235", + "source_ids": [ + 195 + ], + "source": "Name: survey on question answering over visually rich documents methods challenges and trends\nType: BOOK", + "target": "Name: arxiv 2501 02235\nType: FILE_TYPE" + }, + { + "src_entity_name": "survey on question answering 
over visually rich documents methods challenges and trends", + "tgt_entity_name": "visually rich documents", + "relation_name": "", + "weight": 9.0, + "description": "the survey specifically addresses visually rich documents", + "source_ids": [ + 195 + ], + "source": "Name: survey on question answering over visually rich documents methods challenges and trends\nType: BOOK", + "target": "Name: visually rich documents\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "tgt_entity_name": "methods", + "relation_name": "", + "weight": 8.0, + "description": "the survey covers various methods used in the field", + "source_ids": [ + 195 + ], + "source": "Name: survey on question answering over visually rich documents methods challenges and trends\nType: BOOK", + "target": "Name: methods\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "tgt_entity_name": "challenges", + "relation_name": "", + "weight": 8.0, + "description": "the survey discusses the challenges present in the field", + "source_ids": [ + 195 + ], + "source": "Name: survey on question answering over visually rich documents methods challenges and trends\nType: BOOK", + "target": "Name: challenges\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "tgt_entity_name": "trends", + "relation_name": "", + "weight": 8.0, + "description": "the survey outlines the trends in the research area", + "source_ids": [ + 195 + ], + "source": "Name: survey on question answering over visually rich documents methods challenges and trends\nType: BOOK", + "target": "Name: trends\nType: RESEARCH_FIELD" + }, + { + "src_entity_name": "yukun cao", + "tgt_entity_name": "lego graphrag", + "relation_name": "", + "weight": 10.0, + "description": 
"yukun cao is an author of the paper describing lego graphrag", + "source_ids": [ + 196 + ], + "source": "Name: yukun cao\nType: PERSON", + "target": "Name: lego graphrag\nType: PRODUCT" + }, + { + "src_entity_name": "yukun cao", + "tgt_entity_name": "proc vldb endow", + "relation_name": "", + "weight": 9.0, + "description": "yukun cao is an author of a paper published in proc vldb endow", + "source_ids": [ + 196 + ], + "source": "Name: yukun cao\nType: PERSON", + "target": "Name: proc vldb endow\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "yukun cao", + "tgt_entity_name": "zengyi gao", + "relation_name": "", + "weight": 8.0, + "description": "yukun cao and zengyi gao are co authors on the same paper", + "source_ids": [ + 196 + ], + "source": "Name: yukun cao\nType: PERSON", + "target": "Name: zengyi gao\nType: PERSON" + }, + { + "src_entity_name": "yukun cao", + "tgt_entity_name": "zhiyang li", + "relation_name": "", + "weight": 8.0, + "description": "yukun cao and zhiyang li are co authors on the same paper", + "source_ids": [ + 196 + ], + "source": "Name: yukun cao\nType: PERSON", + "target": "Name: zhiyang li\nType: PERSON" + }, + { + "src_entity_name": "yukun cao", + "tgt_entity_name": "xike xie", + "relation_name": "", + "weight": 8.0, + "description": "yukun cao and xike xie are co authors on the same paper", + "source_ids": [ + 196 + ], + "source": "Name: yukun cao\nType: PERSON", + "target": "Name: xike xie\nType: PERSON" + }, + { + "src_entity_name": "yukun cao", + "tgt_entity_name": "s kevin zhou", + "relation_name": "", + "weight": 8.0, + "description": "yukun cao and s kevin zhou are co authors on the same paper", + "source_ids": [ + 196 + ], + "source": "Name: yukun cao\nType: PERSON", + "target": "Name: s kevin zhou\nType: PERSON" + }, + { + "src_entity_name": "yukun cao", + "tgt_entity_name": "jianliang xu", + "relation_name": "", + "weight": 8.0, + "description": "yukun cao and jianliang xu are co authors on the same paper", + 
"source_ids": [ + 196 + ], + "source": "Name: yukun cao\nType: PERSON", + "target": "Name: jianliang xu\nType: PERSON" + }, + { + "src_entity_name": "zengyi gao", + "tgt_entity_name": "lego graphrag", + "relation_name": "", + "weight": 10.0, + "description": "zengyi gao is an author of the paper describing lego graphrag", + "source_ids": [ + 196 + ], + "source": "Name: zengyi gao\nType: PERSON", + "target": "Name: lego graphrag\nType: PRODUCT" + }, + { + "src_entity_name": "zengyi gao", + "tgt_entity_name": "proc vldb endow", + "relation_name": "", + "weight": 9.0, + "description": "zengyi gao is an author of a paper published in proc vldb endow", + "source_ids": [ + 196 + ], + "source": "Name: zengyi gao\nType: PERSON", + "target": "Name: proc vldb endow\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "zengyi gao", + "tgt_entity_name": "zhiyang li", + "relation_name": "", + "weight": 8.0, + "description": "zengyi gao and zhiyang li are co authors on the same paper", + "source_ids": [ + 196 + ], + "source": "Name: zengyi gao\nType: PERSON", + "target": "Name: zhiyang li\nType: PERSON" + }, + { + "src_entity_name": "zengyi gao", + "tgt_entity_name": "xike xie", + "relation_name": "", + "weight": 8.0, + "description": "zengyi gao and xike xie are co authors on the same paper", + "source_ids": [ + 196 + ], + "source": "Name: zengyi gao\nType: PERSON", + "target": "Name: xike xie\nType: PERSON" + }, + { + "src_entity_name": "zengyi gao", + "tgt_entity_name": "s kevin zhou", + "relation_name": "", + "weight": 8.0, + "description": "zengyi gao and s kevin zhou are co authors on the same paper", + "source_ids": [ + 196 + ], + "source": "Name: zengyi gao\nType: PERSON", + "target": "Name: s kevin zhou\nType: PERSON" + }, + { + "src_entity_name": "zengyi gao", + "tgt_entity_name": "jianliang xu", + "relation_name": "", + "weight": 8.0, + "description": "zengyi gao and jianliang xu are co authors on the same paper", + "source_ids": [ + 196 + ], + "source": "Name: 
zengyi gao\nType: PERSON", + "target": "Name: jianliang xu\nType: PERSON" + }, + { + "src_entity_name": "zhiyang li", + "tgt_entity_name": "lego graphrag", + "relation_name": "", + "weight": 10.0, + "description": "zhiyang li is an author of the paper describing lego graphrag", + "source_ids": [ + 196 + ], + "source": "Name: zhiyang li\nType: PERSON", + "target": "Name: lego graphrag\nType: PRODUCT" + }, + { + "src_entity_name": "zhiyang li", + "tgt_entity_name": "proc vldb endow", + "relation_name": "", + "weight": 9.0, + "description": "zhiyang li is an author of a paper published in proc vldb endow", + "source_ids": [ + 196 + ], + "source": "Name: zhiyang li\nType: PERSON", + "target": "Name: proc vldb endow\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "zhiyang li", + "tgt_entity_name": "xike xie", + "relation_name": "", + "weight": 8.0, + "description": "zhiyang li and xike xie are co authors on the same paper", + "source_ids": [ + 196 + ], + "source": "Name: zhiyang li\nType: PERSON", + "target": "Name: xike xie\nType: PERSON" + }, + { + "src_entity_name": "zhiyang li", + "tgt_entity_name": "s kevin zhou", + "relation_name": "", + "weight": 8.0, + "description": "zhiyang li and s kevin zhou are co authors on the same paper", + "source_ids": [ + 196 + ], + "source": "Name: zhiyang li\nType: PERSON", + "target": "Name: s kevin zhou\nType: PERSON" + }, + { + "src_entity_name": "zhiyang li", + "tgt_entity_name": "jianliang xu", + "relation_name": "", + "weight": 8.0, + "description": "zhiyang li and jianliang xu are co authors on the same paper", + "source_ids": [ + 196 + ], + "source": "Name: zhiyang li\nType: PERSON", + "target": "Name: jianliang xu\nType: PERSON" + }, + { + "src_entity_name": "xike xie", + "tgt_entity_name": "lego graphrag", + "relation_name": "", + "weight": 10.0, + "description": "xike xie is an author of the paper describing lego graphrag", + "source_ids": [ + 196 + ], + "source": "Name: xike xie\nType: PERSON", + "target": "Name: 
lego graphrag\nType: PRODUCT" + }, + { + "src_entity_name": "xike xie", + "tgt_entity_name": "proc vldb endow", + "relation_name": "", + "weight": 9.0, + "description": "xike xie is an author of a paper published in proc vldb endow", + "source_ids": [ + 196 + ], + "source": "Name: xike xie\nType: PERSON", + "target": "Name: proc vldb endow\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "xike xie", + "tgt_entity_name": "s kevin zhou", + "relation_name": "", + "weight": 8.0, + "description": "xike xie and s kevin zhou are co authors on the same paper", + "source_ids": [ + 196 + ], + "source": "Name: xike xie\nType: PERSON", + "target": "Name: s kevin zhou\nType: PERSON" + }, + { + "src_entity_name": "xike xie", + "tgt_entity_name": "jianliang xu", + "relation_name": "", + "weight": 8.0, + "description": "xike xie and jianliang xu are co authors on the same paper", + "source_ids": [ + 196 + ], + "source": "Name: xike xie\nType: PERSON", + "target": "Name: jianliang xu\nType: PERSON" + }, + { + "src_entity_name": "s kevin zhou", + "tgt_entity_name": "lego graphrag", + "relation_name": "", + "weight": 10.0, + "description": "s kevin zhou is an author of the paper describing lego graphrag", + "source_ids": [ + 196 + ], + "source": "Name: s kevin zhou\nType: PERSON", + "target": "Name: lego graphrag\nType: PRODUCT" + }, + { + "src_entity_name": "s kevin zhou", + "tgt_entity_name": "proc vldb endow", + "relation_name": "", + "weight": 9.0, + "description": "s kevin zhou is an author of a paper published in proc vldb endow", + "source_ids": [ + 196 + ], + "source": "Name: s kevin zhou\nType: PERSON", + "target": "Name: proc vldb endow\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "s kevin zhou", + "tgt_entity_name": "jianliang xu", + "relation_name": "", + "weight": 8.0, + "description": "s kevin zhou and jianliang xu are co authors on the same paper", + "source_ids": [ + 196 + ], + "source": "Name: s kevin zhou\nType: PERSON", + "target": "Name: 
jianliang xu\nType: PERSON" + }, + { + "src_entity_name": "jianliang xu", + "tgt_entity_name": "lego graphrag", + "relation_name": "", + "weight": 10.0, + "description": "jianliang xu is an author of the paper describing lego graphrag", + "source_ids": [ + 196 + ], + "source": "Name: jianliang xu\nType: PERSON", + "target": "Name: lego graphrag\nType: PRODUCT" + }, + { + "src_entity_name": "jianliang xu", + "tgt_entity_name": "proc vldb endow", + "relation_name": "", + "weight": 9.0, + "description": "jianliang xu is an author of a paper published in proc vldb endow", + "source_ids": [ + 196 + ], + "source": "Name: jianliang xu\nType: PERSON", + "target": "Name: proc vldb endow\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "lego graphrag", + "tgt_entity_name": "proc vldb endow", + "relation_name": "", + "weight": 10.0, + "description": "lego graphrag is the subject of a paper published in proc vldb endow", + "source_ids": [ + 196 + ], + "source": "Name: lego graphrag\nType: PRODUCT", + "target": "Name: proc vldb endow\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "lego graphrag", + "tgt_entity_name": "june 2025", + "relation_name": "", + "weight": 9.0, + "description": "lego graphrag was published in june 2025", + "source_ids": [ + 196 + ], + "source": "Name: lego graphrag\nType: PRODUCT", + "target": "Name: june 2025\nType: DATE" + }, + { + "src_entity_name": "lego graphrag", + "tgt_entity_name": "design space exploration", + "relation_name": "", + "weight": 10.0, + "description": "lego graphrag is developed specifically for design space exploration", + "source_ids": [ + 196 + ], + "source": "Name: lego graphrag\nType: PRODUCT", + "target": "Name: design space exploration\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "lego graphrag", + "tgt_entity_name": "graph based retrieval augmented generation", + "relation_name": "", + "weight": 10.0, + "description": "lego graphrag is a modularized version of graph based retrieval augmented 
generation", + "source_ids": [ + 196 + ], + "source": "Name: lego graphrag\nType: PRODUCT", + "target": "Name: graph based retrieval augmented generation\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "lego graphrag", + "tgt_entity_name": "modularizing", + "relation_name": "", + "weight": 9.0, + "description": "the paper describes the process of modularizing graph based retrieval augmented generation to create lego graphrag", + "source_ids": [ + 196 + ], + "source": "Name: lego graphrag\nType: PRODUCT", + "target": "Name: modularizing\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "lego graphrag", + "tgt_entity_name": "https doi org 10 14778 3748191 3748194", + "relation_name": "", + "weight": 10.0, + "description": "the paper describing lego graphrag is accessible via the provided doi link", + "source_ids": [ + 196 + ], + "source": "Name: lego graphrag\nType: PRODUCT", + "target": "Name: https doi org 10 14778 3748191 3748194\nType: URL" + }, + { + "src_entity_name": "proc vldb endow", + "tgt_entity_name": "june 2025", + "relation_name": "", + "weight": 9.0, + "description": "proc vldb endow published the paper in june 2025", + "source_ids": [ + 196 + ], + "source": "Name: proc vldb endow\nType: PUBLICATION_VENUE", + "target": "Name: june 2025\nType: DATE" + }, + { + "src_entity_name": "proc vldb endow", + "tgt_entity_name": "18", + "relation_name": "", + "weight": 8.0, + "description": "the paper was published in volume 18 of proc vldb endow", + "source_ids": [ + 196 + ], + "source": "Name: proc vldb endow\nType: PUBLICATION_VENUE", + "target": "Name: 18\nType: MEASUREMENT" + }, + { + "src_entity_name": "proc vldb endow", + "tgt_entity_name": "3269 3283", + "relation_name": "", + "weight": 8.0, + "description": "the paper appears on pages 3269 3283 of proc vldb endow", + "source_ids": [ + 196 + ], + "source": "Name: proc vldb endow\nType: PUBLICATION_VENUE", + "target": "Name: 3269 3283\nType: MEASUREMENT" + }, + { + "src_entity_name": 
"chengliang chai", + "tgt_entity_name": "doctopus", + "relation_name": "", + "weight": 9.0, + "description": "chengliang chai is an author of the paper describing the doctopus system", + "source_ids": [ + 197 + ], + "source": "Name: chengliang chai\nType: PERSON", + "target": "Name: doctopus\nType: PRODUCT" + }, + { + "src_entity_name": "chengliang chai", + "tgt_entity_name": "budget aware structural table extraction", + "relation_name": "", + "weight": 8.0, + "description": "chengliang chai is an author of the work on budget aware structural table extraction", + "source_ids": [ + 197 + ], + "source": "Name: chengliang chai\nType: PERSON", + "target": "Name: budget aware structural table extraction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "chengliang chai", + "tgt_entity_name": "unstructured documents", + "relation_name": "", + "weight": 8.0, + "description": "chengliang chai is an author of the work involving unstructured documents", + "source_ids": [ + 197 + ], + "source": "Name: chengliang chai\nType: PERSON", + "target": "Name: unstructured documents\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "chengliang chai", + "tgt_entity_name": "haipipe", + "relation_name": "", + "weight": 9.0, + "description": "chengliang chai is an author of the paper describing haipipe", + "source_ids": [ + 200 + ], + "source": "Name: chengliang chai\nType: PERSON", + "target": "Name: haipipe\nType: PRODUCT" + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "chengliang chai", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and chengliang chai are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: chengliang chai\nType: PERSON", + "target": "Name: sibei chen\nType: PERSON" + }, + { + "src_entity_name": "nan tang", + "tgt_entity_name": "chengliang chai", + "relation_name": "", + "weight": 8.0, + "description": "nan tang and chengliang chai are co authors on the same paper", + "source_ids": [ + 200 + 
], + "source": "Name: chengliang chai\nType: PERSON", + "target": "Name: nan tang\nType: PERSON" + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "chengliang chai", + "relation_name": "", + "weight": 8.0, + "description": "ju fan and chengliang chai are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: chengliang chai\nType: PERSON", + "target": "Name: ju fan\nType: PERSON" + }, + { + "src_entity_name": "xuemi yan", + "tgt_entity_name": "chengliang chai", + "relation_name": "", + "weight": 8.0, + "description": "xuemi yan and chengliang chai are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: chengliang chai\nType: PERSON", + "target": "Name: xuemi yan\nType: PERSON" + }, + { + "src_entity_name": "chengliang chai", + "tgt_entity_name": "guoliang li", + "relation_name": "", + "weight": 8.0, + "description": "chengliang chai and guoliang li are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: chengliang chai\nType: PERSON", + "target": "Name: guoliang li\nType: PERSON" + }, + { + "src_entity_name": "chengliang chai", + "tgt_entity_name": "xiaoyong du", + "relation_name": "", + "weight": 8.0, + "description": "chengliang chai and xiaoyong du are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: chengliang chai\nType: PERSON", + "target": "Name: xiaoyong du\nType: PERSON" + }, + { + "src_entity_name": "chengliang chai", + "tgt_entity_name": "acm", + "relation_name": "", + "weight": 7.0, + "description": "chengliang chai is an author of a paper published by the acm", + "source_ids": [ + 200 + ], + "source": "Name: chengliang chai\nType: PERSON", + "target": "Name: acm\nType: ORGANIZATION" + }, + { + "src_entity_name": "jiajun li", + "tgt_entity_name": "doctopus", + "relation_name": "", + "weight": 9.0, + "description": "jiajun li is an author of the paper describing the doctopus system", + "source_ids": [ + 197 + ], + "source": "Name: jiajun 
li\nType: PERSON", + "target": "Name: doctopus\nType: PRODUCT" + }, + { + "src_entity_name": "jiajun li", + "tgt_entity_name": "budget aware structural table extraction", + "relation_name": "", + "weight": 8.0, + "description": "jiajun li is an author of the work on budget aware structural table extraction", + "source_ids": [ + 197 + ], + "source": "Name: jiajun li\nType: PERSON", + "target": "Name: budget aware structural table extraction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "jiajun li", + "tgt_entity_name": "unstructured documents", + "relation_name": "", + "weight": 8.0, + "description": "jiajun li is an author of the work involving unstructured documents", + "source_ids": [ + 197 + ], + "source": "Name: jiajun li\nType: PERSON", + "target": "Name: unstructured documents\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "yuhao deng", + "tgt_entity_name": "doctopus", + "relation_name": "", + "weight": 9.0, + "description": "yuhao deng is an author of the paper describing the doctopus system", + "source_ids": [ + 197 + ], + "source": "Name: yuhao deng\nType: PERSON", + "target": "Name: doctopus\nType: PRODUCT" + }, + { + "src_entity_name": "yuhao deng", + "tgt_entity_name": "budget aware structural table extraction", + "relation_name": "", + "weight": 8.0, + "description": "yuhao deng is an author of the work on budget aware structural table extraction", + "source_ids": [ + 197 + ], + "source": "Name: yuhao deng\nType: PERSON", + "target": "Name: budget aware structural table extraction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "yuhao deng", + "tgt_entity_name": "unstructured documents", + "relation_name": "", + "weight": 8.0, + "description": "yuhao deng is an author of the work involving unstructured documents", + "source_ids": [ + 197 + ], + "source": "Name: yuhao deng\nType: PERSON", + "target": "Name: unstructured documents\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "yuanhao zhong", + "tgt_entity_name": 
"doctopus", + "relation_name": "", + "weight": 9.0, + "description": "yuanhao zhong is an author of the paper describing the doctopus system", + "source_ids": [ + 197 + ], + "source": "Name: yuanhao zhong\nType: PERSON", + "target": "Name: doctopus\nType: PRODUCT" + }, + { + "src_entity_name": "yuanhao zhong", + "tgt_entity_name": "budget aware structural table extraction", + "relation_name": "", + "weight": 8.0, + "description": "yuanhao zhong is an author of the work on budget aware structural table extraction", + "source_ids": [ + 197 + ], + "source": "Name: yuanhao zhong\nType: PERSON", + "target": "Name: budget aware structural table extraction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "yuanhao zhong", + "tgt_entity_name": "unstructured documents", + "relation_name": "", + "weight": 8.0, + "description": "yuanhao zhong is an author of the work involving unstructured documents", + "source_ids": [ + 197 + ], + "source": "Name: yuanhao zhong\nType: PERSON", + "target": "Name: unstructured documents\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "ye yuan", + "tgt_entity_name": "doctopus", + "relation_name": "", + "weight": 9.0, + "description": "ye yuan is an author of the paper describing the doctopus system", + "source_ids": [ + 197 + ], + "source": "Name: ye yuan\nType: PERSON", + "target": "Name: doctopus\nType: PRODUCT" + }, + { + "src_entity_name": "ye yuan", + "tgt_entity_name": "budget aware structural table extraction", + "relation_name": "", + "weight": 8.0, + "description": "ye yuan is an author of the work on budget aware structural table extraction", + "source_ids": [ + 197 + ], + "source": "Name: ye yuan\nType: PERSON", + "target": "Name: budget aware structural table extraction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "ye yuan", + "tgt_entity_name": "unstructured documents", + "relation_name": "", + "weight": 8.0, + "description": "ye yuan is an author of the work involving unstructured documents", + "source_ids": [ 
+ 197 + ], + "source": "Name: ye yuan\nType: PERSON", + "target": "Name: unstructured documents\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "guoren wang", + "tgt_entity_name": "doctopus", + "relation_name": "", + "weight": 9.0, + "description": "guoren wang is an author of the paper describing the doctopus system", + "source_ids": [ + 197 + ], + "source": "Name: guoren wang\nType: PERSON", + "target": "Name: doctopus\nType: PRODUCT" + }, + { + "src_entity_name": "guoren wang", + "tgt_entity_name": "budget aware structural table extraction", + "relation_name": "", + "weight": 8.0, + "description": "guoren wang is an author of the work on budget aware structural table extraction", + "source_ids": [ + 197 + ], + "source": "Name: guoren wang\nType: PERSON", + "target": "Name: budget aware structural table extraction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "guoren wang", + "tgt_entity_name": "unstructured documents", + "relation_name": "", + "weight": 8.0, + "description": "guoren wang is an author of the work involving unstructured documents", + "source_ids": [ + 197 + ], + "source": "Name: guoren wang\nType: PERSON", + "target": "Name: unstructured documents\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "lei cao", + "tgt_entity_name": "doctopus", + "relation_name": "", + "weight": 9.0, + "description": "lei cao is an author of the paper describing the doctopus system", + "source_ids": [ + 197 + ], + "source": "Name: lei cao\nType: PERSON", + "target": "Name: doctopus\nType: PRODUCT" + }, + { + "src_entity_name": "lei cao", + "tgt_entity_name": "budget aware structural table extraction", + "relation_name": "", + "weight": 8.0, + "description": "lei cao is an author of the work on budget aware structural table extraction", + "source_ids": [ + 197 + ], + "source": "Name: lei cao\nType: PERSON", + "target": "Name: budget aware structural table extraction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "lei cao", + 
"tgt_entity_name": "unstructured documents", + "relation_name": "", + "weight": 8.0, + "description": "lei cao is an author of the work involving unstructured documents", + "source_ids": [ + 197 + ], + "source": "Name: lei cao\nType: PERSON", + "target": "Name: unstructured documents\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "doctopus", + "tgt_entity_name": "budget aware structural table extraction", + "relation_name": "", + "weight": 10.0, + "description": "doctopus is the system designed to perform budget aware structural table extraction", + "source_ids": [ + 197 + ], + "source": "Name: doctopus\nType: PRODUCT", + "target": "Name: budget aware structural table extraction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "doctopus", + "tgt_entity_name": "unstructured documents", + "relation_name": "", + "weight": 10.0, + "description": "doctopus processes unstructured documents to extract structural tables", + "source_ids": [ + 197 + ], + "source": "Name: doctopus\nType: PRODUCT", + "target": "Name: unstructured documents\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "ilias chalkidis", + "tgt_entity_name": "legal bert", + "relation_name": "", + "weight": 9.0, + "description": "ilias chalkidis is an author of the paper introducing the legal bert model", + "source_ids": [ + 198 + ], + "source": "Name: ilias chalkidis\nType: PERSON", + "target": "Name: legal bert\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "ilias chalkidis", + "tgt_entity_name": "manos fergadiotis", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ], + "source": "Name: ilias chalkidis\nType: PERSON", + "target": "Name: manos fergadiotis\nType: PERSON" + }, + { + "src_entity_name": "ilias chalkidis", + "tgt_entity_name": "prodromos malakasiotis", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ], + 
"source": "Name: ilias chalkidis\nType: PERSON", + "target": "Name: prodromos malakasiotis\nType: PERSON" + }, + { + "src_entity_name": "ilias chalkidis", + "tgt_entity_name": "nikolaos aletras", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ], + "source": "Name: ilias chalkidis\nType: PERSON", + "target": "Name: nikolaos aletras\nType: PERSON" + }, + { + "src_entity_name": "ilias chalkidis", + "tgt_entity_name": "ion androutsopoulos", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ], + "source": "Name: ilias chalkidis\nType: PERSON", + "target": "Name: ion androutsopoulos\nType: PERSON" + }, + { + "src_entity_name": "manos fergadiotis", + "tgt_entity_name": "legal bert", + "relation_name": "", + "weight": 9.0, + "description": "manos fergadiotis is an author of the paper introducing the legal bert model", + "source_ids": [ + 198 + ], + "source": "Name: manos fergadiotis\nType: PERSON", + "target": "Name: legal bert\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "manos fergadiotis", + "tgt_entity_name": "prodromos malakasiotis", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ], + "source": "Name: manos fergadiotis\nType: PERSON", + "target": "Name: prodromos malakasiotis\nType: PERSON" + }, + { + "src_entity_name": "manos fergadiotis", + "tgt_entity_name": "nikolaos aletras", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ], + "source": "Name: manos fergadiotis\nType: PERSON", + "target": "Name: nikolaos aletras\nType: PERSON" + }, + { + "src_entity_name": "manos fergadiotis", + "tgt_entity_name": "ion androutsopoulos", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + 
"source_ids": [ + 198 + ], + "source": "Name: manos fergadiotis\nType: PERSON", + "target": "Name: ion androutsopoulos\nType: PERSON" + }, + { + "src_entity_name": "prodromos malakasiotis", + "tgt_entity_name": "legal bert", + "relation_name": "", + "weight": 9.0, + "description": "prodromos malakasiotis is an author of the paper introducing the legal bert model", + "source_ids": [ + 198 + ], + "source": "Name: prodromos malakasiotis\nType: PERSON", + "target": "Name: legal bert\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "prodromos malakasiotis", + "tgt_entity_name": "nikolaos aletras", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ], + "source": "Name: prodromos malakasiotis\nType: PERSON", + "target": "Name: nikolaos aletras\nType: PERSON" + }, + { + "src_entity_name": "prodromos malakasiotis", + "tgt_entity_name": "ion androutsopoulos", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ], + "source": "Name: prodromos malakasiotis\nType: PERSON", + "target": "Name: ion androutsopoulos\nType: PERSON" + }, + { + "src_entity_name": "nikolaos aletras", + "tgt_entity_name": "legal bert", + "relation_name": "", + "weight": 9.0, + "description": "nikolaos aletras is an author of the paper introducing the legal bert model", + "source_ids": [ + 198 + ], + "source": "Name: nikolaos aletras\nType: PERSON", + "target": "Name: legal bert\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "nikolaos aletras", + "tgt_entity_name": "ion androutsopoulos", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ], + "source": "Name: nikolaos aletras\nType: PERSON", + "target": "Name: ion androutsopoulos\nType: PERSON" + }, + { + "src_entity_name": "ion androutsopoulos", + "tgt_entity_name": "legal bert", + "relation_name": "", + 
"weight": 9.0, + "description": "ion androutsopoulos is an author of the paper introducing the legal bert model", + "source_ids": [ + 198 + ], + "source": "Name: ion androutsopoulos\nType: PERSON", + "target": "Name: legal bert\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "legal bert", + "tgt_entity_name": "arxiv preprint arxiv 2010 02559", + "relation_name": "", + "weight": 10.0, + "description": "legal bert is the subject of the publication arxiv preprint arxiv 2010 02559", + "source_ids": [ + 198 + ], + "source": "Name: legal bert\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: arxiv preprint arxiv 2010 02559\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "legal bert", + "tgt_entity_name": "2020", + "relation_name": "", + "weight": 8.0, + "description": "legal bert was published in the year 2020", + "source_ids": [ + 198 + ], + "source": "Name: legal bert\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: 2020\nType: DATE" + }, + { + "src_entity_name": "legal bert", + "tgt_entity_name": "muppets", + "relation_name": "", + "weight": 9.0, + "description": "legal bert is described as being straight out of the muppets in the text", + "source_ids": [ + 198 + ], + "source": "Name: legal bert\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: muppets\nType: PRODUCT" + }, + { + "src_entity_name": "legal bert", + "tgt_entity_name": "law school", + "relation_name": "", + "weight": 9.0, + "description": "legal bert is described as coming straight out of law school in the text", + "source_ids": [ + 198 + ], + "source": "Name: legal bert\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: law school\nType: LOCATION" + }, + { + "src_entity_name": "vassilis christophides", + "tgt_entity_name": "2020", + "relation_name": "", + "weight": 8.0, + "description": "vassilis christophides is an author of a paper published in 2020", + "source_ids": [ + 202 + ], + "source": "Name: 2020\nType: DATE", + "target": "Name: vassilis christophides\nType: PERSON" + }, + { 
+ "src_entity_name": "vasilis efthymiou", + "tgt_entity_name": "2020", + "relation_name": "", + "weight": 8.0, + "description": "vasilis efthymiou is an author of a paper published in 2020", + "source_ids": [ + 202 + ], + "source": "Name: 2020\nType: DATE", + "target": "Name: vasilis efthymiou\nType: PERSON" + }, + { + "src_entity_name": "themis palpanas", + "tgt_entity_name": "2020", + "relation_name": "", + "weight": 8.0, + "description": "themis palpanas is an author of a paper published in 2020", + "source_ids": [ + 202 + ], + "source": "Name: 2020\nType: DATE", + "target": "Name: themis palpanas\nType: PERSON" + }, + { + "src_entity_name": "george papadakis", + "tgt_entity_name": "2020", + "relation_name": "", + "weight": 8.0, + "description": "george papadakis is an author of a paper published in 2020", + "source_ids": [ + 202 + ], + "source": "Name: 2020\nType: DATE", + "target": "Name: george papadakis\nType: PERSON" + }, + { + "src_entity_name": "kostas stefanidis", + "tgt_entity_name": "2020", + "relation_name": "", + "weight": 8.0, + "description": "kostas stefanidis is an author of a paper published in 2020", + "source_ids": [ + 202 + ], + "source": "Name: 2020\nType: DATE", + "target": "Name: kostas stefanidis\nType: PERSON" + }, + { + "src_entity_name": "2020", + "tgt_entity_name": "acm computing surveys", + "relation_name": "", + "weight": 9.0, + "description": "the paper was published in acm computing surveys in the year 2020", + "source_ids": [ + 202 + ], + "source": "Name: 2020\nType: DATE", + "target": "Name: acm computing surveys\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "2020", + "tgt_entity_name": "an overview of end to end entity resolution for big data", + "relation_name": "", + "weight": 10.0, + "description": "the paper titled an overview of end to end entity resolution for big data was published in 2020", + "source_ids": [ + 202 + ], + "source": "Name: 2020\nType: DATE", + "target": "Name: an overview of end to end entity 
resolution for big data\nType: BOOK" + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "auto formula", + "relation_name": "", + "weight": 9.0, + "description": "sibei chen is an author of the paper describing the auto formula system", + "source_ids": [ + 199 + ], + "source": "Name: sibei chen\nType: PERSON", + "target": "Name: auto formula\nType: PRODUCT" + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen is an author of a paper published in this venue", + "source_ids": [ + 199 + ], + "source": "Name: sibei chen\nType: PERSON", + "target": "Name: proceedings of the acm on management of data\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "yeye he", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and yeye he are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: sibei chen\nType: PERSON", + "target": "Name: yeye he\nType: PERSON" + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "weiwei cui", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and weiwei cui are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: sibei chen\nType: PERSON", + "target": "Name: weiwei cui\nType: PERSON" + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "ju fan", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and ju fan are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: sibei chen\nType: PERSON", + "target": "Name: ju fan\nType: PERSON" + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "song ge", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and song ge are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: sibei chen\nType: PERSON", + "target": "Name: song 
ge\nType: PERSON" + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "haidong zhang", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and haidong zhang are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: sibei chen\nType: PERSON", + "target": "Name: haidong zhang\nType: PERSON" + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "dongmei zhang", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and dongmei zhang are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: sibei chen\nType: PERSON", + "target": "Name: dongmei zhang\nType: PERSON" + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "surajit chaudhuri", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and surajit chaudhuri are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: sibei chen\nType: PERSON", + "target": "Name: surajit chaudhuri\nType: PERSON" + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "haipipe", + "relation_name": "", + "weight": 9.0, + "description": "sibei chen is an author of the paper describing haipipe", + "source_ids": [ + 200 + ], + "source": "Name: sibei chen\nType: PERSON", + "target": "Name: haipipe\nType: PRODUCT" + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "nan tang", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and nan tang are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: sibei chen\nType: PERSON", + "target": "Name: nan tang\nType: PERSON" + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "xuemi yan", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and xuemi yan are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: sibei chen\nType: PERSON", + "target": "Name: xuemi yan\nType: PERSON" + }, + { + "src_entity_name": "sibei chen", + 
"tgt_entity_name": "guoliang li", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and guoliang li are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: sibei chen\nType: PERSON", + "target": "Name: guoliang li\nType: PERSON" + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "xiaoyong du", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and xiaoyong du are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: sibei chen\nType: PERSON", + "target": "Name: xiaoyong du\nType: PERSON" + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "acm", + "relation_name": "", + "weight": 7.0, + "description": "sibei chen is an author of a paper published by the acm", + "source_ids": [ + 200 + ], + "source": "Name: sibei chen\nType: PERSON", + "target": "Name: acm\nType: ORGANIZATION" + }, + { + "src_entity_name": "yeye he", + "tgt_entity_name": "auto formula", + "relation_name": "", + "weight": 9.0, + "description": "yeye he is an author of the paper describing the auto formula system", + "source_ids": [ + 199 + ], + "source": "Name: yeye he\nType: PERSON", + "target": "Name: auto formula\nType: PRODUCT" + }, + { + "src_entity_name": "yeye he", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 8.0, + "description": "yeye he is an author of a paper published in this venue", + "source_ids": [ + 199 + ], + "source": "Name: yeye he\nType: PERSON", + "target": "Name: proceedings of the acm on management of data\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "yeye he", + "tgt_entity_name": "weiwei cui", + "relation_name": "", + "weight": 8.0, + "description": "yeye he and weiwei cui are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: yeye he\nType: PERSON", + "target": "Name: weiwei cui\nType: PERSON" + }, + { + "src_entity_name": "yeye he", + "tgt_entity_name": "ju fan", 
+ "relation_name": "", + "weight": 8.0, + "description": "yeye he and ju fan are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: yeye he\nType: PERSON", + "target": "Name: ju fan\nType: PERSON" + }, + { + "src_entity_name": "yeye he", + "tgt_entity_name": "song ge", + "relation_name": "", + "weight": 8.0, + "description": "yeye he and song ge are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: yeye he\nType: PERSON", + "target": "Name: song ge\nType: PERSON" + }, + { + "src_entity_name": "yeye he", + "tgt_entity_name": "haidong zhang", + "relation_name": "", + "weight": 8.0, + "description": "yeye he and haidong zhang are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: yeye he\nType: PERSON", + "target": "Name: haidong zhang\nType: PERSON" + }, + { + "src_entity_name": "yeye he", + "tgt_entity_name": "dongmei zhang", + "relation_name": "", + "weight": 8.0, + "description": "yeye he and dongmei zhang are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: yeye he\nType: PERSON", + "target": "Name: dongmei zhang\nType: PERSON" + }, + { + "src_entity_name": "yeye he", + "tgt_entity_name": "surajit chaudhuri", + "relation_name": "", + "weight": 8.0, + "description": "yeye he and surajit chaudhuri are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: yeye he\nType: PERSON", + "target": "Name: surajit chaudhuri\nType: PERSON" + }, + { + "src_entity_name": "weiwei cui", + "tgt_entity_name": "auto formula", + "relation_name": "", + "weight": 9.0, + "description": "weiwei cui is an author of the paper describing the auto formula system", + "source_ids": [ + 199 + ], + "source": "Name: weiwei cui\nType: PERSON", + "target": "Name: auto formula\nType: PRODUCT" + }, + { + "src_entity_name": "weiwei cui", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 8.0, + "description": 
"weiwei cui is an author of a paper published in this venue", + "source_ids": [ + 199 + ], + "source": "Name: weiwei cui\nType: PERSON", + "target": "Name: proceedings of the acm on management of data\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "weiwei cui", + "tgt_entity_name": "ju fan", + "relation_name": "", + "weight": 8.0, + "description": "weiwei cui and ju fan are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: weiwei cui\nType: PERSON", + "target": "Name: ju fan\nType: PERSON" + }, + { + "src_entity_name": "weiwei cui", + "tgt_entity_name": "song ge", + "relation_name": "", + "weight": 8.0, + "description": "weiwei cui and song ge are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: weiwei cui\nType: PERSON", + "target": "Name: song ge\nType: PERSON" + }, + { + "src_entity_name": "weiwei cui", + "tgt_entity_name": "haidong zhang", + "relation_name": "", + "weight": 8.0, + "description": "weiwei cui and haidong zhang are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: weiwei cui\nType: PERSON", + "target": "Name: haidong zhang\nType: PERSON" + }, + { + "src_entity_name": "weiwei cui", + "tgt_entity_name": "dongmei zhang", + "relation_name": "", + "weight": 8.0, + "description": "weiwei cui and dongmei zhang are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: weiwei cui\nType: PERSON", + "target": "Name: dongmei zhang\nType: PERSON" + }, + { + "src_entity_name": "weiwei cui", + "tgt_entity_name": "surajit chaudhuri", + "relation_name": "", + "weight": 8.0, + "description": "weiwei cui and surajit chaudhuri are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: weiwei cui\nType: PERSON", + "target": "Name: surajit chaudhuri\nType: PERSON" + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "auto formula", + "relation_name": "", + "weight": 9.0, + "description": "ju fan is an author of the 
paper describing the auto formula system", + "source_ids": [ + 199 + ], + "source": "Name: ju fan\nType: PERSON", + "target": "Name: auto formula\nType: PRODUCT" + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 8.0, + "description": "ju fan is an author of a paper published in this venue", + "source_ids": [ + 199 + ], + "source": "Name: ju fan\nType: PERSON", + "target": "Name: proceedings of the acm on management of data\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "song ge", + "relation_name": "", + "weight": 8.0, + "description": "ju fan and song ge are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: ju fan\nType: PERSON", + "target": "Name: song ge\nType: PERSON" + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "haidong zhang", + "relation_name": "", + "weight": 8.0, + "description": "ju fan and haidong zhang are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: ju fan\nType: PERSON", + "target": "Name: haidong zhang\nType: PERSON" + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "dongmei zhang", + "relation_name": "", + "weight": 8.0, + "description": "ju fan and dongmei zhang are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: ju fan\nType: PERSON", + "target": "Name: dongmei zhang\nType: PERSON" + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "surajit chaudhuri", + "relation_name": "", + "weight": 8.0, + "description": "ju fan and surajit chaudhuri are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: ju fan\nType: PERSON", + "target": "Name: surajit chaudhuri\nType: PERSON" + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "haipipe", + "relation_name": "", + "weight": 9.0, + "description": "ju fan is an author of the paper describing haipipe", + 
"source_ids": [ + 200 + ], + "source": "Name: ju fan\nType: PERSON", + "target": "Name: haipipe\nType: PRODUCT" + }, + { + "src_entity_name": "nan tang", + "tgt_entity_name": "ju fan", + "relation_name": "", + "weight": 8.0, + "description": "nan tang and ju fan are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: ju fan\nType: PERSON", + "target": "Name: nan tang\nType: PERSON" + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "xuemi yan", + "relation_name": "", + "weight": 8.0, + "description": "ju fan and xuemi yan are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: ju fan\nType: PERSON", + "target": "Name: xuemi yan\nType: PERSON" + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "guoliang li", + "relation_name": "", + "weight": 8.0, + "description": "ju fan and guoliang li are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: ju fan\nType: PERSON", + "target": "Name: guoliang li\nType: PERSON" + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "xiaoyong du", + "relation_name": "", + "weight": 8.0, + "description": "ju fan and xiaoyong du are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: ju fan\nType: PERSON", + "target": "Name: xiaoyong du\nType: PERSON" + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "acm", + "relation_name": "", + "weight": 7.0, + "description": "ju fan is an author of a paper published by the acm", + "source_ids": [ + 200 + ], + "source": "Name: ju fan\nType: PERSON", + "target": "Name: acm\nType: ORGANIZATION" + }, + { + "src_entity_name": "song ge", + "tgt_entity_name": "auto formula", + "relation_name": "", + "weight": 9.0, + "description": "song ge is an author of the paper describing the auto formula system", + "source_ids": [ + 199 + ], + "source": "Name: song ge\nType: PERSON", + "target": "Name: auto formula\nType: PRODUCT" + }, + { + "src_entity_name": "song ge", + 
"tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 8.0, + "description": "song ge is an author of a paper published in this venue", + "source_ids": [ + 199 + ], + "source": "Name: song ge\nType: PERSON", + "target": "Name: proceedings of the acm on management of data\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "song ge", + "tgt_entity_name": "haidong zhang", + "relation_name": "", + "weight": 8.0, + "description": "song ge and haidong zhang are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: song ge\nType: PERSON", + "target": "Name: haidong zhang\nType: PERSON" + }, + { + "src_entity_name": "song ge", + "tgt_entity_name": "dongmei zhang", + "relation_name": "", + "weight": 8.0, + "description": "song ge and dongmei zhang are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: song ge\nType: PERSON", + "target": "Name: dongmei zhang\nType: PERSON" + }, + { + "src_entity_name": "song ge", + "tgt_entity_name": "surajit chaudhuri", + "relation_name": "", + "weight": 8.0, + "description": "song ge and surajit chaudhuri are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: song ge\nType: PERSON", + "target": "Name: surajit chaudhuri\nType: PERSON" + }, + { + "src_entity_name": "haidong zhang", + "tgt_entity_name": "auto formula", + "relation_name": "", + "weight": 9.0, + "description": "haidong zhang is an author of the paper describing the auto formula system", + "source_ids": [ + 199 + ], + "source": "Name: haidong zhang\nType: PERSON", + "target": "Name: auto formula\nType: PRODUCT" + }, + { + "src_entity_name": "haidong zhang", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 8.0, + "description": "haidong zhang is an author of a paper published in this venue", + "source_ids": [ + 199 + ], + "source": "Name: haidong zhang\nType: PERSON", + "target": "Name: 
proceedings of the acm on management of data\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "haidong zhang", + "tgt_entity_name": "dongmei zhang", + "relation_name": "", + "weight": 8.0, + "description": "haidong zhang and dongmei zhang are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: haidong zhang\nType: PERSON", + "target": "Name: dongmei zhang\nType: PERSON" + }, + { + "src_entity_name": "haidong zhang", + "tgt_entity_name": "surajit chaudhuri", + "relation_name": "", + "weight": 8.0, + "description": "haidong zhang and surajit chaudhuri are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: haidong zhang\nType: PERSON", + "target": "Name: surajit chaudhuri\nType: PERSON" + }, + { + "src_entity_name": "dongmei zhang", + "tgt_entity_name": "auto formula", + "relation_name": "", + "weight": 9.0, + "description": "dongmei zhang is an author of the paper describing the auto formula system", + "source_ids": [ + 199 + ], + "source": "Name: dongmei zhang\nType: PERSON", + "target": "Name: auto formula\nType: PRODUCT" + }, + { + "src_entity_name": "dongmei zhang", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 8.0, + "description": "dongmei zhang is an author of a paper published in this venue", + "source_ids": [ + 199 + ], + "source": "Name: dongmei zhang\nType: PERSON", + "target": "Name: proceedings of the acm on management of data\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "dongmei zhang", + "tgt_entity_name": "surajit chaudhuri", + "relation_name": "", + "weight": 8.0, + "description": "dongmei zhang and surajit chaudhuri are co authors on the same paper", + "source_ids": [ + 199 + ], + "source": "Name: dongmei zhang\nType: PERSON", + "target": "Name: surajit chaudhuri\nType: PERSON" + }, + { + "src_entity_name": "surajit chaudhuri", + "tgt_entity_name": "auto formula", + "relation_name": "", + "weight": 9.0, + "description": 
"surajit chaudhuri is an author of the paper describing the auto formula system", + "source_ids": [ + 199 + ], + "source": "Name: surajit chaudhuri\nType: PERSON", + "target": "Name: auto formula\nType: PRODUCT" + }, + { + "src_entity_name": "surajit chaudhuri", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 8.0, + "description": "surajit chaudhuri is an author of a paper published in this venue", + "source_ids": [ + 199 + ], + "source": "Name: surajit chaudhuri\nType: PERSON", + "target": "Name: proceedings of the acm on management of data\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "auto formula", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 9.0, + "description": "auto formula is the subject of a paper published in this venue", + "source_ids": [ + 199 + ], + "source": "Name: auto formula\nType: PRODUCT", + "target": "Name: proceedings of the acm on management of data\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "auto formula", + "tgt_entity_name": "contrastive learning", + "relation_name": "", + "weight": 10.0, + "description": "auto formula uses contrastive learning as its core method", + "source_ids": [ + 199 + ], + "source": "Name: auto formula\nType: PRODUCT", + "target": "Name: contrastive learning\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "auto formula", + "tgt_entity_name": "spreadsheets", + "relation_name": "", + "weight": 9.0, + "description": "auto formula operates within the context of spreadsheets", + "source_ids": [ + 199 + ], + "source": "Name: auto formula\nType: PRODUCT", + "target": "Name: spreadsheets\nType: PRODUCT" + }, + { + "src_entity_name": "auto formula", + "tgt_entity_name": "formulas", + "relation_name": "", + "weight": 10.0, + "description": "auto formula is designed to recommend formulas", + "source_ids": [ + 199 + ], + "source": "Name: auto formula\nType: PRODUCT", + "target": 
"Name: formulas\nType: PRODUCT" + }, + { + "src_entity_name": "auto formula", + "tgt_entity_name": "table representations", + "relation_name": "", + "weight": 9.0, + "description": "auto formula relies on table representations for its learning process", + "source_ids": [ + 199 + ], + "source": "Name: auto formula\nType: PRODUCT", + "target": "Name: table representations\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "proceedings of the acm on management of data", + "tgt_entity_name": "1 27", + "relation_name": "", + "weight": 10.0, + "description": "the publication page range is 1 27", + "source_ids": [ + 199 + ], + "source": "Name: proceedings of the acm on management of data\nType: PUBLICATION_VENUE", + "target": "Name: 1 27\nType: MEASUREMENT" + }, + { + "src_entity_name": "haipipe", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 10.0, + "description": "haipipe is published in the proceedings of the acm on management of data", + "source_ids": [ + 200 + ], + "source": "Name: proceedings of the acm on management of data\nType: PUBLICATION_VENUE", + "target": "Name: haipipe\nType: PRODUCT" + }, + { + "src_entity_name": "contrastive learning", + "tgt_entity_name": "table representations", + "relation_name": "", + "weight": 9.0, + "description": "contrastive learning is applied to table representations", + "source_ids": [ + 199 + ], + "source": "Name: table representations\nType: DATASET_OR_CORPUS", + "target": "Name: contrastive learning\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "nan tang", + "tgt_entity_name": "haipipe", + "relation_name": "", + "weight": 9.0, + "description": "nan tang is an author of the paper describing haipipe", + "source_ids": [ + 200 + ], + "source": "Name: nan tang\nType: PERSON", + "target": "Name: haipipe\nType: PRODUCT" + }, + { + "src_entity_name": "nan tang", + "tgt_entity_name": "xuemi yan", + "relation_name": "", + "weight": 8.0, + "description": "nan 
tang and xuemi yan are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: nan tang\nType: PERSON", + "target": "Name: xuemi yan\nType: PERSON" + }, + { + "src_entity_name": "nan tang", + "tgt_entity_name": "guoliang li", + "relation_name": "", + "weight": 8.0, + "description": "nan tang and guoliang li are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: nan tang\nType: PERSON", + "target": "Name: guoliang li\nType: PERSON" + }, + { + "src_entity_name": "nan tang", + "tgt_entity_name": "xiaoyong du", + "relation_name": "", + "weight": 8.0, + "description": "nan tang and xiaoyong du are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: nan tang\nType: PERSON", + "target": "Name: xiaoyong du\nType: PERSON" + }, + { + "src_entity_name": "nan tang", + "tgt_entity_name": "acm", + "relation_name": "", + "weight": 7.0, + "description": "nan tang is an author of a paper published by the acm", + "source_ids": [ + 200 + ], + "source": "Name: nan tang\nType: PERSON", + "target": "Name: acm\nType: ORGANIZATION" + }, + { + "src_entity_name": "xuemi yan", + "tgt_entity_name": "haipipe", + "relation_name": "", + "weight": 9.0, + "description": "xuemi yan is an author of the paper describing haipipe", + "source_ids": [ + 200 + ], + "source": "Name: xuemi yan\nType: PERSON", + "target": "Name: haipipe\nType: PRODUCT" + }, + { + "src_entity_name": "xuemi yan", + "tgt_entity_name": "guoliang li", + "relation_name": "", + "weight": 8.0, + "description": "xuemi yan and guoliang li are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: xuemi yan\nType: PERSON", + "target": "Name: guoliang li\nType: PERSON" + }, + { + "src_entity_name": "xuemi yan", + "tgt_entity_name": "xiaoyong du", + "relation_name": "", + "weight": 8.0, + "description": "xuemi yan and xiaoyong du are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: xuemi yan\nType: PERSON", 
+ "target": "Name: xiaoyong du\nType: PERSON" + }, + { + "src_entity_name": "xuemi yan", + "tgt_entity_name": "acm", + "relation_name": "", + "weight": 7.0, + "description": "xuemi yan is an author of a paper published by the acm", + "source_ids": [ + 200 + ], + "source": "Name: xuemi yan\nType: PERSON", + "target": "Name: acm\nType: ORGANIZATION" + }, + { + "src_entity_name": "guoliang li", + "tgt_entity_name": "haipipe", + "relation_name": "", + "weight": 9.0, + "description": "guoliang li is an author of the paper describing haipipe", + "source_ids": [ + 200 + ], + "source": "Name: guoliang li\nType: PERSON", + "target": "Name: haipipe\nType: PRODUCT" + }, + { + "src_entity_name": "guoliang li", + "tgt_entity_name": "xiaoyong du", + "relation_name": "", + "weight": 8.0, + "description": "guoliang li and xiaoyong du are co authors on the same paper", + "source_ids": [ + 200 + ], + "source": "Name: guoliang li\nType: PERSON", + "target": "Name: xiaoyong du\nType: PERSON" + }, + { + "src_entity_name": "guoliang li", + "tgt_entity_name": "acm", + "relation_name": "", + "weight": 7.0, + "description": "guoliang li is an author of a paper published by the acm", + "source_ids": [ + 200 + ], + "source": "Name: guoliang li\nType: PERSON", + "target": "Name: acm\nType: ORGANIZATION" + }, + { + "src_entity_name": "xiaoyong du", + "tgt_entity_name": "haipipe", + "relation_name": "", + "weight": 9.0, + "description": "xiaoyong du is an author of the paper describing haipipe", + "source_ids": [ + 200 + ], + "source": "Name: xiaoyong du\nType: PERSON", + "target": "Name: haipipe\nType: PRODUCT" + }, + { + "src_entity_name": "xiaoyong du", + "tgt_entity_name": "acm", + "relation_name": "", + "weight": 7.0, + "description": "xiaoyong du is an author of a paper published by the acm", + "source_ids": [ + 200 + ], + "source": "Name: xiaoyong du\nType: PERSON", + "target": "Name: acm\nType: ORGANIZATION" + }, + { + "src_entity_name": "haipipe", + "tgt_entity_name": "data 
preparation", + "relation_name": "", + "weight": 10.0, + "description": "haipipe is a system designed for data preparation", + "source_ids": [ + 200 + ], + "source": "Name: haipipe\nType: PRODUCT", + "target": "Name: data preparation\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "haipipe", + "tgt_entity_name": "human generated pipelines", + "relation_name": "", + "weight": 9.0, + "description": "haipipe combines human generated pipelines as part of its methodology", + "source_ids": [ + 200 + ], + "source": "Name: haipipe\nType: PRODUCT", + "target": "Name: human generated pipelines\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "haipipe", + "tgt_entity_name": "machine generated pipelines", + "relation_name": "", + "weight": 9.0, + "description": "haipipe combines machine generated pipelines as part of its methodology", + "source_ids": [ + 200 + ], + "source": "Name: haipipe\nType: PRODUCT", + "target": "Name: machine generated pipelines\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "haipipe", + "tgt_entity_name": "acm", + "relation_name": "", + "weight": 8.0, + "description": "haipipe is published by the acm organization", + "source_ids": [ + 200 + ], + "source": "Name: haipipe\nType: PRODUCT", + "target": "Name: acm\nType: ORGANIZATION" + }, + { + "src_entity_name": "jaemin cho", + "tgt_entity_name": "m3docrag", + "relation_name": "", + "weight": 10.0, + "description": "jaemin cho is an author of the m3docrag preprint", + "source_ids": [ + 201 + ], + "source": "Name: jaemin cho\nType: PERSON", + "target": "Name: m3docrag\nType: PRODUCT" + }, + { + "src_entity_name": "jaemin cho", + "tgt_entity_name": "debanjan mahata", + "relation_name": "", + "weight": 8.0, + "description": "jaemin cho and debanjan mahata are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ], + "source": "Name: jaemin cho\nType: PERSON", + "target": "Name: debanjan mahata\nType: PERSON" + }, + { + "src_entity_name": "jaemin cho", + "tgt_entity_name": 
"ozan irsoy", + "relation_name": "", + "weight": 8.0, + "description": "jaemin cho and ozan irsoy are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ], + "source": "Name: jaemin cho\nType: PERSON", + "target": "Name: ozan irsoy\nType: PERSON" + }, + { + "src_entity_name": "jaemin cho", + "tgt_entity_name": "yujie he", + "relation_name": "", + "weight": 8.0, + "description": "jaemin cho and yujie he are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ], + "source": "Name: jaemin cho\nType: PERSON", + "target": "Name: yujie he\nType: PERSON" + }, + { + "src_entity_name": "jaemin cho", + "tgt_entity_name": "mohit bansal", + "relation_name": "", + "weight": 8.0, + "description": "jaemin cho and mohit bansal are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ], + "source": "Name: jaemin cho\nType: PERSON", + "target": "Name: mohit bansal\nType: PERSON" + }, + { + "src_entity_name": "debanjan mahata", + "tgt_entity_name": "m3docrag", + "relation_name": "", + "weight": 10.0, + "description": "debanjan mahata is an author of the m3docrag preprint", + "source_ids": [ + 201 + ], + "source": "Name: debanjan mahata\nType: PERSON", + "target": "Name: m3docrag\nType: PRODUCT" + }, + { + "src_entity_name": "debanjan mahata", + "tgt_entity_name": "ozan irsoy", + "relation_name": "", + "weight": 8.0, + "description": "debanjan mahata and ozan irsoy are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ], + "source": "Name: debanjan mahata\nType: PERSON", + "target": "Name: ozan irsoy\nType: PERSON" + }, + { + "src_entity_name": "debanjan mahata", + "tgt_entity_name": "yujie he", + "relation_name": "", + "weight": 8.0, + "description": "debanjan mahata and yujie he are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ], + "source": "Name: debanjan mahata\nType: PERSON", + "target": "Name: yujie he\nType: PERSON" + }, + { + "src_entity_name": "debanjan mahata", + "tgt_entity_name": "mohit bansal", + 
"relation_name": "", + "weight": 8.0, + "description": "debanjan mahata and mohit bansal are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ], + "source": "Name: debanjan mahata\nType: PERSON", + "target": "Name: mohit bansal\nType: PERSON" + }, + { + "src_entity_name": "ozan irsoy", + "tgt_entity_name": "m3docrag", + "relation_name": "", + "weight": 10.0, + "description": "ozan irsoy is an author of the m3docrag preprint", + "source_ids": [ + 201 + ], + "source": "Name: ozan irsoy\nType: PERSON", + "target": "Name: m3docrag\nType: PRODUCT" + }, + { + "src_entity_name": "ozan irsoy", + "tgt_entity_name": "yujie he", + "relation_name": "", + "weight": 8.0, + "description": "ozan irsoy and yujie he are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ], + "source": "Name: ozan irsoy\nType: PERSON", + "target": "Name: yujie he\nType: PERSON" + }, + { + "src_entity_name": "ozan irsoy", + "tgt_entity_name": "mohit bansal", + "relation_name": "", + "weight": 8.0, + "description": "ozan irsoy and mohit bansal are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ], + "source": "Name: ozan irsoy\nType: PERSON", + "target": "Name: mohit bansal\nType: PERSON" + }, + { + "src_entity_name": "yujie he", + "tgt_entity_name": "m3docrag", + "relation_name": "", + "weight": 10.0, + "description": "yujie he is an author of the m3docrag preprint", + "source_ids": [ + 201 + ], + "source": "Name: yujie he\nType: PERSON", + "target": "Name: m3docrag\nType: PRODUCT" + }, + { + "src_entity_name": "yujie he", + "tgt_entity_name": "mohit bansal", + "relation_name": "", + "weight": 8.0, + "description": "yujie he and mohit bansal are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ], + "source": "Name: yujie he\nType: PERSON", + "target": "Name: mohit bansal\nType: PERSON" + }, + { + "src_entity_name": "mohit bansal", + "tgt_entity_name": "m3docrag", + "relation_name": "", + "weight": 10.0, + "description": "mohit bansal is an 
author of the m3docrag preprint", + "source_ids": [ + 201 + ], + "source": "Name: mohit bansal\nType: PERSON", + "target": "Name: m3docrag\nType: PRODUCT" + }, + { + "src_entity_name": "m3docrag", + "tgt_entity_name": "arxiv 2411 04952", + "relation_name": "", + "weight": 9.0, + "description": "m3docrag is identified by the file type arxiv 2411 04952", + "source_ids": [ + 201 + ], + "source": "Name: m3docrag\nType: PRODUCT", + "target": "Name: arxiv 2411 04952\nType: FILE_TYPE" + }, + { + "src_entity_name": "m3docrag", + "tgt_entity_name": "multi modal retrieval", + "relation_name": "", + "weight": 9.0, + "description": "m3docrag utilizes multi modal retrieval as its core technique", + "source_ids": [ + 201 + ], + "source": "Name: m3docrag\nType: PRODUCT", + "target": "Name: multi modal retrieval\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "m3docrag", + "tgt_entity_name": "multi page multidocument understanding", + "relation_name": "", + "weight": 10.0, + "description": "m3docrag is designed to solve the problem of multi page multidocument understanding", + "source_ids": [ + 201 + ], + "source": "Name: m3docrag\nType: PRODUCT", + "target": "Name: multi page multidocument understanding\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "m3docrag", + "tgt_entity_name": "arxiv preprint", + "relation_name": "", + "weight": 9.0, + "description": "m3docrag was published as an arxiv preprint", + "source_ids": [ + 201 + ], + "source": "Name: m3docrag\nType: PRODUCT", + "target": "Name: arxiv preprint\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "multi modal retrieval", + "tgt_entity_name": "multi page multidocument understanding", + "relation_name": "", + "weight": 8.0, + "description": "multi modal retrieval is identified as the necessary method for achieving multi page multidocument understanding", + "source_ids": [ + 201 + ], + "source": "Name: multi modal retrieval\nType: METHOD_OR_TECHNIQUE", + "target": "Name: multi page multidocument 
understanding\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "vassilis christophides", + "tgt_entity_name": "vasilis efthymiou", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ], + "source": "Name: vassilis christophides\nType: PERSON", + "target": "Name: vasilis efthymiou\nType: PERSON" + }, + { + "src_entity_name": "vassilis christophides", + "tgt_entity_name": "themis palpanas", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ], + "source": "Name: vassilis christophides\nType: PERSON", + "target": "Name: themis palpanas\nType: PERSON" + }, + { + "src_entity_name": "vassilis christophides", + "tgt_entity_name": "george papadakis", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ], + "source": "Name: vassilis christophides\nType: PERSON", + "target": "Name: george papadakis\nType: PERSON" + }, + { + "src_entity_name": "vassilis christophides", + "tgt_entity_name": "kostas stefanidis", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ], + "source": "Name: vassilis christophides\nType: PERSON", + "target": "Name: kostas stefanidis\nType: PERSON" + }, + { + "src_entity_name": "vassilis christophides", + "tgt_entity_name": "acm computing surveys", + "relation_name": "", + "weight": 8.0, + "description": "vassilis christophides is an author of a paper published in acm computing surveys", + "source_ids": [ + 202 + ], + "source": "Name: vassilis christophides\nType: PERSON", + "target": "Name: acm computing surveys\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "vassilis christophides", + "tgt_entity_name": "an overview of end to end entity resolution for big data", + "relation_name": "", + "weight": 10.0, + "description": "vassilis christophides is an 
author of the paper titled an overview of end to end entity resolution for big data", + "source_ids": [ + 202 + ], + "source": "Name: vassilis christophides\nType: PERSON", + "target": "Name: an overview of end to end entity resolution for big data\nType: BOOK" + }, + { + "src_entity_name": "vasilis efthymiou", + "tgt_entity_name": "themis palpanas", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ], + "source": "Name: vasilis efthymiou\nType: PERSON", + "target": "Name: themis palpanas\nType: PERSON" + }, + { + "src_entity_name": "vasilis efthymiou", + "tgt_entity_name": "george papadakis", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ], + "source": "Name: vasilis efthymiou\nType: PERSON", + "target": "Name: george papadakis\nType: PERSON" + }, + { + "src_entity_name": "vasilis efthymiou", + "tgt_entity_name": "kostas stefanidis", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ], + "source": "Name: vasilis efthymiou\nType: PERSON", + "target": "Name: kostas stefanidis\nType: PERSON" + }, + { + "src_entity_name": "vasilis efthymiou", + "tgt_entity_name": "acm computing surveys", + "relation_name": "", + "weight": 8.0, + "description": "vasilis efthymiou is an author of a paper published in acm computing surveys", + "source_ids": [ + 202 + ], + "source": "Name: vasilis efthymiou\nType: PERSON", + "target": "Name: acm computing surveys\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "vasilis efthymiou", + "tgt_entity_name": "an overview of end to end entity resolution for big data", + "relation_name": "", + "weight": 10.0, + "description": "vasilis efthymiou is an author of the paper titled an overview of end to end entity resolution for big data", + "source_ids": [ + 202 + ], + "source": "Name: vasilis efthymiou\nType: PERSON", 
+ "target": "Name: an overview of end to end entity resolution for big data\nType: BOOK" + }, + { + "src_entity_name": "themis palpanas", + "tgt_entity_name": "george papadakis", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ], + "source": "Name: themis palpanas\nType: PERSON", + "target": "Name: george papadakis\nType: PERSON" + }, + { + "src_entity_name": "themis palpanas", + "tgt_entity_name": "kostas stefanidis", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ], + "source": "Name: themis palpanas\nType: PERSON", + "target": "Name: kostas stefanidis\nType: PERSON" + }, + { + "src_entity_name": "themis palpanas", + "tgt_entity_name": "acm computing surveys", + "relation_name": "", + "weight": 8.0, + "description": "themis palpanas is an author of a paper published in acm computing surveys", + "source_ids": [ + 202 + ], + "source": "Name: themis palpanas\nType: PERSON", + "target": "Name: acm computing surveys\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "themis palpanas", + "tgt_entity_name": "an overview of end to end entity resolution for big data", + "relation_name": "", + "weight": 10.0, + "description": "themis palpanas is an author of the paper titled an overview of end to end entity resolution for big data", + "source_ids": [ + 202 + ], + "source": "Name: themis palpanas\nType: PERSON", + "target": "Name: an overview of end to end entity resolution for big data\nType: BOOK" + }, + { + "src_entity_name": "george papadakis", + "tgt_entity_name": "kostas stefanidis", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ], + "source": "Name: george papadakis\nType: PERSON", + "target": "Name: kostas stefanidis\nType: PERSON" + }, + { + "src_entity_name": "george papadakis", + "tgt_entity_name": "acm computing surveys", + 
"relation_name": "", + "weight": 8.0, + "description": "george papadakis is an author of a paper published in acm computing surveys", + "source_ids": [ + 202 + ], + "source": "Name: george papadakis\nType: PERSON", + "target": "Name: acm computing surveys\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "george papadakis", + "tgt_entity_name": "an overview of end to end entity resolution for big data", + "relation_name": "", + "weight": 10.0, + "description": "george papadakis is an author of the paper titled an overview of end to end entity resolution for big data", + "source_ids": [ + 202 + ], + "source": "Name: george papadakis\nType: PERSON", + "target": "Name: an overview of end to end entity resolution for big data\nType: BOOK" + }, + { + "src_entity_name": "kostas stefanidis", + "tgt_entity_name": "acm computing surveys", + "relation_name": "", + "weight": 8.0, + "description": "kostas stefanidis is an author of a paper published in acm computing surveys", + "source_ids": [ + 202 + ], + "source": "Name: kostas stefanidis\nType: PERSON", + "target": "Name: acm computing surveys\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "kostas stefanidis", + "tgt_entity_name": "an overview of end to end entity resolution for big data", + "relation_name": "", + "weight": 10.0, + "description": "kostas stefanidis is an author of the paper titled an overview of end to end entity resolution for big data", + "source_ids": [ + 202 + ], + "source": "Name: kostas stefanidis\nType: PERSON", + "target": "Name: an overview of end to end entity resolution for big data\nType: BOOK" + }, + { + "src_entity_name": "acm computing surveys", + "tgt_entity_name": "an overview of end to end entity resolution for big data", + "relation_name": "", + "weight": 10.0, + "description": "acm computing surveys is the publication venue for the paper titled an overview of end to end entity resolution for big data", + "source_ids": [ + 202 + ], + "source": "Name: acm computing 
surveys\nType: PUBLICATION_VENUE", + "target": "Name: an overview of end to end entity resolution for big data\nType: BOOK" + }, + { + "src_entity_name": "acm computing surveys", + "tgt_entity_name": "csur", + "relation_name": "", + "weight": 10.0, + "description": "csur is the abbreviation used for the publication venue acm computing surveys", + "source_ids": [ + 202 + ], + "source": "Name: acm computing surveys\nType: PUBLICATION_VENUE", + "target": "Name: csur\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "acm computing surveys", + "tgt_entity_name": "53", + "relation_name": "", + "weight": 9.0, + "description": "the paper was published in volume 53 of acm computing surveys", + "source_ids": [ + 202 + ], + "source": "Name: acm computing surveys\nType: PUBLICATION_VENUE", + "target": "Name: 53\nType: MEASUREMENT" + }, + { + "src_entity_name": "acm computing surveys", + "tgt_entity_name": "1 42", + "relation_name": "", + "weight": 9.0, + "description": "the paper spans pages 1 42 in acm computing surveys", + "source_ids": [ + 202 + ], + "source": "Name: acm computing surveys\nType: PUBLICATION_VENUE", + "target": "Name: 1 42\nType: MEASUREMENT" + }, + { + "src_entity_name": "an overview of end to end entity resolution for big data", + "tgt_entity_name": "end to end entity resolution", + "relation_name": "", + "weight": 10.0, + "description": "the paper title indicates it provides an overview of the task of end to end entity resolution", + "source_ids": [ + 202 + ], + "source": "Name: an overview of end to end entity resolution for big data\nType: BOOK", + "target": "Name: end to end entity resolution\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "an overview of end to end entity resolution for big data", + "tgt_entity_name": "big data", + "relation_name": "", + "weight": 10.0, + "description": "the paper title indicates it discusses the application of entity resolution to big data", + "source_ids": [ + 202 + ], + "source": "Name: an overview of 
end to end entity resolution for big data\nType: BOOK", + "target": "Name: big data\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "end to end entity resolution", + "tgt_entity_name": "big data", + "relation_name": "", + "weight": 8.0, + "description": "the text links the task of end to end entity resolution with the domain of big data", + "source_ids": [ + 202 + ], + "source": "Name: end to end entity resolution\nType: TASK_OR_PROBLEM", + "target": "Name: big data\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "eric bieber", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and eric bieber are co authors on the same paper", + "source_ids": [ + 203 + ], + "source": "Name: gheorghe comanici\nType: PERSON", + "target": "Name: eric bieber\nType: PERSON" + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "mike schaekermann", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and mike schaekermann are co authors on the same paper", + "source_ids": [ + 203 + ], + "source": "Name: gheorghe comanici\nType: PERSON", + "target": "Name: mike schaekermann\nType: PERSON" + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "ice pasupat", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and ice pasupat are co authors on the same paper", + "source_ids": [ + 203 + ], + "source": "Name: gheorghe comanici\nType: PERSON", + "target": "Name: ice pasupat\nType: PERSON" + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "noveen sachdeva", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and noveen sachdeva are co authors on the same paper", + "source_ids": [ + 203 + ], + "source": "Name: gheorghe comanici\nType: PERSON", + "target": "Name: noveen sachdeva\nType: PERSON" + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "inderjit dhillon", + 
"relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and inderjit dhillon are co authors on the same paper", + "source_ids": [ + 203 + ], + "source": "Name: gheorghe comanici\nType: PERSON", + "target": "Name: inderjit dhillon\nType: PERSON" + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "marcel blistein", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and marcel blistein are co authors on the same paper", + "source_ids": [ + 203 + ], + "source": "Name: gheorghe comanici\nType: PERSON", + "target": "Name: marcel blistein\nType: PERSON" + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "ori ram", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and ori ram are co authors on the same paper", + "source_ids": [ + 203 + ], + "source": "Name: gheorghe comanici\nType: PERSON", + "target": "Name: ori ram\nType: PERSON" + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "dan zhang", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and dan zhang are co authors on the same paper", + "source_ids": [ + 203 + ], + "source": "Name: gheorghe comanici\nType: PERSON", + "target": "Name: dan zhang\nType: PERSON" + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "evan rosen", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and evan rosen are co authors on the same paper", + "source_ids": [ + 203 + ], + "source": "Name: gheorghe comanici\nType: PERSON", + "target": "Name: evan rosen\nType: PERSON" + }, + { + "src_entity_name": "arxiv 2507 06261", + "tgt_entity_name": "arxiv preprint", + "relation_name": "", + "weight": 10.0, + "description": "arxiv 2507 06261 is identified as an arxiv preprint", + "source_ids": [ + 203 + ], + "source": "Name: arxiv 2507 06261\nType: FILE_TYPE", + "target": "Name: arxiv preprint\nType: FILE_TYPE" + }, + { + "src_entity_name": "arxiv 
2402 07630", + "tgt_entity_name": "arxiv preprint", + "relation_name": "", + "weight": 9.0, + "description": "arxiv 2402 07630 is an instance of an arxiv preprint", + "source_ids": [ + 211 + ], + "source": "Name: arxiv preprint\nType: FILE_TYPE", + "target": "Name: arxiv 2402 07630\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "g retriever", + "tgt_entity_name": "arxiv preprint", + "relation_name": "", + "weight": 9.0, + "description": "g retriever is published as an arxiv preprint", + "source_ids": [ + 211 + ], + "source": "Name: arxiv preprint\nType: FILE_TYPE", + "target": "Name: g retriever\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "pradeep dasigi", + "tgt_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "relation_name": "", + "weight": 10.0, + "description": "pradeep dasigi is an author of the dataset work", + "source_ids": [ + 204 + ], + "source": "Name: pradeep dasigi\nType: PERSON", + "target": "Name: a dataset of information seeking questions and answers anchored in research papers\nType: PRODUCT" + }, + { + "src_entity_name": "pradeep dasigi", + "tgt_entity_name": "arxiv preprint arxiv 2105 03011", + "relation_name": "", + "weight": 9.0, + "description": "pradeep dasigi is an author of the work published in this venue", + "source_ids": [ + 204 + ], + "source": "Name: pradeep dasigi\nType: PERSON", + "target": "Name: arxiv preprint arxiv 2105 03011\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "kyle lo", + "tgt_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "relation_name": "", + "weight": 10.0, + "description": "kyle lo is an author of the dataset work", + "source_ids": [ + 204 + ], + "source": "Name: kyle lo\nType: PERSON", + "target": "Name: a dataset of information seeking questions and answers anchored in research papers\nType: PRODUCT" + }, + { + "src_entity_name": "kyle lo", + "tgt_entity_name": "arxiv 
preprint arxiv 2105 03011", + "relation_name": "", + "weight": 9.0, + "description": "kyle lo is an author of the work published in this venue", + "source_ids": [ + 204 + ], + "source": "Name: kyle lo\nType: PERSON", + "target": "Name: arxiv preprint arxiv 2105 03011\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "iz beltagy", + "tgt_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "relation_name": "", + "weight": 10.0, + "description": "iz beltagy is an author of the dataset work", + "source_ids": [ + 204 + ], + "source": "Name: iz beltagy\nType: PERSON", + "target": "Name: a dataset of information seeking questions and answers anchored in research papers\nType: PRODUCT" + }, + { + "src_entity_name": "iz beltagy", + "tgt_entity_name": "arxiv preprint arxiv 2105 03011", + "relation_name": "", + "weight": 9.0, + "description": "iz beltagy is an author of the work published in this venue", + "source_ids": [ + 204 + ], + "source": "Name: iz beltagy\nType: PERSON", + "target": "Name: arxiv preprint arxiv 2105 03011\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "arman cohan", + "tgt_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "relation_name": "", + "weight": 10.0, + "description": "arman cohan is an author of the dataset work", + "source_ids": [ + 204 + ], + "source": "Name: arman cohan\nType: PERSON", + "target": "Name: a dataset of information seeking questions and answers anchored in research papers\nType: PRODUCT" + }, + { + "src_entity_name": "arman cohan", + "tgt_entity_name": "arxiv preprint arxiv 2105 03011", + "relation_name": "", + "weight": 9.0, + "description": "arman cohan is an author of the work published in this venue", + "source_ids": [ + 204 + ], + "source": "Name: arman cohan\nType: PERSON", + "target": "Name: arxiv preprint arxiv 2105 03011\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "noah a smith", + 
"tgt_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "relation_name": "", + "weight": 10.0, + "description": "noah a smith is an author of the dataset work", + "source_ids": [ + 204 + ], + "source": "Name: noah a smith\nType: PERSON", + "target": "Name: a dataset of information seeking questions and answers anchored in research papers\nType: PRODUCT" + }, + { + "src_entity_name": "noah a smith", + "tgt_entity_name": "arxiv preprint arxiv 2105 03011", + "relation_name": "", + "weight": 9.0, + "description": "noah a smith is an author of the work published in this venue", + "source_ids": [ + 204 + ], + "source": "Name: noah a smith\nType: PERSON", + "target": "Name: arxiv preprint arxiv 2105 03011\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "matt gardner", + "tgt_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "relation_name": "", + "weight": 10.0, + "description": "matt gardner is an author of the dataset work", + "source_ids": [ + 204 + ], + "source": "Name: matt gardner\nType: PERSON", + "target": "Name: a dataset of information seeking questions and answers anchored in research papers\nType: PRODUCT" + }, + { + "src_entity_name": "matt gardner", + "tgt_entity_name": "arxiv preprint arxiv 2105 03011", + "relation_name": "", + "weight": 9.0, + "description": "matt gardner is an author of the work published in this venue", + "source_ids": [ + 204 + ], + "source": "Name: matt gardner\nType: PERSON", + "target": "Name: arxiv preprint arxiv 2105 03011\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "tgt_entity_name": "arxiv preprint arxiv 2105 03011", + "relation_name": "", + "weight": 10.0, + "description": "the dataset work is published as the arxiv preprint arxiv 2105 03011", + "source_ids": [ + 204 + ], + "source": "Name: a dataset of information 
seeking questions and answers anchored in research papers\nType: PRODUCT", + "target": "Name: arxiv preprint arxiv 2105 03011\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "tgt_entity_name": "2021", + "relation_name": "", + "weight": 10.0, + "description": "the dataset work was published in the year 2021", + "source_ids": [ + 204 + ], + "source": "Name: a dataset of information seeking questions and answers anchored in research papers\nType: PRODUCT", + "target": "Name: 2021\nType: DATE" + }, + { + "src_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "tgt_entity_name": "research papers", + "relation_name": "", + "weight": 10.0, + "description": "the dataset is anchored in research papers meaning it derives its content from them", + "source_ids": [ + 204 + ], + "source": "Name: a dataset of information seeking questions and answers anchored in research papers\nType: PRODUCT", + "target": "Name: research papers\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "tgt_entity_name": "information seeking questions", + "relation_name": "", + "weight": 10.0, + "description": "the dataset consists of information seeking questions", + "source_ids": [ + 204 + ], + "source": "Name: a dataset of information seeking questions and answers anchored in research papers\nType: PRODUCT", + "target": "Name: information seeking questions\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "tgt_entity_name": "answers", + "relation_name": "", + "weight": 10.0, + "description": "the dataset consists of answers corresponding to the questions", + "source_ids": [ + 204 + ], + "source": "Name: a dataset of information seeking questions and answers anchored in 
research papers\nType: PRODUCT", + "target": "Name: answers\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "xavier daull", + "tgt_entity_name": "patrice bellot", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ], + "source": "Name: xavier daull\nType: PERSON", + "target": "Name: patrice bellot\nType: PERSON" + }, + { + "src_entity_name": "xavier daull", + "tgt_entity_name": "emmanuel bruno", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ], + "source": "Name: xavier daull\nType: PERSON", + "target": "Name: emmanuel bruno\nType: PERSON" + }, + { + "src_entity_name": "xavier daull", + "tgt_entity_name": "vincent martin", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ], + "source": "Name: xavier daull\nType: PERSON", + "target": "Name: vincent martin\nType: PERSON" + }, + { + "src_entity_name": "xavier daull", + "tgt_entity_name": "elisabeth murisasco", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ], + "source": "Name: xavier daull\nType: PERSON", + "target": "Name: elisabeth murisasco\nType: PERSON" + }, + { + "src_entity_name": "xavier daull", + "tgt_entity_name": "arxiv preprint arxiv 2302 09051", + "relation_name": "", + "weight": 8.0, + "description": "xavier daull is an author of the work identified by this preprint number", + "source_ids": [ + 205 + ], + "source": "Name: xavier daull\nType: PERSON", + "target": "Name: arxiv preprint arxiv 2302 09051\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "xavier daull", + "tgt_entity_name": "complex qa and language models hybrid architectures survey", + "relation_name": "", + "weight": 10.0, + "description": "xavier daull is the author of this specific 
survey title", + "source_ids": [ + 205 + ], + "source": "Name: xavier daull\nType: PERSON", + "target": "Name: complex qa and language models hybrid architectures survey\nType: BOOK" + }, + { + "src_entity_name": "patrice bellot", + "tgt_entity_name": "emmanuel bruno", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ], + "source": "Name: patrice bellot\nType: PERSON", + "target": "Name: emmanuel bruno\nType: PERSON" + }, + { + "src_entity_name": "patrice bellot", + "tgt_entity_name": "vincent martin", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ], + "source": "Name: patrice bellot\nType: PERSON", + "target": "Name: vincent martin\nType: PERSON" + }, + { + "src_entity_name": "patrice bellot", + "tgt_entity_name": "elisabeth murisasco", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ], + "source": "Name: patrice bellot\nType: PERSON", + "target": "Name: elisabeth murisasco\nType: PERSON" + }, + { + "src_entity_name": "emmanuel bruno", + "tgt_entity_name": "vincent martin", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ], + "source": "Name: emmanuel bruno\nType: PERSON", + "target": "Name: vincent martin\nType: PERSON" + }, + { + "src_entity_name": "emmanuel bruno", + "tgt_entity_name": "elisabeth murisasco", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ], + "source": "Name: emmanuel bruno\nType: PERSON", + "target": "Name: elisabeth murisasco\nType: PERSON" + }, + { + "src_entity_name": "vincent martin", + "tgt_entity_name": "elisabeth murisasco", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on 
the same survey document", + "source_ids": [ + 205 + ], + "source": "Name: vincent martin\nType: PERSON", + "target": "Name: elisabeth murisasco\nType: PERSON" + }, + { + "src_entity_name": "arxiv preprint arxiv 2302 09051", + "tgt_entity_name": "2302 09051", + "relation_name": "", + "weight": 10.0, + "description": "the preprint identifier contains the specific code 2302 09051", + "source_ids": [ + 205 + ], + "source": "Name: arxiv preprint arxiv 2302 09051\nType: PUBLICATION_VENUE", + "target": "Name: 2302 09051\nType: FILE_TYPE" + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "ha trinh", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: darren edge\nType: PERSON", + "target": "Name: ha trinh\nType: PERSON" + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "newman cheng", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: darren edge\nType: PERSON", + "target": "Name: newman cheng\nType: PERSON" + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "joshua bradley", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: darren edge\nType: PERSON", + "target": "Name: joshua bradley\nType: PERSON" + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "alex chao", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: darren edge\nType: PERSON", + "target": "Name: alex chao\nType: PERSON" + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "apurva mody", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: darren 
edge\nType: PERSON", + "target": "Name: apurva mody\nType: PERSON" + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "steven truitt", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: darren edge\nType: PERSON", + "target": "Name: steven truitt\nType: PERSON" + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "jonathan larson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: darren edge\nType: PERSON", + "target": "Name: jonathan larson\nType: PERSON" + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "from local to global a graph rag approach to query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "darren edge is an author of the document titled from local to global a graph rag approach to query focused summarization", + "source_ids": [ + 206 + ], + "source": "Name: darren edge\nType: PERSON", + "target": "Name: from local to global a graph rag approach to query focused summarization\nType: BOOK" + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "newman cheng", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: ha trinh\nType: PERSON", + "target": "Name: newman cheng\nType: PERSON" + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "joshua bradley", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: ha trinh\nType: PERSON", + "target": "Name: joshua bradley\nType: PERSON" + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "alex chao", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 
206 + ], + "source": "Name: ha trinh\nType: PERSON", + "target": "Name: alex chao\nType: PERSON" + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "apurva mody", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: ha trinh\nType: PERSON", + "target": "Name: apurva mody\nType: PERSON" + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "steven truitt", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: ha trinh\nType: PERSON", + "target": "Name: steven truitt\nType: PERSON" + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "jonathan larson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: ha trinh\nType: PERSON", + "target": "Name: jonathan larson\nType: PERSON" + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "from local to global a graph rag approach to query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "ha trinh is an author of the document titled from local to global a graph rag approach to query focused summarization", + "source_ids": [ + 206 + ], + "source": "Name: ha trinh\nType: PERSON", + "target": "Name: from local to global a graph rag approach to query focused summarization\nType: BOOK" + }, + { + "src_entity_name": "newman cheng", + "tgt_entity_name": "joshua bradley", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: newman cheng\nType: PERSON", + "target": "Name: joshua bradley\nType: PERSON" + }, + { + "src_entity_name": "newman cheng", + "tgt_entity_name": "alex chao", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + 
"source_ids": [ + 206 + ], + "source": "Name: newman cheng\nType: PERSON", + "target": "Name: alex chao\nType: PERSON" + }, + { + "src_entity_name": "newman cheng", + "tgt_entity_name": "apurva mody", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: newman cheng\nType: PERSON", + "target": "Name: apurva mody\nType: PERSON" + }, + { + "src_entity_name": "newman cheng", + "tgt_entity_name": "steven truitt", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: newman cheng\nType: PERSON", + "target": "Name: steven truitt\nType: PERSON" + }, + { + "src_entity_name": "newman cheng", + "tgt_entity_name": "jonathan larson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: newman cheng\nType: PERSON", + "target": "Name: jonathan larson\nType: PERSON" + }, + { + "src_entity_name": "newman cheng", + "tgt_entity_name": "from local to global a graph rag approach to query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "newman cheng is an author of the document titled from local to global a graph rag approach to query focused summarization", + "source_ids": [ + 206 + ], + "source": "Name: newman cheng\nType: PERSON", + "target": "Name: from local to global a graph rag approach to query focused summarization\nType: BOOK" + }, + { + "src_entity_name": "joshua bradley", + "tgt_entity_name": "alex chao", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: joshua bradley\nType: PERSON", + "target": "Name: alex chao\nType: PERSON" + }, + { + "src_entity_name": "joshua bradley", + "tgt_entity_name": "apurva mody", + "relation_name": "", + "weight": 9.0, + 
"description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: joshua bradley\nType: PERSON", + "target": "Name: apurva mody\nType: PERSON" + }, + { + "src_entity_name": "joshua bradley", + "tgt_entity_name": "steven truitt", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: joshua bradley\nType: PERSON", + "target": "Name: steven truitt\nType: PERSON" + }, + { + "src_entity_name": "joshua bradley", + "tgt_entity_name": "jonathan larson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: joshua bradley\nType: PERSON", + "target": "Name: jonathan larson\nType: PERSON" + }, + { + "src_entity_name": "joshua bradley", + "tgt_entity_name": "from local to global a graph rag approach to query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "joshua bradley is an author of the document titled from local to global a graph rag approach to query focused summarization", + "source_ids": [ + 206 + ], + "source": "Name: joshua bradley\nType: PERSON", + "target": "Name: from local to global a graph rag approach to query focused summarization\nType: BOOK" + }, + { + "src_entity_name": "alex chao", + "tgt_entity_name": "apurva mody", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: alex chao\nType: PERSON", + "target": "Name: apurva mody\nType: PERSON" + }, + { + "src_entity_name": "alex chao", + "tgt_entity_name": "steven truitt", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: alex chao\nType: PERSON", + "target": "Name: steven truitt\nType: PERSON" + }, + { + "src_entity_name": "alex chao", + 
"tgt_entity_name": "jonathan larson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: alex chao\nType: PERSON", + "target": "Name: jonathan larson\nType: PERSON" + }, + { + "src_entity_name": "alex chao", + "tgt_entity_name": "from local to global a graph rag approach to query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "alex chao is an author of the document titled from local to global a graph rag approach to query focused summarization", + "source_ids": [ + 206 + ], + "source": "Name: alex chao\nType: PERSON", + "target": "Name: from local to global a graph rag approach to query focused summarization\nType: BOOK" + }, + { + "src_entity_name": "apurva mody", + "tgt_entity_name": "steven truitt", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: apurva mody\nType: PERSON", + "target": "Name: steven truitt\nType: PERSON" + }, + { + "src_entity_name": "apurva mody", + "tgt_entity_name": "jonathan larson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: apurva mody\nType: PERSON", + "target": "Name: jonathan larson\nType: PERSON" + }, + { + "src_entity_name": "apurva mody", + "tgt_entity_name": "from local to global a graph rag approach to query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "apurva mody is an author of the document titled from local to global a graph rag approach to query focused summarization", + "source_ids": [ + 206 + ], + "source": "Name: apurva mody\nType: PERSON", + "target": "Name: from local to global a graph rag approach to query focused summarization\nType: BOOK" + }, + { + "src_entity_name": "steven truitt", + "tgt_entity_name": "jonathan larson", + "relation_name": "", 
+ "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ], + "source": "Name: steven truitt\nType: PERSON", + "target": "Name: jonathan larson\nType: PERSON" + }, + { + "src_entity_name": "steven truitt", + "tgt_entity_name": "from local to global a graph rag approach to query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "steven truitt is an author of the document titled from local to global a graph rag approach to query focused summarization", + "source_ids": [ + 206 + ], + "source": "Name: steven truitt\nType: PERSON", + "target": "Name: from local to global a graph rag approach to query focused summarization\nType: BOOK" + }, + { + "src_entity_name": "jonathan larson", + "tgt_entity_name": "from local to global a graph rag approach to query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "jonathan larson is an author of the document titled from local to global a graph rag approach to query focused summarization", + "source_ids": [ + 206 + ], + "source": "Name: jonathan larson\nType: PERSON", + "target": "Name: from local to global a graph rag approach to query focused summarization\nType: BOOK" + }, + { + "src_entity_name": "from local to global a graph rag approach to query focused summarization", + "tgt_entity_name": "arxiv 2404 16130", + "relation_name": "", + "weight": 10.0, + "description": "the document from local to global a graph rag approach to query focused summarization is identified by the preprint number arxiv 2404 16130", + "source_ids": [ + 206 + ], + "source": "Name: from local to global a graph rag approach to query focused summarization\nType: BOOK", + "target": "Name: arxiv 2404 16130\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "from local to global a graph rag approach to query focused summarization", + "tgt_entity_name": "graph rag", + "relation_name": "", + "weight": 10.0, + "description": "the paper title 
explicitly names graph rag as the core approach discussed", + "source_ids": [ + 206 + ], + "source": "Name: from local to global a graph rag approach to query focused summarization\nType: BOOK", + "target": "Name: graph rag\nType: TECHNOLOGY" + }, + { + "src_entity_name": "from local to global a graph rag approach to query focused summarization", + "tgt_entity_name": "query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "the paper title explicitly names query focused summarization as the target task", + "source_ids": [ + 206 + ], + "source": "Name: from local to global a graph rag approach to query focused summarization\nType: BOOK", + "target": "Name: query focused summarization\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "graph rag", + "tgt_entity_name": "query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "graph rag is the approach used to solve the task of query focused summarization", + "source_ids": [ + 206 + ], + "source": "Name: graph rag\nType: TECHNOLOGY", + "target": "Name: query focused summarization\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "graph rag", + "tgt_entity_name": "local", + "relation_name": "", + "weight": 7.0, + "description": "the graph rag approach is described as a transition from local to global implying it handles local data", + "source_ids": [ + 206 + ], + "source": "Name: graph rag\nType: TECHNOLOGY", + "target": "Name: local\nType: CONCEPT" + }, + { + "src_entity_name": "graph rag", + "tgt_entity_name": "global", + "relation_name": "", + "weight": 7.0, + "description": "the graph rag approach is described as a transition from local to global implying it handles global data", + "source_ids": [ + 206 + ], + "source": "Name: graph rag\nType: TECHNOLOGY", + "target": "Name: global\nType: CONCEPT" + }, + { + "src_entity_name": "example", + "tgt_entity_name": "global", + "relation_name": "", + "weight": 9.0, + "description": "the example task is 
defined by the global process of filtering for tables", + "source_ids": [ + 251 + ], + "source": "Name: global\nType: CONCEPT", + "target": "Name: example\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "yunfan gao is an author of the survey", + "source_ids": [ + 207 + ], + "source": "Name: yunfan gao\nType: PERSON", + "target": "Name: retrieval augmented generation for large language models a survey\nType: BOOK" + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "yun xiong", + "relation_name": "", + "weight": 8.0, + "description": "yunfan gao and yun xiong are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yunfan gao\nType: PERSON", + "target": "Name: yun xiong\nType: PERSON" + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "xinyu gao", + "relation_name": "", + "weight": 8.0, + "description": "yunfan gao and xinyu gao are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yunfan gao\nType: PERSON", + "target": "Name: xinyu gao\nType: PERSON" + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "kangxiang jia", + "relation_name": "", + "weight": 8.0, + "description": "yunfan gao and kangxiang jia are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yunfan gao\nType: PERSON", + "target": "Name: kangxiang jia\nType: PERSON" + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "jinliu pan", + "relation_name": "", + "weight": 8.0, + "description": "yunfan gao and jinliu pan are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yunfan gao\nType: PERSON", + "target": "Name: jinliu pan\nType: PERSON" + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "yuxi bi", + "relation_name": "", + "weight": 8.0, + "description": 
"yunfan gao and yuxi bi are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yunfan gao\nType: PERSON", + "target": "Name: yuxi bi\nType: PERSON" + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "yi dai", + "relation_name": "", + "weight": 8.0, + "description": "yunfan gao and yi dai are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yunfan gao\nType: PERSON", + "target": "Name: yi dai\nType: PERSON" + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "jiawei sun", + "relation_name": "", + "weight": 8.0, + "description": "yunfan gao and jiawei sun are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yunfan gao\nType: PERSON", + "target": "Name: jiawei sun\nType: PERSON" + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "haofen wang", + "relation_name": "", + "weight": 8.0, + "description": "yunfan gao and haofen wang are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yunfan gao\nType: PERSON", + "target": "Name: haofen wang\nType: PERSON" + }, + { + "src_entity_name": "yun xiong", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "yun xiong is an author of the survey", + "source_ids": [ + 207 + ], + "source": "Name: yun xiong\nType: PERSON", + "target": "Name: retrieval augmented generation for large language models a survey\nType: BOOK" + }, + { + "src_entity_name": "yun xiong", + "tgt_entity_name": "xinyu gao", + "relation_name": "", + "weight": 8.0, + "description": "yun xiong and xinyu gao are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yun xiong\nType: PERSON", + "target": "Name: xinyu gao\nType: PERSON" + }, + { + "src_entity_name": "yun xiong", + "tgt_entity_name": "kangxiang jia", + "relation_name": "", + "weight": 8.0, + "description": "yun xiong and 
kangxiang jia are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yun xiong\nType: PERSON", + "target": "Name: kangxiang jia\nType: PERSON" + }, + { + "src_entity_name": "yun xiong", + "tgt_entity_name": "jinliu pan", + "relation_name": "", + "weight": 8.0, + "description": "yun xiong and jinliu pan are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yun xiong\nType: PERSON", + "target": "Name: jinliu pan\nType: PERSON" + }, + { + "src_entity_name": "yun xiong", + "tgt_entity_name": "yuxi bi", + "relation_name": "", + "weight": 8.0, + "description": "yun xiong and yuxi bi are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yun xiong\nType: PERSON", + "target": "Name: yuxi bi\nType: PERSON" + }, + { + "src_entity_name": "yun xiong", + "tgt_entity_name": "yi dai", + "relation_name": "", + "weight": 8.0, + "description": "yun xiong and yi dai are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yun xiong\nType: PERSON", + "target": "Name: yi dai\nType: PERSON" + }, + { + "src_entity_name": "yun xiong", + "tgt_entity_name": "jiawei sun", + "relation_name": "", + "weight": 8.0, + "description": "yun xiong and jiawei sun are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yun xiong\nType: PERSON", + "target": "Name: jiawei sun\nType: PERSON" + }, + { + "src_entity_name": "yun xiong", + "tgt_entity_name": "haofen wang", + "relation_name": "", + "weight": 8.0, + "description": "yun xiong and haofen wang are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yun xiong\nType: PERSON", + "target": "Name: haofen wang\nType: PERSON" + }, + { + "src_entity_name": "xinyu gao", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "xinyu gao is an author of the survey", + "source_ids": [ + 207 + ], + 
"source": "Name: xinyu gao\nType: PERSON", + "target": "Name: retrieval augmented generation for large language models a survey\nType: BOOK" + }, + { + "src_entity_name": "xinyu gao", + "tgt_entity_name": "kangxiang jia", + "relation_name": "", + "weight": 8.0, + "description": "xinyu gao and kangxiang jia are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: xinyu gao\nType: PERSON", + "target": "Name: kangxiang jia\nType: PERSON" + }, + { + "src_entity_name": "xinyu gao", + "tgt_entity_name": "jinliu pan", + "relation_name": "", + "weight": 8.0, + "description": "xinyu gao and jinliu pan are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: xinyu gao\nType: PERSON", + "target": "Name: jinliu pan\nType: PERSON" + }, + { + "src_entity_name": "xinyu gao", + "tgt_entity_name": "yuxi bi", + "relation_name": "", + "weight": 8.0, + "description": "xinyu gao and yuxi bi are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: xinyu gao\nType: PERSON", + "target": "Name: yuxi bi\nType: PERSON" + }, + { + "src_entity_name": "xinyu gao", + "tgt_entity_name": "yi dai", + "relation_name": "", + "weight": 8.0, + "description": "xinyu gao and yi dai are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: xinyu gao\nType: PERSON", + "target": "Name: yi dai\nType: PERSON" + }, + { + "src_entity_name": "xinyu gao", + "tgt_entity_name": "jiawei sun", + "relation_name": "", + "weight": 8.0, + "description": "xinyu gao and jiawei sun are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: xinyu gao\nType: PERSON", + "target": "Name: jiawei sun\nType: PERSON" + }, + { + "src_entity_name": "xinyu gao", + "tgt_entity_name": "haofen wang", + "relation_name": "", + "weight": 8.0, + "description": "xinyu gao and haofen wang are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: xinyu gao\nType: PERSON", + "target": 
"Name: haofen wang\nType: PERSON" + }, + { + "src_entity_name": "kangxiang jia", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "kangxiang jia is an author of the survey", + "source_ids": [ + 207 + ], + "source": "Name: kangxiang jia\nType: PERSON", + "target": "Name: retrieval augmented generation for large language models a survey\nType: BOOK" + }, + { + "src_entity_name": "kangxiang jia", + "tgt_entity_name": "jinliu pan", + "relation_name": "", + "weight": 8.0, + "description": "kangxiang jia and jinliu pan are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: kangxiang jia\nType: PERSON", + "target": "Name: jinliu pan\nType: PERSON" + }, + { + "src_entity_name": "kangxiang jia", + "tgt_entity_name": "yuxi bi", + "relation_name": "", + "weight": 8.0, + "description": "kangxiang jia and yuxi bi are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: kangxiang jia\nType: PERSON", + "target": "Name: yuxi bi\nType: PERSON" + }, + { + "src_entity_name": "kangxiang jia", + "tgt_entity_name": "yi dai", + "relation_name": "", + "weight": 8.0, + "description": "kangxiang jia and yi dai are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: kangxiang jia\nType: PERSON", + "target": "Name: yi dai\nType: PERSON" + }, + { + "src_entity_name": "kangxiang jia", + "tgt_entity_name": "jiawei sun", + "relation_name": "", + "weight": 8.0, + "description": "kangxiang jia and jiawei sun are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: kangxiang jia\nType: PERSON", + "target": "Name: jiawei sun\nType: PERSON" + }, + { + "src_entity_name": "kangxiang jia", + "tgt_entity_name": "haofen wang", + "relation_name": "", + "weight": 8.0, + "description": "kangxiang jia and haofen wang are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: 
kangxiang jia\nType: PERSON", + "target": "Name: haofen wang\nType: PERSON" + }, + { + "src_entity_name": "jinliu pan", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "jinliu pan is an author of the survey", + "source_ids": [ + 207 + ], + "source": "Name: jinliu pan\nType: PERSON", + "target": "Name: retrieval augmented generation for large language models a survey\nType: BOOK" + }, + { + "src_entity_name": "jinliu pan", + "tgt_entity_name": "yuxi bi", + "relation_name": "", + "weight": 8.0, + "description": "jinliu pan and yuxi bi are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: jinliu pan\nType: PERSON", + "target": "Name: yuxi bi\nType: PERSON" + }, + { + "src_entity_name": "jinliu pan", + "tgt_entity_name": "yi dai", + "relation_name": "", + "weight": 8.0, + "description": "jinliu pan and yi dai are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: jinliu pan\nType: PERSON", + "target": "Name: yi dai\nType: PERSON" + }, + { + "src_entity_name": "jinliu pan", + "tgt_entity_name": "jiawei sun", + "relation_name": "", + "weight": 8.0, + "description": "jinliu pan and jiawei sun are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: jinliu pan\nType: PERSON", + "target": "Name: jiawei sun\nType: PERSON" + }, + { + "src_entity_name": "jinliu pan", + "tgt_entity_name": "haofen wang", + "relation_name": "", + "weight": 8.0, + "description": "jinliu pan and haofen wang are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: jinliu pan\nType: PERSON", + "target": "Name: haofen wang\nType: PERSON" + }, + { + "src_entity_name": "yuxi bi", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "yuxi bi is an author of the survey", + "source_ids": [ + 207 + ], + 
"source": "Name: yuxi bi\nType: PERSON", + "target": "Name: retrieval augmented generation for large language models a survey\nType: BOOK" + }, + { + "src_entity_name": "yuxi bi", + "tgt_entity_name": "yi dai", + "relation_name": "", + "weight": 8.0, + "description": "yuxi bi and yi dai are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yuxi bi\nType: PERSON", + "target": "Name: yi dai\nType: PERSON" + }, + { + "src_entity_name": "yuxi bi", + "tgt_entity_name": "jiawei sun", + "relation_name": "", + "weight": 8.0, + "description": "yuxi bi and jiawei sun are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yuxi bi\nType: PERSON", + "target": "Name: jiawei sun\nType: PERSON" + }, + { + "src_entity_name": "yuxi bi", + "tgt_entity_name": "haofen wang", + "relation_name": "", + "weight": 8.0, + "description": "yuxi bi and haofen wang are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yuxi bi\nType: PERSON", + "target": "Name: haofen wang\nType: PERSON" + }, + { + "src_entity_name": "yi dai", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "yi dai is an author of the survey", + "source_ids": [ + 207 + ], + "source": "Name: yi dai\nType: PERSON", + "target": "Name: retrieval augmented generation for large language models a survey\nType: BOOK" + }, + { + "src_entity_name": "yi dai", + "tgt_entity_name": "jiawei sun", + "relation_name": "", + "weight": 8.0, + "description": "yi dai and jiawei sun are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: yi dai\nType: PERSON", + "target": "Name: jiawei sun\nType: PERSON" + }, + { + "src_entity_name": "yi dai", + "tgt_entity_name": "haofen wang", + "relation_name": "", + "weight": 8.0, + "description": "yi dai and haofen wang are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": 
"Name: yi dai\nType: PERSON", + "target": "Name: haofen wang\nType: PERSON" + }, + { + "src_entity_name": "jiawei sun", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "jiawei sun is an author of the survey", + "source_ids": [ + 207 + ], + "source": "Name: jiawei sun\nType: PERSON", + "target": "Name: retrieval augmented generation for large language models a survey\nType: BOOK" + }, + { + "src_entity_name": "jiawei sun", + "tgt_entity_name": "haofen wang", + "relation_name": "", + "weight": 8.0, + "description": "jiawei sun and haofen wang are co authors of the same survey", + "source_ids": [ + 207 + ], + "source": "Name: jiawei sun\nType: PERSON", + "target": "Name: haofen wang\nType: PERSON" + }, + { + "src_entity_name": "haofen wang", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "haofen wang is an author of the survey", + "source_ids": [ + 207 + ], + "source": "Name: haofen wang\nType: PERSON", + "target": "Name: retrieval augmented generation for large language models a survey\nType: BOOK" + }, + { + "src_entity_name": "retrieval augmented generation for large language models a survey", + "tgt_entity_name": "arxiv preprint arxiv 2312 10997", + "relation_name": "", + "weight": 10.0, + "description": "the survey is identified as the arxiv preprint with the number 2312 10997", + "source_ids": [ + 207 + ], + "source": "Name: retrieval augmented generation for large language models a survey\nType: BOOK", + "target": "Name: arxiv preprint arxiv 2312 10997\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "arxiv preprint arxiv 2312 10997", + "tgt_entity_name": "2312 10997", + "relation_name": "", + "weight": 10.0, + "description": "2312 10997 is the specific identifier for the arxiv preprint", + "source_ids": [ + 207 + ], + "source": "Name: arxiv preprint arxiv 
2312 10997\nType: PUBLICATION_VENUE", + "target": "Name: 2312 10997\nType: FILE_TYPE" + }, + { + "src_entity_name": "zirui guo", + "tgt_entity_name": "lightrag", + "relation_name": "", + "weight": 10.0, + "description": "zirui guo is an author of the lightrag paper", + "source_ids": [ + 208 + ], + "source": "Name: zirui guo\nType: PERSON", + "target": "Name: lightrag\nType: PRODUCT" + }, + { + "src_entity_name": "zirui guo", + "tgt_entity_name": "lianghao xia", + "relation_name": "", + "weight": 8.0, + "description": "zirui guo and lianghao xia are co authors on the same paper", + "source_ids": [ + 208 + ], + "source": "Name: zirui guo\nType: PERSON", + "target": "Name: lianghao xia\nType: PERSON" + }, + { + "src_entity_name": "zirui guo", + "tgt_entity_name": "tu ao", + "relation_name": "", + "weight": 8.0, + "description": "zirui guo and tu ao are co authors on the same paper", + "source_ids": [ + 208 + ], + "source": "Name: zirui guo\nType: PERSON", + "target": "Name: tu ao\nType: PERSON" + }, + { + "src_entity_name": "zirui guo", + "tgt_entity_name": "chao huang", + "relation_name": "", + "weight": 8.0, + "description": "zirui guo and chao huang are co authors on the same paper", + "source_ids": [ + 208 + ], + "source": "Name: zirui guo\nType: PERSON", + "target": "Name: chao huang\nType: PERSON" + }, + { + "src_entity_name": "lianghao xia", + "tgt_entity_name": "lightrag", + "relation_name": "", + "weight": 10.0, + "description": "lianghao xia is an author of the lightrag paper", + "source_ids": [ + 208 + ], + "source": "Name: lianghao xia\nType: PERSON", + "target": "Name: lightrag\nType: PRODUCT" + }, + { + "src_entity_name": "lianghao xia", + "tgt_entity_name": "yanhua yu", + "relation_name": "", + "weight": 8.0, + "description": "lianghao xia and yanhua yu are co authors on the same paper", + "source_ids": [ + 208 + ], + "source": "Name: lianghao xia\nType: PERSON", + "target": "Name: yanhua yu\nType: PERSON" + }, + { + "src_entity_name": "lianghao xia", + 
"tgt_entity_name": "tu ao", + "relation_name": "", + "weight": 8.0, + "description": "lianghao xia and tu ao are co authors on the same paper", + "source_ids": [ + 208 + ], + "source": "Name: lianghao xia\nType: PERSON", + "target": "Name: tu ao\nType: PERSON" + }, + { + "src_entity_name": "lianghao xia", + "tgt_entity_name": "chao huang", + "relation_name": "", + "weight": 8.0, + "description": "lianghao xia and chao huang are co authors on the same paper", + "source_ids": [ + 208 + ], + "source": "Name: lianghao xia\nType: PERSON", + "target": "Name: chao huang\nType: PERSON" + }, + { + "src_entity_name": "yanhua yu", + "tgt_entity_name": "lightrag", + "relation_name": "", + "weight": 10.0, + "description": "yanhua yu is an author of the lightrag paper", + "source_ids": [ + 208 + ], + "source": "Name: yanhua yu\nType: PERSON", + "target": "Name: lightrag\nType: PRODUCT" + }, + { + "src_entity_name": "yanhua yu", + "tgt_entity_name": "tu ao", + "relation_name": "", + "weight": 8.0, + "description": "yanhua yu and tu ao are co authors on the same paper", + "source_ids": [ + 208 + ], + "source": "Name: yanhua yu\nType: PERSON", + "target": "Name: tu ao\nType: PERSON" + }, + { + "src_entity_name": "yanhua yu", + "tgt_entity_name": "chao huang", + "relation_name": "", + "weight": 8.0, + "description": "yanhua yu and chao huang are co authors on the same paper", + "source_ids": [ + 208 + ], + "source": "Name: yanhua yu\nType: PERSON", + "target": "Name: chao huang\nType: PERSON" + }, + { + "src_entity_name": "tu ao", + "tgt_entity_name": "lightrag", + "relation_name": "", + "weight": 10.0, + "description": "tu ao is an author of the lightrag paper", + "source_ids": [ + 208 + ], + "source": "Name: tu ao\nType: PERSON", + "target": "Name: lightrag\nType: PRODUCT" + }, + { + "src_entity_name": "tu ao", + "tgt_entity_name": "chao huang", + "relation_name": "", + "weight": 8.0, + "description": "tu ao and chao huang are co authors on the same paper", + "source_ids": [ + 208 
+ ], + "source": "Name: tu ao\nType: PERSON", + "target": "Name: chao huang\nType: PERSON" + }, + { + "src_entity_name": "chao huang", + "tgt_entity_name": "lightrag", + "relation_name": "", + "weight": 10.0, + "description": "chao huang is an author of the lightrag paper", + "source_ids": [ + 208 + ], + "source": "Name: chao huang\nType: PERSON", + "target": "Name: lightrag\nType: PRODUCT" + }, + { + "src_entity_name": "lightrag", + "tgt_entity_name": "arxiv e prints", + "relation_name": "", + "weight": 9.0, + "description": "lightrag was published in arxiv e prints", + "source_ids": [ + 208 + ], + "source": "Name: lightrag\nType: PRODUCT", + "target": "Name: arxiv e prints\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "lightrag", + "tgt_entity_name": "arxiv2410", + "relation_name": "", + "weight": 9.0, + "description": "lightrag is identified by the arxiv identifier arxiv2410", + "source_ids": [ + 208 + ], + "source": "Name: lightrag\nType: PRODUCT", + "target": "Name: arxiv2410\nType: FILE_TYPE" + }, + { + "src_entity_name": "lightrag", + "tgt_entity_name": "simple", + "relation_name": "", + "weight": 8.0, + "description": "lightrag is described as being simple", + "source_ids": [ + 208 + ], + "source": "Name: lightrag\nType: PRODUCT", + "target": "Name: simple\nType: CONCEPT" + }, + { + "src_entity_name": "lightrag", + "tgt_entity_name": "fast", + "relation_name": "", + "weight": 8.0, + "description": "lightrag is described as being fast", + "source_ids": [ + 208 + ], + "source": "Name: lightrag\nType: PRODUCT", + "target": "Name: fast\nType: CONCEPT" + }, + { + "src_entity_name": "bernal jim nez guti rrez", + "tgt_entity_name": "hipporag", + "relation_name": "", + "weight": 9.0, + "description": "bernal jim nez guti rrez is an author of the paper describing hipporag", + "source_ids": [ + 209 + ], + "source": "Name: bernal jim nez guti rrez\nType: PERSON", + "target": "Name: hipporag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "bernal 
jim nez guti rrez", + "tgt_entity_name": "yiheng shu", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ], + "source": "Name: bernal jim nez guti rrez\nType: PERSON", + "target": "Name: yiheng shu\nType: PERSON" + }, + { + "src_entity_name": "bernal jim nez guti rrez", + "tgt_entity_name": "yu gu", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ], + "source": "Name: bernal jim nez guti rrez\nType: PERSON", + "target": "Name: yu gu\nType: PERSON" + }, + { + "src_entity_name": "bernal jim nez guti rrez", + "tgt_entity_name": "michihiro yasunaga", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ], + "source": "Name: bernal jim nez guti rrez\nType: PERSON", + "target": "Name: michihiro yasunaga\nType: PERSON" + }, + { + "src_entity_name": "bernal jim nez guti rrez", + "tgt_entity_name": "yu su", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ], + "source": "Name: bernal jim nez guti rrez\nType: PERSON", + "target": "Name: yu su\nType: PERSON" + }, + { + "src_entity_name": "yiheng shu", + "tgt_entity_name": "hipporag", + "relation_name": "", + "weight": 9.0, + "description": "yiheng shu is an author of the paper describing hipporag", + "source_ids": [ + 209 + ], + "source": "Name: yiheng shu\nType: PERSON", + "target": "Name: hipporag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "yiheng shu", + "tgt_entity_name": "yu gu", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ], + "source": "Name: yiheng shu\nType: PERSON", + "target": "Name: yu gu\nType: PERSON" + }, + { + "src_entity_name": "yiheng shu", + "tgt_entity_name": "michihiro yasunaga", + "relation_name": 
"", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ], + "source": "Name: yiheng shu\nType: PERSON", + "target": "Name: michihiro yasunaga\nType: PERSON" + }, + { + "src_entity_name": "yiheng shu", + "tgt_entity_name": "yu su", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ], + "source": "Name: yiheng shu\nType: PERSON", + "target": "Name: yu su\nType: PERSON" + }, + { + "src_entity_name": "yu gu", + "tgt_entity_name": "hipporag", + "relation_name": "", + "weight": 9.0, + "description": "yu gu is an author of the paper describing hipporag", + "source_ids": [ + 209 + ], + "source": "Name: yu gu\nType: PERSON", + "target": "Name: hipporag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "yu gu", + "tgt_entity_name": "michihiro yasunaga", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ], + "source": "Name: yu gu\nType: PERSON", + "target": "Name: michihiro yasunaga\nType: PERSON" + }, + { + "src_entity_name": "yu gu", + "tgt_entity_name": "yu su", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ], + "source": "Name: yu gu\nType: PERSON", + "target": "Name: yu su\nType: PERSON" + }, + { + "src_entity_name": "michihiro yasunaga", + "tgt_entity_name": "hipporag", + "relation_name": "", + "weight": 9.0, + "description": "michihiro yasunaga is an author of the paper describing hipporag", + "source_ids": [ + 209 + ], + "source": "Name: michihiro yasunaga\nType: PERSON", + "target": "Name: hipporag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "michihiro yasunaga", + "tgt_entity_name": "yu su", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ], + "source": "Name: michihiro 
yasunaga\nType: PERSON", + "target": "Name: yu su\nType: PERSON" + }, + { + "src_entity_name": "yu su", + "tgt_entity_name": "hipporag", + "relation_name": "", + "weight": 9.0, + "description": "yu su is an author of the paper describing hipporag", + "source_ids": [ + 209 + ], + "source": "Name: yu su\nType: PERSON", + "target": "Name: hipporag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "hipporag", + "tgt_entity_name": "large language models", + "relation_name": "", + "weight": 10.0, + "description": "hipporag is explicitly designed to provide long term memory capabilities for large language models", + "source_ids": [ + 209 + ], + "source": "Name: hipporag\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: large language models\nType: PRODUCT" + }, + { + "src_entity_name": "hipporag", + "tgt_entity_name": "neurobiologically inspired long term memory", + "relation_name": "", + "weight": 10.0, + "description": "hipporag is defined as a system for neurobiologically inspired long term memory", + "source_ids": [ + 209 + ], + "source": "Name: hipporag\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: neurobiologically inspired long term memory\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "taher h haveliwala", + "tgt_entity_name": "topic sensitive pagerank", + "relation_name": "", + "weight": 10.0, + "description": "taher h haveliwala is the author of the paper topic sensitive pagerank", + "source_ids": [ + 210 + ], + "source": "Name: taher h haveliwala\nType: PERSON", + "target": "Name: topic sensitive pagerank\nType: TECHNOLOGY" + }, + { + "src_entity_name": "taher h haveliwala", + "tgt_entity_name": "2002", + "relation_name": "", + "weight": 8.0, + "description": "taher h haveliwala published the paper in the year 2002", + "source_ids": [ + 210 + ], + "source": "Name: taher h haveliwala\nType: PERSON", + "target": "Name: 2002\nType: DATE" + }, + { + "src_entity_name": "topic sensitive pagerank", + "tgt_entity_name": "11th international 
conference on world wide web", + "relation_name": "", + "weight": 9.0, + "description": "the paper topic sensitive pagerank was presented at the 11th international conference on world wide web", + "source_ids": [ + 210 + ], + "source": "Name: topic sensitive pagerank\nType: TECHNOLOGY", + "target": "Name: 11th international conference on world wide web\nType: EVENT" + }, + { + "src_entity_name": "topic sensitive pagerank", + "tgt_entity_name": "517 526", + "relation_name": "", + "weight": 8.0, + "description": "the paper topic sensitive pagerank spans pages 517 to 526 in the proceedings", + "source_ids": [ + 210 + ], + "source": "Name: topic sensitive pagerank\nType: TECHNOLOGY", + "target": "Name: 517 526\nType: MEASUREMENT" + }, + { + "src_entity_name": "11th international conference on world wide web", + "tgt_entity_name": "world wide web", + "relation_name": "", + "weight": 9.0, + "description": "the conference is named after and focused on the world wide web technology", + "source_ids": [ + 210 + ], + "source": "Name: 11th international conference on world wide web\nType: EVENT", + "target": "Name: world wide web\nType: TECHNOLOGY" + }, + { + "src_entity_name": "xiaoxin he", + "tgt_entity_name": "yijun tian", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: xiaoxin he\nType: PERSON", + "target": "Name: yijun tian\nType: PERSON" + }, + { + "src_entity_name": "xiaoxin he", + "tgt_entity_name": "yifei sun", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: xiaoxin he\nType: PERSON", + "target": "Name: yifei sun\nType: PERSON" + }, + { + "src_entity_name": "xiaoxin he", + "tgt_entity_name": "nitesh v chawla", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: xiaoxin he\nType: 
PERSON", + "target": "Name: nitesh v chawla\nType: PERSON" + }, + { + "src_entity_name": "xiaoxin he", + "tgt_entity_name": "thomas laurent", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: xiaoxin he\nType: PERSON", + "target": "Name: thomas laurent\nType: PERSON" + }, + { + "src_entity_name": "xiaoxin he", + "tgt_entity_name": "yann lecun", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: xiaoxin he\nType: PERSON", + "target": "Name: yann lecun\nType: PERSON" + }, + { + "src_entity_name": "xiaoxin he", + "tgt_entity_name": "xavier bresson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: xiaoxin he\nType: PERSON", + "target": "Name: xavier bresson\nType: PERSON" + }, + { + "src_entity_name": "xiaoxin he", + "tgt_entity_name": "bryan hooi", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: xiaoxin he\nType: PERSON", + "target": "Name: bryan hooi\nType: PERSON" + }, + { + "src_entity_name": "xiaoxin he", + "tgt_entity_name": "g retriever", + "relation_name": "", + "weight": 10.0, + "description": "xiaoxin he is an author of the paper describing g retriever", + "source_ids": [ + 211 + ], + "source": "Name: xiaoxin he\nType: PERSON", + "target": "Name: g retriever\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "yijun tian", + "tgt_entity_name": "yifei sun", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: yijun tian\nType: PERSON", + "target": "Name: yifei sun\nType: PERSON" + }, + { + "src_entity_name": "yijun tian", + "tgt_entity_name": "nitesh v chawla", + 
"relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: yijun tian\nType: PERSON", + "target": "Name: nitesh v chawla\nType: PERSON" + }, + { + "src_entity_name": "yijun tian", + "tgt_entity_name": "thomas laurent", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: yijun tian\nType: PERSON", + "target": "Name: thomas laurent\nType: PERSON" + }, + { + "src_entity_name": "yijun tian", + "tgt_entity_name": "yann lecun", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: yijun tian\nType: PERSON", + "target": "Name: yann lecun\nType: PERSON" + }, + { + "src_entity_name": "yijun tian", + "tgt_entity_name": "xavier bresson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: yijun tian\nType: PERSON", + "target": "Name: xavier bresson\nType: PERSON" + }, + { + "src_entity_name": "yijun tian", + "tgt_entity_name": "bryan hooi", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: yijun tian\nType: PERSON", + "target": "Name: bryan hooi\nType: PERSON" + }, + { + "src_entity_name": "yijun tian", + "tgt_entity_name": "g retriever", + "relation_name": "", + "weight": 10.0, + "description": "yijun tian is an author of the paper describing g retriever", + "source_ids": [ + 211 + ], + "source": "Name: yijun tian\nType: PERSON", + "target": "Name: g retriever\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "yifei sun", + "tgt_entity_name": "nitesh v chawla", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: 
yifei sun\nType: PERSON", + "target": "Name: nitesh v chawla\nType: PERSON" + }, + { + "src_entity_name": "yifei sun", + "tgt_entity_name": "thomas laurent", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: yifei sun\nType: PERSON", + "target": "Name: thomas laurent\nType: PERSON" + }, + { + "src_entity_name": "yifei sun", + "tgt_entity_name": "yann lecun", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: yifei sun\nType: PERSON", + "target": "Name: yann lecun\nType: PERSON" + }, + { + "src_entity_name": "yifei sun", + "tgt_entity_name": "xavier bresson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: yifei sun\nType: PERSON", + "target": "Name: xavier bresson\nType: PERSON" + }, + { + "src_entity_name": "yifei sun", + "tgt_entity_name": "bryan hooi", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: yifei sun\nType: PERSON", + "target": "Name: bryan hooi\nType: PERSON" + }, + { + "src_entity_name": "yifei sun", + "tgt_entity_name": "g retriever", + "relation_name": "", + "weight": 10.0, + "description": "yifei sun is an author of the paper describing g retriever", + "source_ids": [ + 211 + ], + "source": "Name: yifei sun\nType: PERSON", + "target": "Name: g retriever\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "nitesh v chawla", + "tgt_entity_name": "thomas laurent", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: nitesh v chawla\nType: PERSON", + "target": "Name: thomas laurent\nType: PERSON" + }, + { + "src_entity_name": "nitesh v chawla", + "tgt_entity_name": 
"yann lecun", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: nitesh v chawla\nType: PERSON", + "target": "Name: yann lecun\nType: PERSON" + }, + { + "src_entity_name": "nitesh v chawla", + "tgt_entity_name": "xavier bresson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: nitesh v chawla\nType: PERSON", + "target": "Name: xavier bresson\nType: PERSON" + }, + { + "src_entity_name": "nitesh v chawla", + "tgt_entity_name": "bryan hooi", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: nitesh v chawla\nType: PERSON", + "target": "Name: bryan hooi\nType: PERSON" + }, + { + "src_entity_name": "nitesh v chawla", + "tgt_entity_name": "g retriever", + "relation_name": "", + "weight": 10.0, + "description": "nitesh v chawla is an author of the paper describing g retriever", + "source_ids": [ + 211 + ], + "source": "Name: nitesh v chawla\nType: PERSON", + "target": "Name: g retriever\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "thomas laurent", + "tgt_entity_name": "yann lecun", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: thomas laurent\nType: PERSON", + "target": "Name: yann lecun\nType: PERSON" + }, + { + "src_entity_name": "thomas laurent", + "tgt_entity_name": "xavier bresson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: thomas laurent\nType: PERSON", + "target": "Name: xavier bresson\nType: PERSON" + }, + { + "src_entity_name": "thomas laurent", + "tgt_entity_name": "bryan hooi", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on 
the same paper", + "source_ids": [ + 211 + ], + "source": "Name: thomas laurent\nType: PERSON", + "target": "Name: bryan hooi\nType: PERSON" + }, + { + "src_entity_name": "thomas laurent", + "tgt_entity_name": "g retriever", + "relation_name": "", + "weight": 10.0, + "description": "thomas laurent is an author of the paper describing g retriever", + "source_ids": [ + 211 + ], + "source": "Name: thomas laurent\nType: PERSON", + "target": "Name: g retriever\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "yann lecun", + "tgt_entity_name": "xavier bresson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: yann lecun\nType: PERSON", + "target": "Name: xavier bresson\nType: PERSON" + }, + { + "src_entity_name": "yann lecun", + "tgt_entity_name": "bryan hooi", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: yann lecun\nType: PERSON", + "target": "Name: bryan hooi\nType: PERSON" + }, + { + "src_entity_name": "yann lecun", + "tgt_entity_name": "g retriever", + "relation_name": "", + "weight": 10.0, + "description": "yann lecun is an author of the paper describing g retriever", + "source_ids": [ + 211 + ], + "source": "Name: yann lecun\nType: PERSON", + "target": "Name: g retriever\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "xavier bresson", + "tgt_entity_name": "bryan hooi", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ], + "source": "Name: xavier bresson\nType: PERSON", + "target": "Name: bryan hooi\nType: PERSON" + }, + { + "src_entity_name": "xavier bresson", + "tgt_entity_name": "g retriever", + "relation_name": "", + "weight": 10.0, + "description": "xavier bresson is an author of the paper describing g retriever", + "source_ids": [ + 211 + ], + "source": "Name: 
xavier bresson\nType: PERSON", + "target": "Name: g retriever\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "bryan hooi", + "tgt_entity_name": "g retriever", + "relation_name": "", + "weight": 10.0, + "description": "bryan hooi is an author of the paper describing g retriever", + "source_ids": [ + 211 + ], + "source": "Name: bryan hooi\nType: PERSON", + "target": "Name: g retriever\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "g retriever", + "tgt_entity_name": "arxiv 2402 07630", + "relation_name": "", + "weight": 9.0, + "description": "g retriever is identified by the preprint number arxiv 2402 07630", + "source_ids": [ + 211 + ], + "source": "Name: g retriever\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: arxiv 2402 07630\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "g retriever", + "tgt_entity_name": "textual graph understanding", + "relation_name": "", + "weight": 10.0, + "description": "g retriever is designed to solve the problem of textual graph understanding", + "source_ids": [ + 211 + ], + "source": "Name: g retriever\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: textual graph understanding\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "yucheng hu", + "tgt_entity_name": "rag and rau a survey on retrieval augmented language model in natural language processing", + "relation_name": "", + "weight": 10.0, + "description": "yucheng hu is an author of the survey paper", + "source_ids": [ + 212 + ], + "source": "Name: yucheng hu\nType: PERSON", + "target": "Name: rag and rau a survey on retrieval augmented language model in natural language processing\nType: BOOK" + }, + { + "src_entity_name": "yuxing lu", + "tgt_entity_name": "rag and rau a survey on retrieval augmented language model in natural language processing", + "relation_name": "", + "weight": 10.0, + "description": "yuxing lu is an author of the survey paper", + "source_ids": [ + 212 + ], + "source": "Name: yuxing lu\nType: PERSON", + "target": 
"Name: rag and rau a survey on retrieval augmented language model in natural language processing\nType: BOOK" + }, + { + "src_entity_name": "rag and rau a survey on retrieval augmented language model in natural language processing", + "tgt_entity_name": "natural language processing", + "relation_name": "", + "weight": 8.0, + "description": "the survey paper focuses on the research field of natural language processing", + "source_ids": [ + 212 + ], + "source": "Name: rag and rau a survey on retrieval augmented language model in natural language processing\nType: BOOK", + "target": "Name: natural language processing\nType: RESEARCH_FIELD" + }, + { + "src_entity_name": "rag and rau a survey on retrieval augmented language model in natural language processing", + "tgt_entity_name": "arxiv 2404 19543", + "relation_name": "", + "weight": 10.0, + "description": "the survey paper is identified by the preprint number arxiv 2404 19543", + "source_ids": [ + 212 + ], + "source": "Name: rag and rau a survey on retrieval augmented language model in natural language processing\nType: BOOK", + "target": "Name: arxiv 2404 19543\nType: PRODUCT" + }, + { + "src_entity_name": "rag and rau a survey on retrieval augmented language model in natural language processing", + "tgt_entity_name": "retrieval augmented language model", + "relation_name": "", + "weight": 9.0, + "description": "the survey paper is about the retrieval augmented language model technology", + "source_ids": [ + 212 + ], + "source": "Name: rag and rau a survey on retrieval augmented language model in natural language processing\nType: BOOK", + "target": "Name: retrieval augmented language model\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "soyeong jeong", + "tgt_entity_name": "adaptive rag", + "relation_name": "", + "weight": 9.0, + "description": "soyeong jeong is an author of the work on adaptive rag", + "source_ids": [ + 213 + ], + "source": "Name: soyeong jeong\nType: PERSON", + "target": "Name: 
adaptive rag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "jinheon baek", + "tgt_entity_name": "adaptive rag", + "relation_name": "", + "weight": 9.0, + "description": "jinheon baek is an author of the work on adaptive rag", + "source_ids": [ + 213 + ], + "source": "Name: jinheon baek\nType: PERSON", + "target": "Name: adaptive rag\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "adaptive rag", + "tgt_entity_name": "retrieval augmented large language models", + "relation_name": "", + "weight": 10.0, + "description": "adaptive rag is designed to adapt retrieval augmented large language models", + "source_ids": [ + 213 + ], + "source": "Name: adaptive rag\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: retrieval augmented large language models\nType: MODEL_OR_ARCHITECTURE" + }, + { + "src_entity_name": "adaptive rag", + "tgt_entity_name": "question complexity", + "relation_name": "", + "weight": 10.0, + "description": "adaptive rag adapts specifically through the lens of question complexity", + "source_ids": [ + 213 + ], + "source": "Name: adaptive rag\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: question complexity\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "adaptive rag", + "tgt_entity_name": "learning", + "relation_name": "", + "weight": 8.0, + "description": "adaptive rag utilizes learning to adapt its retrieval mechanisms", + "source_ids": [ + 213 + ], + "source": "Name: adaptive rag\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: learning\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "timo schick", + "tgt_entity_name": "jane dwivedi yu", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: timo schick\nType: PERSON", + "target": "Name: jane dwivedi yu\nType: PERSON" + }, + { + "src_entity_name": "timo schick", + "tgt_entity_name": "roberto dess", + "relation_name": "", + "weight": 8.0, + "description": 
"listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: timo schick\nType: PERSON", + "target": "Name: roberto dess\nType: PERSON" + }, + { + "src_entity_name": "timo schick", + "tgt_entity_name": "roberta raileanu", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: timo schick\nType: PERSON", + "target": "Name: roberta raileanu\nType: PERSON" + }, + { + "src_entity_name": "timo schick", + "tgt_entity_name": "maria lomeli", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: timo schick\nType: PERSON", + "target": "Name: maria lomeli\nType: PERSON" + }, + { + "src_entity_name": "timo schick", + "tgt_entity_name": "eric hambro", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: timo schick\nType: PERSON", + "target": "Name: eric hambro\nType: PERSON" + }, + { + "src_entity_name": "timo schick", + "tgt_entity_name": "luke zettlemoyer", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: timo schick\nType: PERSON", + "target": "Name: luke zettlemoyer\nType: PERSON" + }, + { + "src_entity_name": "timo schick", + "tgt_entity_name": "nicola cancedda", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: timo schick\nType: PERSON", + "target": "Name: nicola cancedda\nType: PERSON" + }, + { + "src_entity_name": "timo schick", + "tgt_entity_name": "thomas scialom", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: timo schick\nType: PERSON", + "target": 
"Name: thomas scialom\nType: PERSON" + }, + { + "src_entity_name": "jane dwivedi yu", + "tgt_entity_name": "roberto dess", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: jane dwivedi yu\nType: PERSON", + "target": "Name: roberto dess\nType: PERSON" + }, + { + "src_entity_name": "jane dwivedi yu", + "tgt_entity_name": "roberta raileanu", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: jane dwivedi yu\nType: PERSON", + "target": "Name: roberta raileanu\nType: PERSON" + }, + { + "src_entity_name": "jane dwivedi yu", + "tgt_entity_name": "maria lomeli", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: jane dwivedi yu\nType: PERSON", + "target": "Name: maria lomeli\nType: PERSON" + }, + { + "src_entity_name": "jane dwivedi yu", + "tgt_entity_name": "eric hambro", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: jane dwivedi yu\nType: PERSON", + "target": "Name: eric hambro\nType: PERSON" + }, + { + "src_entity_name": "jane dwivedi yu", + "tgt_entity_name": "luke zettlemoyer", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: jane dwivedi yu\nType: PERSON", + "target": "Name: luke zettlemoyer\nType: PERSON" + }, + { + "src_entity_name": "jane dwivedi yu", + "tgt_entity_name": "nicola cancedda", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: jane dwivedi yu\nType: PERSON", + "target": "Name: nicola cancedda\nType: PERSON" + }, + { + "src_entity_name": "jane dwivedi yu", + 
"tgt_entity_name": "thomas scialom", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: jane dwivedi yu\nType: PERSON", + "target": "Name: thomas scialom\nType: PERSON" + }, + { + "src_entity_name": "roberto dess", + "tgt_entity_name": "roberta raileanu", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: roberto dess\nType: PERSON", + "target": "Name: roberta raileanu\nType: PERSON" + }, + { + "src_entity_name": "roberto dess", + "tgt_entity_name": "maria lomeli", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: roberto dess\nType: PERSON", + "target": "Name: maria lomeli\nType: PERSON" + }, + { + "src_entity_name": "roberto dess", + "tgt_entity_name": "eric hambro", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: roberto dess\nType: PERSON", + "target": "Name: eric hambro\nType: PERSON" + }, + { + "src_entity_name": "roberto dess", + "tgt_entity_name": "luke zettlemoyer", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: roberto dess\nType: PERSON", + "target": "Name: luke zettlemoyer\nType: PERSON" + }, + { + "src_entity_name": "roberto dess", + "tgt_entity_name": "nicola cancedda", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: roberto dess\nType: PERSON", + "target": "Name: nicola cancedda\nType: PERSON" + }, + { + "src_entity_name": "roberto dess", + "tgt_entity_name": "thomas scialom", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on 
the same document", + "source_ids": [ + 216 + ], + "source": "Name: roberto dess\nType: PERSON", + "target": "Name: thomas scialom\nType: PERSON" + }, + { + "src_entity_name": "roberta raileanu", + "tgt_entity_name": "maria lomeli", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: roberta raileanu\nType: PERSON", + "target": "Name: maria lomeli\nType: PERSON" + }, + { + "src_entity_name": "roberta raileanu", + "tgt_entity_name": "eric hambro", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: roberta raileanu\nType: PERSON", + "target": "Name: eric hambro\nType: PERSON" + }, + { + "src_entity_name": "roberta raileanu", + "tgt_entity_name": "luke zettlemoyer", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: roberta raileanu\nType: PERSON", + "target": "Name: luke zettlemoyer\nType: PERSON" + }, + { + "src_entity_name": "roberta raileanu", + "tgt_entity_name": "nicola cancedda", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: roberta raileanu\nType: PERSON", + "target": "Name: nicola cancedda\nType: PERSON" + }, + { + "src_entity_name": "roberta raileanu", + "tgt_entity_name": "thomas scialom", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: roberta raileanu\nType: PERSON", + "target": "Name: thomas scialom\nType: PERSON" + }, + { + "src_entity_name": "maria lomeli", + "tgt_entity_name": "eric hambro", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: maria lomeli\nType: 
PERSON", + "target": "Name: eric hambro\nType: PERSON" + }, + { + "src_entity_name": "maria lomeli", + "tgt_entity_name": "luke zettlemoyer", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: maria lomeli\nType: PERSON", + "target": "Name: luke zettlemoyer\nType: PERSON" + }, + { + "src_entity_name": "maria lomeli", + "tgt_entity_name": "nicola cancedda", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: maria lomeli\nType: PERSON", + "target": "Name: nicola cancedda\nType: PERSON" + }, + { + "src_entity_name": "maria lomeli", + "tgt_entity_name": "thomas scialom", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: maria lomeli\nType: PERSON", + "target": "Name: thomas scialom\nType: PERSON" + }, + { + "src_entity_name": "eric hambro", + "tgt_entity_name": "luke zettlemoyer", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: eric hambro\nType: PERSON", + "target": "Name: luke zettlemoyer\nType: PERSON" + }, + { + "src_entity_name": "eric hambro", + "tgt_entity_name": "nicola cancedda", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: eric hambro\nType: PERSON", + "target": "Name: nicola cancedda\nType: PERSON" + }, + { + "src_entity_name": "eric hambro", + "tgt_entity_name": "thomas scialom", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: eric hambro\nType: PERSON", + "target": "Name: thomas scialom\nType: PERSON" + }, + { + "src_entity_name": "luke zettlemoyer", + 
"tgt_entity_name": "nicola cancedda", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: luke zettlemoyer\nType: PERSON", + "target": "Name: nicola cancedda\nType: PERSON" + }, + { + "src_entity_name": "luke zettlemoyer", + "tgt_entity_name": "thomas scialom", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: luke zettlemoyer\nType: PERSON", + "target": "Name: thomas scialom\nType: PERSON" + }, + { + "src_entity_name": "nicola cancedda", + "tgt_entity_name": "thomas scialom", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ], + "source": "Name: nicola cancedda\nType: PERSON", + "target": "Name: thomas scialom\nType: PERSON" + }, + { + "src_entity_name": "metrics", + "tgt_entity_name": "main experiments", + "relation_name": "", + "weight": 10.0, + "description": "metrics are explicitly stated to be used in the main experiments", + "source_ids": [ + 222 + ], + "source": "Name: main experiments\nType: EVENT", + "target": "Name: metrics\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "definitions", + "tgt_entity_name": "metrics", + "relation_name": "", + "weight": 9.0, + "description": "definitions are provided for the metrics", + "source_ids": [ + 222 + ], + "source": "Name: metrics\nType: EVALUATION_METRIC", + "target": "Name: definitions\nType: CONCEPT" + }, + { + "src_entity_name": "calculation procedures", + "tgt_entity_name": "metrics", + "relation_name": "", + "weight": 9.0, + "description": "calculation procedures are provided for the metrics", + "source_ids": [ + 222 + ], + "source": "Name: metrics\nType: EVALUATION_METRIC", + "target": "Name: calculation procedures\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "definitions", + "tgt_entity_name": "calculation 
procedures", + "relation_name": "", + "weight": 8.0, + "description": "both definitions and calculation procedures are provided together for the metrics in the text", + "source_ids": [ + 222 + ], + "source": "Name: definitions\nType: CONCEPT", + "target": "Name: calculation procedures\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "standard rag models", + "tgt_entity_name": "natural language responses", + "relation_name": "", + "weight": 10.0, + "description": "standard rag models generate natural language responses as their output", + "source_ids": [ + 223 + ], + "source": "Name: standard rag models\nType: TECHNOLOGY", + "target": "Name: natural language responses\nType: PRODUCT" + }, + { + "src_entity_name": "a 1 1 answer extraction and normalization", + "tgt_entity_name": "standard rag models", + "relation_name": "", + "weight": 9.0, + "description": "the section a 1 1 answer extraction and normalization describes the behavior of standard rag models", + "source_ids": [ + 223 + ], + "source": "Name: standard rag models\nType: TECHNOLOGY", + "target": "Name: a 1 1 answer extraction and normalization\nType: SECTION_TITLE" + }, + { + "src_entity_name": "natural language responses", + "tgt_entity_name": "ground truth labels", + "relation_name": "", + "weight": 8.0, + "description": "natural language responses are compared against ground truth labels a process that can lead to false negatives if not normalized", + "source_ids": [ + 223 + ], + "source": "Name: natural language responses\nType: PRODUCT", + "target": "Name: ground truth labels\nType: PRODUCT" + }, + { + "src_entity_name": "natural language responses", + "tgt_entity_name": "the answer is", + "relation_name": "", + "weight": 10.0, + "description": "the answer is is cited as an example of the extraneous conversational text found in natural language responses", + "source_ids": [ + 223 + ], + "source": "Name: natural language responses\nType: PRODUCT", + "target": "Name: the answer is\nType: 
PRODUCT" + }, + { + "src_entity_name": "a 1 1 answer extraction and normalization", + "tgt_entity_name": "ground truth labels", + "relation_name": "", + "weight": 9.0, + "description": "the section a 1 1 answer extraction and normalization discusses the comparison with ground truth labels", + "source_ids": [ + 223 + ], + "source": "Name: ground truth labels\nType: PRODUCT", + "target": "Name: a 1 1 answer extraction and normalization\nType: SECTION_TITLE" + }, + { + "src_entity_name": "ground truth labels", + "tgt_entity_name": "option a", + "relation_name": "", + "weight": 10.0, + "description": "option a is cited as an example of a ground truth label", + "source_ids": [ + 223 + ], + "source": "Name: ground truth labels\nType: PRODUCT", + "target": "Name: option a\nType: PRODUCT" + }, + { + "src_entity_name": "ground truth labels", + "tgt_entity_name": "12 5", + "relation_name": "", + "weight": 10.0, + "description": "12 5 is cited as an example of a ground truth label", + "source_ids": [ + 223 + ], + "source": "Name: ground truth labels\nType: PRODUCT", + "target": "Name: 12 5\nType: MEASUREMENT" + }, + { + "src_entity_name": "llm based extraction step", + "tgt_entity_name": "rag system", + "relation_name": "", + "weight": 9.0, + "description": "the llm based extraction step is employed to process the output from the rag system", + "source_ids": [ + 224 + ], + "source": "Name: llm based extraction step\nType: METHOD_OR_TECHNIQUE", + "target": "Name: rag system\nType: SYSTEM" + }, + { + "src_entity_name": "official evaluation protocols", + "tgt_entity_name": "llm based extraction step", + "relation_name": "", + "weight": 9.0, + "description": "the llm based extraction step is employed following official evaluation protocols", + "source_ids": [ + 224 + ], + "source": "Name: llm based extraction step\nType: METHOD_OR_TECHNIQUE", + "target": "Name: official evaluation protocols\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "llmextract", + "tgt_entity_name": 
"y raw", + "relation_name": "", + "weight": 10.0, + "description": "llmextract extracts key information from y raw", + "source_ids": [ + 224 + ], + "source": "Name: llmextract\nType: SOFTWARE", + "target": "Name: y raw\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "llmextract", + "tgt_entity_name": "y hat", + "relation_name": "", + "weight": 10.0, + "description": "llmextract is used to define the extracted answer y hat", + "source_ids": [ + 224 + ], + "source": "Name: llmextract\nType: SOFTWARE", + "target": "Name: y hat\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "equation 16", + "tgt_entity_name": "llmextract", + "relation_name": "", + "weight": 9.0, + "description": "equation 16 utilizes llmextract to define the extracted answer", + "source_ids": [ + 224 + ], + "source": "Name: llmextract\nType: SOFTWARE", + "target": "Name: equation 16\nType: EQUATION_OR_FORMULA" + }, + { + "src_entity_name": "llmextract", + "tgt_entity_name": "key information", + "relation_name": "", + "weight": 10.0, + "description": "llmextract is responsible for extracting key information from the raw response", + "source_ids": [ + 224 + ], + "source": "Name: llmextract\nType: SOFTWARE", + "target": "Name: key information\nType: CONCEPT" + }, + { + "src_entity_name": "llmextract", + "tgt_entity_name": "key entity", + "relation_name": "", + "weight": 8.0, + "description": "key entity is a specific type of key information extracted by llmextract", + "source_ids": [ + 224 + ], + "source": "Name: llmextract\nType: SOFTWARE", + "target": "Name: key entity\nType: CONCEPT" + }, + { + "src_entity_name": "llmextract", + "tgt_entity_name": "span extraction", + "relation_name": "", + "weight": 8.0, + "description": "span extraction is the context in which llmextract extracts key entities", + "source_ids": [ + 224 + ], + "source": "Name: llmextract\nType: SOFTWARE", + "target": "Name: span extraction\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "llmextract", + 
"tgt_entity_name": "instruction", + "relation_name": "", + "weight": 10.0, + "description": "llmextract uses the instruction parameter to perform the extraction", + "source_ids": [ + 224 + ], + "source": "Name: llmextract\nType: SOFTWARE", + "target": "Name: instruction\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "y raw", + "tgt_entity_name": "y gold", + "relation_name": "", + "weight": 8.0, + "description": "y raw and y gold are compared after normalization to calculate the evaluation metric", + "source_ids": [ + 224 + ], + "source": "Name: y raw\nType: PARAMETER_OR_VARIABLE", + "target": "Name: y gold\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "n", + "tgt_entity_name": "y gold", + "relation_name": "", + "weight": 9.0, + "description": "n is applied to normalize y gold", + "source_ids": [ + 224 + ], + "source": "Name: y gold\nType: PARAMETER_OR_VARIABLE", + "target": "Name: n\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "n", + "tgt_entity_name": "y hat", + "relation_name": "", + "weight": 9.0, + "description": "n is applied to normalize y hat", + "source_ids": [ + 224 + ], + "source": "Name: y hat\nType: PARAMETER_OR_VARIABLE", + "target": "Name: n\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "n", + "tgt_entity_name": "lowercasing", + "relation_name": "", + "weight": 9.0, + "description": "lowercasing is an example of the standard normalization n applied to the data", + "source_ids": [ + 224 + ], + "source": "Name: n\nType: METHOD_OR_TECHNIQUE", + "target": "Name: lowercasing\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "n", + "tgt_entity_name": "removing punctuation", + "relation_name": "", + "weight": 9.0, + "description": "removing punctuation is an example of the standard normalization n applied to the data", + "source_ids": [ + 224 + ], + "source": "Name: n\nType: METHOD_OR_TECHNIQUE", + "target": "Name: removing punctuation\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "qa 
performance metrics", + "tgt_entity_name": "a.1.2 qa performance metrics", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'QA Performance Metrics' is the primary topic covered in section A.1.2.", + "source_ids": [ + 226 + ], + "source": "Name: a.1.2 qa performance metrics\nType: SECTION_TITLE", + "target": "Name: qa performance metrics\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "ground truth (y_gold)", + "tgt_entity_name": "a.1.2 qa performance metrics", + "relation_name": "", + "weight": 9.5, + "description": "The variable 'Ground Truth' is a fundamental component used in the definitions provided in section A.1.2.", + "source_ids": [ + 226 + ], + "source": "Name: a.1.2 qa performance metrics\nType: SECTION_TITLE", + "target": "Name: ground truth (y_gold)\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "model response (y_raw)", + "tgt_entity_name": "a.1.2 qa performance metrics", + "relation_name": "", + "weight": 9.5, + "description": "The variable 'Model Response' is a fundamental component used in the definitions provided in section A.1.2.", + "source_ids": [ + 226 + ], + "source": "Name: a.1.2 qa performance metrics\nType: SECTION_TITLE", + "target": "Name: model response (y_raw)\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "substring inclusion relation", + "tgt_entity_name": "a.1.2 qa performance metrics", + "relation_name": "", + "weight": 9.0, + "description": "The technique 'Substring Inclusion Relation' is the core logic applied in section A.1.2 to compute the metrics.", + "source_ids": [ + 226 + ], + "source": "Name: a.1.2 qa performance metrics\nType: SECTION_TITLE", + "target": "Name: substring inclusion relation\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "accuracy inclusion based", + "tgt_entity_name": "prior works", + "relation_name": "", + "weight": 9.0, + "description": "accuracy inclusion based is utilized following prior works cited as 3 34 46", + "source_ids": [ + 
227 + ], + "source": "Name: accuracy inclusion based\nType: EVALUATION_METRIC", + "target": "Name: prior works\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "r", + "tgt_entity_name": "f1", + "relation_name": "", + "weight": 10.0, + "description": "r recall is a component used in the calculation of the f1 score", + "source_ids": [ + 231 + ], + "source": "Name: r\nType: PARAMETER_OR_VARIABLE", + "target": "Name: f1\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "retrieval quality", + "tgt_entity_name": "a.1.3 retrieval recall", + "relation_name": "", + "weight": 9.5, + "description": "Retrieval Quality is the primary concept evaluated within section A.1.3.", + "source_ids": [ + 234 + ], + "source": "Name: a.1.3 retrieval recall\nType: SECTION_TITLE", + "target": "Name: retrieval quality\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "pdf blocks", + "tgt_entity_name": "a.1.3 retrieval recall", + "relation_name": "", + "weight": 9.0, + "description": "PDF Blocks serve as the granular units of analysis for the evaluation described in section A.1.3.", + "source_ids": [ + 234 + ], + "source": "Name: a.1.3 retrieval recall\nType: SECTION_TITLE", + "target": "Name: pdf blocks\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "query q", + "tgt_entity_name": "a.1.3 retrieval recall", + "relation_name": "", + "weight": 8.5, + "description": "The variable 'Query q' is a key parameter defined in the context of section A.1.3.", + "source_ids": [ + 234 + ], + "source": "Name: a.1.3 retrieval recall\nType: SECTION_TITLE", + "target": "Name: query q\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "b_gold", + "tgt_entity_name": "a.1.3 retrieval recall", + "relation_name": "", + "weight": 9.0, + "description": "The variable 'B_gold' represents the ground truth set utilized in the definition provided in section A.1.3.", + "source_ids": [ + 234 + ], + "source": "Name: a.1.3 retrieval recall\nType: SECTION_TITLE", + "target": "Name: 
b_gold\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "b_ret", + "tgt_entity_name": "a.1.3 retrieval recall", + "relation_name": "", + "weight": 9.0, + "description": "The variable 'B_ret' represents the retrieved set utilized in the definition provided in section A.1.3.", + "source_ids": [ + 234 + ], + "source": "Name: a.1.3 retrieval recall\nType: SECTION_TITLE", + "target": "Name: b_ret\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "recall_ret", + "tgt_entity_name": "a.1.3 retrieval recall", + "relation_name": "", + "weight": 10.0, + "description": "The metric 'Recall_ret' is the central formula and subject explicitly defined in section A.1.3.", + "source_ids": [ + 234 + ], + "source": "Name: a.1.3 retrieval recall\nType: SECTION_TITLE", + "target": "Name: recall_ret\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "ground truth block", + "tgt_entity_name": "candidate pool", + "relation_name": "", + "weight": 9.0, + "description": "a ground truth block is considered unretrievable if it does not exist in the candidate pool", + "source_ids": [ + 236 + ], + "source": "Name: ground truth block\nType: TASK_OR_PROBLEM", + "target": "Name: candidate pool\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "ground truth block", + "tgt_entity_name": "recall", + "relation_name": "", + "weight": 10.0, + "description": "the loss of a ground truth block results in a recall contribution of 0", + "source_ids": [ + 236 + ], + "source": "Name: ground truth block\nType: TASK_OR_PROBLEM", + "target": "Name: recall\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "recall", + "tgt_entity_name": "0", + "relation_name": "", + "weight": 10.0, + "description": "the recall contribution is explicitly stated as 0 when a ground truth block is lost", + "source_ids": [ + 236 + ], + "source": "Name: recall\nType: EVALUATION_METRIC", + "target": "Name: 0\nType: NUMBER" + }, + { + "src_entity_name": "qwen3 8b", + "tgt_entity_name": "ground truth 
images", + "relation_name": "", + "weight": 7.0, + "description": "the 8b counterpart related to qwen3 8b context failed to answer correctly even with ground truth images", + "source_ids": [ + 238 + ], + "source": "Name: qwen3 8b\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: ground truth images\nType: IMAGE" + }, + { + "src_entity_name": "qwen3 8b", + "tgt_entity_name": "reference 60", + "relation_name": "", + "weight": 10.0, + "description": "qwen3 8b is cited in reference 60", + "source_ids": [ + 238 + ], + "source": "Name: qwen3 8b\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: reference 60\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "qwen2 5vl 30b", + "tgt_entity_name": "reference 4", + "relation_name": "", + "weight": 10.0, + "description": "qwen2 5vl 30b is cited in reference 4", + "source_ids": [ + 238 + ], + "source": "Name: qwen2 5vl 30b\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: reference 4\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "qwen3 embedding 0 6b", + "tgt_entity_name": "text embedding", + "relation_name": "", + "weight": 10.0, + "description": "qwen3 embedding 0 6b is used for text embedding", + "source_ids": [ + 238 + ], + "source": "Name: qwen3 embedding 0 6b\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: text embedding\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "qwen3 embedding 0 6b", + "tgt_entity_name": "reference 64", + "relation_name": "", + "weight": 10.0, + "description": "qwen3 embedding 0 6b is cited in reference 64", + "source_ids": [ + 238 + ], + "source": "Name: qwen3 embedding 0 6b\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: reference 64\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "gme qwen2 vl 2b instruct", + "tgt_entity_name": "multi modal embedding", + "relation_name": "", + "weight": 10.0, + "description": "gme qwen2 vl 2b instruct is used for multi modal embedding", + "source_ids": [ + 238 + ], + "source": "Name: gme qwen2 vl 2b instruct\nType: 
MODEL_OR_ARCHITECTURE", + "target": "Name: multi modal embedding\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "gme qwen2 vl 2b instruct", + "tgt_entity_name": "reference 63", + "relation_name": "", + "weight": 10.0, + "description": "gme qwen2 vl 2b instruct is cited in reference 63", + "source_ids": [ + 238 + ], + "source": "Name: gme qwen2 vl 2b instruct\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: reference 63\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "qwen3 reranker 4b", + "tgt_entity_name": "reranking", + "relation_name": "", + "weight": 10.0, + "description": "qwen3 reranker 4b is used for reranking", + "source_ids": [ + 238 + ], + "source": "Name: qwen3 reranker 4b\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: reranking\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "qwen3 reranker 4b", + "tgt_entity_name": "reference 64", + "relation_name": "", + "weight": 10.0, + "description": "qwen3 reranker 4b is cited in reference 64", + "source_ids": [ + 238 + ], + "source": "Name: qwen3 reranker 4b\nType: MODEL_OR_ARCHITECTURE", + "target": "Name: reference 64\nType: PUBLICATION_VENUE" + }, + { + "src_entity_name": "linux", + "tgt_entity_name": "intel xeon 2 0ghz cpu", + "relation_name": "", + "weight": 9.0, + "description": "the linux operating system runs on a server equipped with an intel xeon 2 0ghz cpu", + "source_ids": [ + 238 + ], + "source": "Name: linux\nType: SOFTWARE", + "target": "Name: intel xeon 2 0ghz cpu\nType: HARDWARE" + }, + { + "src_entity_name": "linux", + "tgt_entity_name": "nvidia geforce rtx a5000", + "relation_name": "", + "weight": 9.0, + "description": "the linux operating system runs on a server equipped with nvidia geforce rtx a5000 gpus", + "source_ids": [ + 238 + ], + "source": "Name: linux\nType: SOFTWARE", + "target": "Name: nvidia geforce rtx a5000\nType: HARDWARE" + }, + { + "src_entity_name": "linux", + "tgt_entity_name": "high performance server", + "relation_name": "", + "weight": 9.0, + 
"description": "the linux operating system runs on the high performance server", + "source_ids": [ + 238 + ], + "source": "Name: linux\nType: SOFTWARE", + "target": "Name: high performance server\nType: LOCATION" + }, + { + "src_entity_name": "intel xeon 2 0ghz cpu", + "tgt_entity_name": "1024gb", + "relation_name": "", + "weight": 8.0, + "description": "the server with the intel xeon 2 0ghz cpu has 1024gb of memory", + "source_ids": [ + 238 + ], + "source": "Name: intel xeon 2 0ghz cpu\nType: HARDWARE", + "target": "Name: 1024gb\nType: MEASUREMENT" + }, + { + "src_entity_name": "nvidia geforce rtx a5000", + "tgt_entity_name": "24 gb", + "relation_name": "", + "weight": 9.0, + "description": "each nvidia geforce rtx a5000 gpu has 24 gb of vram", + "source_ids": [ + 238 + ], + "source": "Name: nvidia geforce rtx a5000\nType: HARDWARE", + "target": "Name: 24 gb\nType: MEASUREMENT" + }, + { + "src_entity_name": "8b counterpart", + "tgt_entity_name": "performance deficits", + "relation_name": "", + "weight": 10.0, + "description": "the 8b counterpart exhibited performance deficits", + "source_ids": [ + 238 + ], + "source": "Name: 8b counterpart\nType: MEASUREMENT", + "target": "Name: performance deficits\nType: CONCEPT" + }, + { + "src_entity_name": "prompts", + "tgt_entity_name": "a.3 prompts", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Prompts' is the primary topic of section A.3.", + "source_ids": [ + 239 + ], + "source": "Name: a.3 prompts\nType: SECTION_TITLE", + "target": "Name: prompts\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "figure 10", + "tgt_entity_name": "agent based query classification", + "relation_name": "", + "weight": 10.0, + "description": "figure 10 presents the prompts for agent based query classification", + "source_ids": [ + 240 + ], + "source": "Name: agent based query classification\nType: TASK_OR_PROBLEM", + "target": "Name: figure 10\nType: IMAGE" + }, + { + "src_entity_name": "prompts", + 
"tgt_entity_name": "agent based query classification", + "relation_name": "", + "weight": 10.0, + "description": "prompts are designed specifically for agent based query classification", + "source_ids": [ + 240 + ], + "source": "Name: agent based query classification\nType: TASK_OR_PROBLEM", + "target": "Name: prompts\nType: PRODUCT" + }, + { + "src_entity_name": "figure 11", + "tgt_entity_name": "question decomposition", + "relation_name": "", + "weight": 10.0, + "description": "figure 11 presents the prompts for question decomposition", + "source_ids": [ + 240 + ], + "source": "Name: question decomposition\nType: TASK_OR_PROBLEM", + "target": "Name: figure 11\nType: IMAGE" + }, + { + "src_entity_name": "prompts", + "tgt_entity_name": "question decomposition", + "relation_name": "", + "weight": 10.0, + "description": "prompts are designed specifically for question decomposition", + "source_ids": [ + 240 + ], + "source": "Name: question decomposition\nType: TASK_OR_PROBLEM", + "target": "Name: prompts\nType: PRODUCT" + }, + { + "src_entity_name": "figure 12", + "tgt_entity_name": "filter operator generation", + "relation_name": "", + "weight": 10.0, + "description": "figure 12 contains the prompt used for filter operator generation", + "source_ids": [ + 259 + ], + "source": "Name: filter operator generation\nType: TASK_OR_PROBLEM", + "target": "Name: figure 12\nType: IMAGE" + }, + { + "src_entity_name": "prompts", + "tgt_entity_name": "filter operator generation", + "relation_name": "", + "weight": 10.0, + "description": "prompts are designed specifically for filter operator generation", + "source_ids": [ + 240 + ], + "source": "Name: filter operator generation\nType: TASK_OR_PROBLEM", + "target": "Name: prompts\nType: PRODUCT" + }, + { + "src_entity_name": "figure 13", + "tgt_entity_name": "entity resolution judgment", + "relation_name": "", + "weight": 10.0, + "description": "figure 13 illustrates the prompt for entity resolution judgment", + "source_ids": [ + 
240 + ], + "source": "Name: entity resolution judgment\nType: TASK_OR_PROBLEM", + "target": "Name: figure 13\nType: IMAGE" + }, + { + "src_entity_name": "entity resolution judgment", + "tgt_entity_name": "graph construction phase", + "relation_name": "", + "weight": 9.0, + "description": "entity resolution judgment is performed during the graph construction phase", + "source_ids": [ + 240 + ], + "source": "Name: entity resolution judgment\nType: TASK_OR_PROBLEM", + "target": "Name: graph construction phase\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "prompts", + "tgt_entity_name": "entity resolution judgment", + "relation_name": "", + "weight": 10.0, + "description": "a prompt is employed for entity resolution judgment", + "source_ids": [ + 240 + ], + "source": "Name: entity resolution judgment\nType: TASK_OR_PROBLEM", + "target": "Name: prompts\nType: PRODUCT" + }, + { + "src_entity_name": "figure 11", + "tgt_entity_name": "query decomposition", + "relation_name": "", + "weight": 10.0, + "description": "figure 11 contains the prompt specifically for the task of query decomposition", + "source_ids": [ + 256 + ], + "source": "Name: figure 11\nType: IMAGE", + "target": "Name: query decomposition\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "examples", + "tgt_entity_name": "figure 13", + "relation_name": "", + "weight": 8.0, + "description": "examples were omitted from figure 13 due to lack of space", + "source_ids": [ + 284 + ], + "source": "Name: figure 13\nType: IMAGE", + "target": "Name: examples\nType: DATASET_OR_CORPUS" + }, + { + "src_entity_name": "expert query analyzer", + "tgt_entity_name": "simple", + "relation_name": "", + "weight": 9.0, + "description": "the expert query analyzer classifies questions into the simple category", + "source_ids": [ + 241 + ], + "source": "Name: expert query analyzer\nType: PERSON", + "target": "Name: simple\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "expert query analyzer", + "tgt_entity_name": 
"user", + "relation_name": "", + "weight": 8.0, + "description": "the expert query analyzer processes questions submitted by the user", + "source_ids": [ + 241 + ], + "source": "Name: expert query analyzer\nType: PERSON", + "target": "Name: user\nType: PERSON" + }, + { + "src_entity_name": "expert query analyzer", + "tgt_entity_name": "json object", + "relation_name": "", + "weight": 9.0, + "description": "the expert query analyzer must respond using the specified json object format", + "source_ids": [ + 241 + ], + "source": "Name: expert query analyzer\nType: PERSON", + "target": "Name: json object\nType: FILE_TYPE" + }, + { + "src_entity_name": "assistant", + "tgt_entity_name": "user", + "relation_name": "", + "weight": 10.0, + "description": "the assistant responds to the user", + "source_ids": [ + 258 + ], + "source": "Name: user\nType: PERSON", + "target": "Name: assistant\nType: PERSON" + }, + { + "src_entity_name": "ai assistant", + "tgt_entity_name": "json object", + "relation_name": "", + "weight": 10.0, + "description": "the ai assistant must return a single valid json object as its output", + "source_ids": [ + 258 + ], + "source": "Name: json object\nType: FILE_TYPE", + "target": "Name: ai assistant\nType: PERSON" + }, + { + "src_entity_name": "assistant", + "tgt_entity_name": "json object", + "relation_name": "", + "weight": 10.0, + "description": "the assistant must output a json object", + "source_ids": [ + 258 + ], + "source": "Name: json object\nType: FILE_TYPE", + "target": "Name: assistant\nType: PERSON" + }, + { + "src_entity_name": "entity resolution adjudicator", + "tgt_entity_name": "json object", + "relation_name": "", + "weight": 9.0, + "description": "the entity resolution adjudicator must output the result in a json object format", + "source_ids": [ + 262 + ], + "source": "Name: json object\nType: FILE_TYPE", + "target": "Name: entity resolution adjudicator\nType: PERSON" + }, + { + "src_entity_name": "json object", + "tgt_entity_name": 
"id", + "relation_name": "", + "weight": 8.0, + "description": "the json object contains the id of the matching candidate", + "source_ids": [ + 262 + ], + "source": "Name: json object\nType: FILE_TYPE", + "target": "Name: id\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "json object", + "tgt_entity_name": "explanation", + "relation_name": "", + "weight": 8.0, + "description": "the json object contains the explanation for the decision", + "source_ids": [ + 262 + ], + "source": "Name: json object\nType: FILE_TYPE", + "target": "Name: explanation\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "information", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 9.0, + "description": "information is retrieved from the document", + "source_ids": [ + 243 + ], + "source": "Name: information\nType: CONCEPT", + "target": "Name: document\nType: CONCEPT" + }, + { + "src_entity_name": "paragraph", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 8.0, + "description": "a paragraph is a part of a document", + "source_ids": [ + 243 + ], + "source": "Name: document\nType: CONCEPT", + "target": "Name: paragraph\nType: SECTION_TITLE" + }, + { + "src_entity_name": "table", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 8.0, + "description": "a table is a part of a document", + "source_ids": [ + 243 + ], + "source": "Name: document\nType: CONCEPT", + "target": "Name: table\nType: SECTION_TITLE" + }, + { + "src_entity_name": "figure", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 8.0, + "description": "a figure is a part of a document", + "source_ids": [ + 243 + ], + "source": "Name: document\nType: CONCEPT", + "target": "Name: figure\nType: SECTION_TITLE" + }, + { + "src_entity_name": "5", + "tgt_entity_name": "latinos", + "relation_name": "", + "weight": 9.0, + "description": "the percentage 5 specifically refers to a subset of the latino population", + "source_ids": [ + 246 + ], + 
"source": "Name: 5\nType: PERCENTAGE", + "target": "Name: latinos\nType: NATIONALITY" + }, + { + "src_entity_name": "latinos", + "tgt_entity_name": "economic upward mobility", + "relation_name": "", + "weight": 10.0, + "description": "latinos are the group whose perspective on economic upward mobility for their children is being examined", + "source_ids": [ + 246 + ], + "source": "Name: latinos\nType: NATIONALITY", + "target": "Name: economic upward mobility\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "latinos", + "tgt_entity_name": "children", + "relation_name": "", + "weight": 10.0, + "description": "the children belong to the latino demographic group mentioned in the text", + "source_ids": [ + 246 + ], + "source": "Name: latinos\nType: NATIONALITY", + "target": "Name: children\nType: PERSON" + }, + { + "src_entity_name": "economic upward mobility", + "tgt_entity_name": "children", + "relation_name": "", + "weight": 9.0, + "description": "economic upward mobility is the specific attribute or outcome being considered for the children", + "source_ids": [ + 246 + ], + "source": "Name: economic upward mobility\nType: TASK_OR_PROBLEM", + "target": "Name: children\nType: PERSON" + }, + { + "src_entity_name": "counting", + "tgt_entity_name": "aggregation operation", + "relation_name": "", + "weight": 10.0, + "description": "counting is explicitly described as an aggregation operation", + "source_ids": [ + 250 + ], + "source": "Name: counting\nType: METHOD_OR_TECHNIQUE", + "target": "Name: aggregation operation\nType: UNKNOWN" + }, + { + "src_entity_name": "listing", + "tgt_entity_name": "aggregation operation", + "relation_name": "", + "weight": 10.0, + "description": "listing is explicitly described as an aggregation operation", + "source_ids": [ + 250 + ], + "source": "Name: listing\nType: METHOD_OR_TECHNIQUE", + "target": "Name: aggregation operation\nType: UNKNOWN" + }, + { + "src_entity_name": "summarizing", + "tgt_entity_name": "aggregation operation", + 
"relation_name": "", + "weight": 10.0, + "description": "summarizing is explicitly described as an aggregation operation", + "source_ids": [ + 250 + ], + "source": "Name: summarizing\nType: METHOD_OR_TECHNIQUE", + "target": "Name: aggregation operation\nType: UNKNOWN" + }, + { + "src_entity_name": "structural filter", + "tgt_entity_name": "items", + "relation_name": "", + "weight": 8.0, + "description": "the structural filter is used to identify the set of items", + "source_ids": [ + 250 + ], + "source": "Name: structural filter\nType: METHOD_OR_TECHNIQUE", + "target": "Name: items\nType: UNKNOWN" + }, + { + "src_entity_name": "user a2gbifl43u1lkj", + "tgt_entity_name": "soft labeled personality embedding matrix", + "relation_name": "", + "weight": 9.0, + "description": "user a2gbifl43u1lkj is the subject for whom personality vectors are analyzed within the soft labeled personality embedding matrix", + "source_ids": [ + 255 + ], + "source": "Name: user a2gbifl43u1lkj\nType: PERSON", + "target": "Name: soft labeled personality embedding matrix\nType: PRODUCT" + }, + { + "src_entity_name": "user a2gbifl43u1lkj", + "tgt_entity_name": "receptiviti score", + "relation_name": "", + "weight": 9.0, + "description": "receptiviti scores are calculated for the personality vectors associated with user a2gbifl43u1lkj", + "source_ids": [ + 255 + ], + "source": "Name: user a2gbifl43u1lkj\nType: PERSON", + "target": "Name: receptiviti score\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "foreign born latinos", + "tgt_entity_name": "population", + "relation_name": "", + "weight": 8.0, + "description": "the population of foreign born latinos is a specific value sought in the survey example", + "source_ids": [ + 255 + ], + "source": "Name: foreign born latinos\nType: PERSON", + "target": "Name: population\nType: MEASUREMENT" + }, + { + "src_entity_name": "latinos interviewed by cellphone", + "tgt_entity_name": "population", + "relation_name": "", + "weight": 8.0, + 
"description": "the population of latinos interviewed by cellphone is a specific value sought in the survey example", + "source_ids": [ + 255 + ], + "source": "Name: latinos interviewed by cellphone\nType: PERSON", + "target": "Name: population\nType: MEASUREMENT" + }, + { + "src_entity_name": "soft labeled personality embedding matrix", + "tgt_entity_name": "receptiviti score", + "relation_name": "", + "weight": 7.0, + "description": "the soft labeled personality embedding matrix contains personality vectors that are evaluated using receptiviti scores", + "source_ids": [ + 255 + ], + "source": "Name: soft labeled personality embedding matrix\nType: PRODUCT", + "target": "Name: receptiviti score\nType: EVALUATION_METRIC" + }, + { + "src_entity_name": "report", + "tgt_entity_name": "chapter", + "relation_name": "", + "weight": 6.0, + "description": "the example query asks to count chapters in the report", + "source_ids": [ + 258 + ], + "source": "Name: report\nType: BOOK", + "target": "Name: chapter\nType: SECTION_TITLE" + }, + { + "src_entity_name": "ai assistant", + "tgt_entity_name": "global query", + "relation_name": "", + "weight": 10.0, + "description": "the ai assistant s function is to analyze the global query", + "source_ids": [ + 258 + ], + "source": "Name: ai assistant\nType: PERSON", + "target": "Name: global query\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "ai assistant", + "tgt_entity_name": "filters", + "relation_name": "", + "weight": 9.0, + "description": "the ai assistant must determine the list of filters to apply", + "source_ids": [ + 258 + ], + "source": "Name: ai assistant\nType: PERSON", + "target": "Name: filters\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "ai assistant", + "tgt_entity_name": "operation", + "relation_name": "", + "weight": 9.0, + "description": "the ai assistant must determine the final aggregation operation", + "source_ids": [ + 258 + ], + "source": "Name: ai assistant\nType: PERSON", + "target": "Name: 
operation\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "filters", + "tgt_entity_name": "page", + "relation_name": "", + "weight": 8.0, + "description": "filters can be of type page to target specific page numbers", + "source_ids": [ + 258 + ], + "source": "Name: filters\nType: TASK_OR_PROBLEM", + "target": "Name: page\nType: MEASUREMENT" + }, + { + "src_entity_name": "assistant", + "tgt_entity_name": "filters", + "relation_name": "", + "weight": 9.0, + "description": "the assistant determines the filters to apply", + "source_ids": [ + 258 + ], + "source": "Name: filters\nType: TASK_OR_PROBLEM", + "target": "Name: assistant\nType: PERSON" + }, + { + "src_entity_name": "operation", + "tgt_entity_name": "count", + "relation_name": "", + "weight": 7.0, + "description": "count is one of the possible operations for aggregation", + "source_ids": [ + 258 + ], + "source": "Name: operation\nType: TASK_OR_PROBLEM", + "target": "Name: count\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "operation", + "tgt_entity_name": "list", + "relation_name": "", + "weight": 7.0, + "description": "list is one of the possible operations for aggregation", + "source_ids": [ + 258 + ], + "source": "Name: operation\nType: TASK_OR_PROBLEM", + "target": "Name: list\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "operation", + "tgt_entity_name": "summarize", + "relation_name": "", + "weight": 7.0, + "description": "summarize is one of the possible operations for aggregation", + "source_ids": [ + 258 + ], + "source": "Name: operation\nType: TASK_OR_PROBLEM", + "target": "Name: summarize\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "operation", + "tgt_entity_name": "analyze", + "relation_name": "", + "weight": 7.0, + "description": "analyze is one of the possible operations for aggregation", + "source_ids": [ + 258 + ], + "source": "Name: operation\nType: TASK_OR_PROBLEM", + "target": "Name: analyze\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "assistant", + 
"tgt_entity_name": "operation", + "relation_name": "", + "weight": 9.0, + "description": "the assistant determines the operation to perform", + "source_ids": [ + 258 + ], + "source": "Name: operation\nType: TASK_OR_PROBLEM", + "target": "Name: assistant\nType: PERSON" + }, + { + "src_entity_name": "methodology", + "tgt_entity_name": "data augmentation", + "relation_name": "", + "weight": 9.0, + "description": "the example query asks to summarize the discussion about data augmentation in the methodology section", + "source_ids": [ + 258 + ], + "source": "Name: methodology\nType: SECTION_TITLE", + "target": "Name: data augmentation\nType: METHOD_OR_TECHNIQUE" + }, + { + "src_entity_name": "paper", + "tgt_entity_name": "page", + "relation_name": "", + "weight": 6.0, + "description": "the example query specifies a page range 3 to 10 for the paper", + "source_ids": [ + 258 + ], + "source": "Name: paper\nType: BOOK", + "target": "Name: page\nType: MEASUREMENT" + }, + { + "src_entity_name": "discussion", + "tgt_entity_name": "data augmentation", + "relation_name": "", + "weight": 8.0, + "description": "the discussion is about data augmentation", + "source_ids": [ + 258 + ], + "source": "Name: data augmentation\nType: METHOD_OR_TECHNIQUE", + "target": "Name: discussion\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "page", + "tgt_entity_name": "3 10", + "relation_name": "", + "weight": 8.0, + "description": "3 10 is an example value for a page filter", + "source_ids": [ + 258 + ], + "source": "Name: 3 10\nType: MEASUREMENT", + "target": "Name: page\nType: MEASUREMENT" + }, + { + "src_entity_name": "entity resolution adjudicator", + "tgt_entity_name": "candidate entities", + "relation_name": "", + "weight": 10.0, + "description": "the entity resolution adjudicator compares the new entity against the candidate entities", + "source_ids": [ + 262 + ], + "source": "Name: entity resolution adjudicator\nType: PERSON", + "target": "Name: candidate entities\nType: 
TASK_OR_PROBLEM" + }, + { + "src_entity_name": "entity resolution adjudicator", + "tgt_entity_name": "id", + "relation_name": "", + "weight": 9.0, + "description": "the entity resolution adjudicator outputs the id of the matching candidate", + "source_ids": [ + 262 + ], + "source": "Name: entity resolution adjudicator\nType: PERSON", + "target": "Name: id\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "entity resolution adjudicator", + "tgt_entity_name": "1", + "relation_name": "", + "weight": 9.0, + "description": "the entity resolution adjudicator outputs 1 if no match is found", + "source_ids": [ + 262 + ], + "source": "Name: entity resolution adjudicator\nType: PERSON", + "target": "Name: 1\nType: VALUE" + }, + { + "src_entity_name": "entity resolution adjudicator", + "tgt_entity_name": "explanation", + "relation_name": "", + "weight": 9.0, + "description": "the entity resolution adjudicator provides an explanation for the decision", + "source_ids": [ + 262 + ], + "source": "Name: entity resolution adjudicator\nType: PERSON", + "target": "Name: explanation\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "candidate entities", + "tgt_entity_name": "knowledge graph", + "relation_name": "", + "weight": 8.0, + "description": "candidate entities are retrieved from the knowledge graph", + "source_ids": [ + 262 + ], + "source": "Name: candidate entities\nType: TASK_OR_PROBLEM", + "target": "Name: knowledge graph\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "candidate entities", + "tgt_entity_name": "knowledge base", + "relation_name": "", + "weight": 8.0, + "description": "candidate entities are retrieved from the knowledge base", + "source_ids": [ + 262 + ], + "source": "Name: candidate entities\nType: TASK_OR_PROBLEM", + "target": "Name: knowledge base\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "candidate entities", + "tgt_entity_name": "id", + "relation_name": "", + "weight": 10.0, + "description": "each candidate entity has a 
unique id for reference", + "source_ids": [ + 262 + ], + "source": "Name: candidate entities\nType: TASK_OR_PROBLEM", + "target": "Name: id\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "select id", + "tgt_entity_name": "id", + "relation_name": "", + "weight": 9.0, + "description": "select id is defined as the integer value of the id of the candidate", + "source_ids": [ + 276 + ], + "source": "Name: id\nType: PARAMETER_OR_VARIABLE", + "target": "Name: select id\nType: PARAMETER_OR_VARIABLE" + }, + { + "src_entity_name": "event detection", + "tgt_entity_name": "named entity recognition", + "relation_name": "", + "weight": 9.0, + "description": "event detection and named entity recognition are distinct parallel concepts and are explicitly stated as not a match", + "source_ids": [ + 267 + ], + "source": "Name: event detection\nType: TASK_OR_PROBLEM", + "target": "Name: named entity recognition\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "description", + "tgt_entity_name": "contextual importance", + "relation_name": "", + "weight": 9.0, + "description": "descriptions possess contextual importance which dictates the need to analyze them for underlying identity rather than surface similarity", + "source_ids": [ + 269 + ], + "source": "Name: description\nType: CONCEPT", + "target": "Name: contextual importance\nType: CONCEPT" + }, + { + "src_entity_name": "apple", + "tgt_entity_name": "apple inc", + "relation_name": "", + "weight": 10.0, + "description": "both are mentioned in the text as examples to illustrate that they are not a match despite sharing the same name", + "source_ids": [ + 272 + ], + "source": "Name: apple\nType: PRODUCT", + "target": "Name: apple inc\nType: ORGANIZATION" + }, + { + "src_entity_name": "when in doubt", + "tgt_entity_name": "1", + "relation_name": "", + "weight": 10.0, + "description": "the text states that if the condition when in doubt is met the output must be 1", + "source_ids": [ + 273 + ], + "source": "Name: when 
in doubt\nType: TASK_OR_PROBLEM", + "target": "Name: 1\nType: UNKNOWN" + }, + { + "src_entity_name": "json", + "tgt_entity_name": "output", + "relation_name": "", + "weight": 10.0, + "description": "the text specifies that the answer must be provided in a valid json format", + "source_ids": [ + 275 + ], + "source": "Name: json\nType: FILE_TYPE", + "target": "Name: output\nType: UNKNOWN" + }, + { + "src_entity_name": "select id", + "tgt_entity_name": "exact match", + "relation_name": "", + "weight": 8.0, + "description": "select id holds the value of the id if an exact match is found", + "source_ids": [ + 276 + ], + "source": "Name: select id\nType: PARAMETER_OR_VARIABLE", + "target": "Name: exact match\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "select id", + "tgt_entity_name": "1", + "relation_name": "", + "weight": 9.0, + "description": "select id is assigned the value 1 if no exact match is found", + "source_ids": [ + 276 + ], + "source": "Name: select id\nType: PARAMETER_OR_VARIABLE", + "target": "Name: 1\nType: MONEY" + }, + { + "src_entity_name": "select id", + "tgt_entity_name": "candidate", + "relation_name": "", + "weight": 9.0, + "description": "select id represents the id of the candidate being evaluated", + "source_ids": [ + 276 + ], + "source": "Name: select id\nType: PARAMETER_OR_VARIABLE", + "target": "Name: candidate\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "select id", + "tgt_entity_name": "integer", + "relation_name": "", + "weight": 10.0, + "description": "select id is defined as an integer type", + "source_ids": [ + 276 + ], + "source": "Name: select id\nType: PARAMETER_OR_VARIABLE", + "target": "Name: integer\nType: MEASUREMENT" + }, + { + "src_entity_name": "example 1", + "tgt_entity_name": "select id", + "relation_name": "", + "weight": 5.0, + "description": "example 1 is associated with the context of the provided json structure containing select id", + "source_ids": [ + 281 + ], + "source": "Name: select id\nType: 
PARAMETER_OR_VARIABLE", + "target": "Name: example 1\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "candidate", + "tgt_entity_name": "exact match", + "relation_name": "", + "weight": 8.0, + "description": "the candidate is the subject of the exact match determination", + "source_ids": [ + 276 + ], + "source": "Name: exact match\nType: TASK_OR_PROBLEM", + "target": "Name: candidate\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "selection task", + "tgt_entity_name": "integer", + "relation_name": "", + "weight": 9.0, + "description": "the selection task requires the output to be a single integer", + "source_ids": [ + 282 + ], + "source": "Name: integer\nType: MEASUREMENT", + "target": "Name: selection task\nType: TASK_OR_PROBLEM" + }, + { + "src_entity_name": "example 2", + "tgt_entity_name": "explanation", + "relation_name": "", + "weight": 5.0, + "description": "example 2 is associated with the context of the provided json structure containing explanation", + "source_ids": [ + 281 + ], + "source": "Name: explanation\nType: PARAMETER_OR_VARIABLE", + "target": "Name: example 2\nType: TASK_OR_PROBLEM" + } + ] + }, + "tree2kg": { + "1": [ + "Name: complex documents\nType: DATASET_OR_CORPUS", + "Name: retrieval-augmented generation\nType: TASK_OR_PROBLEM", + "Name: bookrag\nType: MODEL_OR_ARCHITECTURE", + "Name: hierarchical structure-aware index-based approach\nType: METHOD_OR_TECHNIQUE", + "Name: bookrag: a hierarchical structure-aware index-based approach for retrieval-augmented generation on complex documents\nType: SECTION_TITLE" + ], + "2": [ + "Name: efficiency\nType: EVALUATION_METRIC", + "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "Name: qa accuracy\nType: EVALUATION_METRIC", + "Name: baselines\nType: MODEL_OR_ARCHITECTURE", + "Name: large language models\nType: MODEL_OR_ARCHITECTURE", + "Name: handbooks\nType: BOOK", + "Name: retrievalaugmented generation\nType: METHOD_OR_TECHNIQUE", + "Name: shu wang\nType: PERSON", + "Name: bookindex\nType: 
SOFTWARE", + "Name: yingli zhou\nType: PERSON", + "Name: yixiang fang\nType: PERSON", + "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "Name: three widely adopted benchmarks\nType: BENCHMARK", + "Name: graph\nType: SOFTWARE", + "Name: booklets\nType: BOOK", + "Name: retrieval recall\nType: EVALUATION_METRIC", + "Name: books\nType: BOOK", + "Name: question answering\nType: TASK_OR_PROBLEM", + "Name: table of contents\nType: SOFTWARE", + "Name: the chinese university of hong kong shenzhen\nType: ORGANIZATION", + "Name: tree\nType: SOFTWARE", + "Name: industry\nType: ORGANIZATION", + "Name: academia\nType: ORGANIZATION" + ], + "3": [ + "Name: abstract\nType: SECTION_TITLE" + ], + "4": [ + "Name: reference format\nType: SECTION_TITLE", + "Name: pvldb\nType: PUBLICATION_VENUE" + ], + "5": [ + "Name: 19\nType: MEASUREMENT", + "Name: pvldb\nType: PUBLICATION_VENUE", + "Name: xx xx xxx xx\nType: MEASUREMENT", + "Name: bookrag\nType: PRODUCT", + "Name: yingli zhou\nType: PERSON", + "Name: xxx xxx\nType: MEASUREMENT", + "Name: retrieval augmented generation\nType: TECHNOLOGY", + "Name: yixiang fang\nType: PERSON", + "Name: 2025\nType: DATE", + "Name: hierarchical structure aware index based approach\nType: METHOD_OR_TECHNIQUE", + "Name: complex documents\nType: TASK_OR_PROBLEM", + "Name: 1\nType: MEASUREMENT", + "Name: shu wang\nType: PERSON" + ], + "6": [ + "Name: artifact availability\nType: TASK_OR_PROBLEM", + "Name: pvldb\nType: PUBLICATION_VENUE" + ], + "7": [ + "Name: github\nType: ORGANIZATION", + "Name: source code\nType: PRODUCT", + "Name: sam234990\nType: PERSON", + "Name: data\nType: PRODUCT", + "Name: artifacts\nType: PRODUCT", + "Name: bookrag\nType: SOFTWARE" + ], + "8": [ + "Name: 1 introduction\nType: SECTION_TITLE" + ], + "9": [ + "Name: qa system\nType: PRODUCT", + "Name: question answering\nType: TASK_OR_PROBLEM", + "Name: qwen 3\nType: PRODUCT", + "Name: gemini 2 5\nType: PRODUCT", + "Name: large language models\nType: TECHNOLOGY", + 
"Name: industry\nType: ORGANIZATION", + "Name: users\nType: PERSON" + ], + "10": [ + "Name: owner author s\nType: PERSON", + "Name: info vldb org\nType: EMAIL", + "Name: creative commons by nc nd 4 0 international license\nType: LAW", + "Name: creative commons\nType: ORGANIZATION", + "Name: vldb endowment\nType: ORGANIZATION" + ], + "11": [ + "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "Name: vol 19\nType: MEASUREMENT", + "Name: issn 2150 8097\nType: MEASUREMENT", + "Name: doi xx xx xxx xx\nType: MEASUREMENT", + "Name: no 1\nType: MEASUREMENT" + ], + "12": [ + "Name: figure 1\nType: IMAGE", + "Name: existing methods\nType: METHOD_OR_TECHNIQUE", + "Name: bookrag\nType: PRODUCT", + "Name: complex document qa\nType: TASK_OR_PROBLEM" + ], + "13": [ + "Name: text-only rag\nType: METHOD_OR_TECHNIQUE", + "Name: layout analysis & parsing\nType: METHOD_OR_TECHNIQUE", + "Name: cref='#/texts/14'\nType: IMAGE", + "Name: agent-based retrieval\nType: METHOD_OR_TECHNIQUE", + "Name: bookrag (natively structure-aware)\nType: METHOD_OR_TECHNIQUE", + "Name: bookindex\nType: SYSTEM_COMPONENT", + "Name: flattened chunks\nType: DATASET_OR_CORPUS", + "Name: accurate, structured-grounded\nType: EVALUATION_METRIC", + "Name: unstructured chunks\nType: DATASET_OR_CORPUS", + "Name: text index (vector/graph/tree)\nType: SYSTEM_COMPONENT", + "Name: complex query\nType: TASK_OR_PROBLEM", + "Name: plain text extraction (ocr)\nType: METHOD_OR_TECHNIQUE", + "Name: fixed/ graph retrieval\nType: METHOD_OR_TECHNIQUE", + "Name: fails on structural dependencies\nType: TASK_OR_PROBLEM", + "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "Name: complex multi-page document\nType: PRODUCT", + "Name: fixed retrieval\nType: METHOD_OR_TECHNIQUE", + "Name: loses complex relationships\nType: TASK_OR_PROBLEM", + "Name: hierarchical chunks\nType: DATASET_OR_CORPUS", + "Name: flattened vector index\nType: SYSTEM_COMPONENT", + "Name: layout segmented rag\nType: METHOD_OR_TECHNIQUE" + ], + "14": 
[ + "Name: retrieval augmented generation\nType: METHOD_OR_TECHNIQUE", + "Name: intricate layouts\nType: SHAPE", + "Name: long form documents\nType: PRODUCT", + "Name: api reference manuals\nType: PRODUCT", + "Name: technical handbooks\nType: PRODUCT", + "Name: external sources\nType: LOCATION", + "Name: tables of contents\nType: PRODUCT", + "Name: financial auditing\nType: TASK_OR_PROBLEM", + "Name: logical hierarchies\nType: CONCEPT", + "Name: response generation\nType: TASK_OR_PROBLEM", + "Name: rag system\nType: SOFTWARE", + "Name: rag\nType: METHOD_OR_TECHNIQUE", + "Name: qa\nType: TASK_OR_PROBLEM", + "Name: domain knowledge\nType: CONCEPT", + "Name: enterprise scenarios\nType: LOCATION", + "Name: multi level sections\nType: PRODUCT", + "Name: legal compliance\nType: TASK_OR_PROBLEM", + "Name: books\nType: PRODUCT", + "Name: scientific discovery\nType: TASK_OR_PROBLEM", + "Name: this paper\nType: BOOK", + "Name: operational guidebooks\nType: PRODUCT", + "Name: llms\nType: TECHNOLOGY", + "Name: nested chapters\nType: PRODUCT" + ], + "15": [ + "Name: rag\nType: TECHNOLOGY", + "Name: table 1\nType: TABLE", + "Name: document level qa\nType: TASK_OR_PROBLEM", + "Name: textual corpus\nType: DATASET_OR_CORPUS", + "Name: hierarchical clusters\nType: TASK_OR_PROBLEM", + "Name: summaries\nType: PRODUCT", + "Name: ocr\nType: TECHNOLOGY", + "Name: high level semantic information\nType: CONCEPT", + "Name: raptor\nType: PRODUCT", + "Name: graph based rag\nType: TECHNOLOGY", + "Name: text based rag method\nType: TECHNOLOGY", + "Name: leiden community detection algorithm\nType: METHOD_OR_TECHNIQUE", + "Name: recursive tree structure\nType: TASK_OR_PROBLEM", + "Name: document chunks\nType: DATASET_OR_CORPUS", + "Name: graph data\nType: DATASET_OR_CORPUS", + "Name: graphrag\nType: PRODUCT", + "Name: knowledge graph\nType: DATASET_OR_CORPUS", + "Name: plain text\nType: MATERIAL", + "Name: figure 1\nType: IMAGE", + "Name: fine grained semantic information\nType: CONCEPT" + ], + 
"16": [ + "Name: bookrag\nType: PRODUCT", + "Name: representative methods\nType: METHOD_OR_TECHNIQUE", + "Name: table 1\nType: TABLE" + ], + "17": [ + "Name: table: cref='#/texts/17'...\nType: TABLE", + "Name: texts reference\nType: SECTION_TITLE" + ], + "18": [ + "Name: relevant content\nType: CONCEPT", + "Name: multimodal retrieval\nType: METHOD_OR_TECHNIQUE", + "Name: fixed chunk size\nType: MEASUREMENT", + "Name: llm powered operations\nType: TASK_OR_PROBLEM", + "Name: figures\nType: TASK_OR_PROBLEM", + "Name: llm based processing pipelines\nType: TASK_OR_PROBLEM", + "Name: tables\nType: TASK_OR_PROBLEM", + "Name: declarative interface\nType: SOFTWARE", + "Name: processing pipelines\nType: TASK_OR_PROBLEM", + "Name: paragraphs\nType: TASK_OR_PROBLEM", + "Name: equations\nType: TASK_OR_PROBLEM", + "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "Name: first paradigm\nType: TASK_OR_PROBLEM", + "Name: task specific optimizations\nType: METHOD_OR_TECHNIQUE", + "Name: second paradigm\nType: TASK_OR_PROBLEM", + "Name: docetl\nType: SOFTWARE", + "Name: layout aware segmentation\nType: TASK_OR_PROBLEM", + "Name: queries\nType: TASK_OR_PROBLEM", + "Name: document native structural information\nType: CONCEPT" + ], + "19": [ + "Name: l2\nType: TASK_OR_PROBLEM", + "Name: hierarchical blocks\nType: CONCEPT", + "Name: text based approaches\nType: METHOD_OR_TECHNIQUE", + "Name: user queries\nType: TASK_OR_PROBLEM", + "Name: evidence\nType: CONCEPT", + "Name: tables\nType: TABLE", + "Name: simple queries\nType: UNKNOWN", + "Name: layout segmented methods\nType: METHOD_OR_TECHNIQUE", + "Name: section\nType: SECTION_TITLE", + "Name: l1\nType: TASK_OR_PROBLEM", + "Name: static or manually predefined workflows\nType: METHOD_OR_TECHNIQUE", + "Name: multi hop reasoning\nType: TASK_OR_PROBLEM", + "Name: complex queries\nType: UNKNOWN", + "Name: question decomposition\nType: METHOD_OR_TECHNIQUE", + "Name: multi hop questions\nType: TASK_OR_PROBLEM", + "Name: overall performance\nType: 
EVALUATION_METRIC", + "Name: keyword lookups\nType: TASK_OR_PROBLEM", + "Name: real world qa scenarios\nType: EVENT", + "Name: document\nType: PRODUCT" + ], + "20": [ + "Name: document qa tasks\nType: TASK_OR_PROBLEM", + "Name: table of contents\nType: PRODUCT", + "Name: parsed content blocks\nType: MATERIAL", + "Name: fine grained entities\nType: DATASET_OR_CORPUS", + "Name: tree nodes\nType: PRODUCT", + "Name: hierarchical tree structure\nType: METHOD_OR_TECHNIQUE", + "Name: kg\nType: TECHNOLOGY", + "Name: relation\nType: CONCEPT", + "Name: bookindex\nType: PRODUCT", + "Name: bookrag\nType: TECHNOLOGY" + ], + "21": [ + "Name: similarity distribution\nType: CONCEPT", + "Name: graph connectivity\nType: CONCEPT", + "Name: entity ambiguity\nType: TASK_OR_PROBLEM", + "Name: candidate entities\nType: CONCEPT", + "Name: large language model\nType: PRODUCT", + "Name: llm\nType: PRODUCT", + "Name: multi hop reasoning\nType: TASK_OR_PROBLEM", + "Name: gradient based entity resolution method\nType: METHOD_OR_TECHNIQUE", + "Name: coreferent entities\nType: CONCEPT", + "Name: kg\nType: CONCEPT", + "Name: reasoning capabilities\nType: CONCEPT" + ], + "22": [ + "Name: selector\nType: SOFTWARE", + "Name: retrieval workflows\nType: TASK_OR_PROBLEM", + "Name: information scents\nType: CONCEPT", + "Name: reasoner\nType: SOFTWARE", + "Name: user queries\nType: TASK_OR_PROBLEM", + "Name: evidence\nType: CONCEPT", + "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "Name: search space\nType: TASK_OR_PROBLEM", + "Name: bookindex\nType: PRODUCT", + "Name: agent\nType: UNKNOWN" + ], + "23": [ + "Name: agent based retrieval mechanism\nType: METHOD_OR_TECHNIQUE", + "Name: retrieval recall\nType: EVALUATION_METRIC", + "Name: bookrag\nType: PRODUCT", + "Name: qa accuracy\nType: EVALUATION_METRIC", + "Name: kg\nType: PRODUCT", + "Name: three widely adopted datasets\nType: DATASET_OR_CORPUS", + "Name: state of the art baselines\nType: PRODUCT" + ], + "24": [ + "Name: our 
contributions\nType: TASK_OR_PROBLEM" + ], + "25": [ + "Name: kg\nType: SOFTWARE", + "Name: entity relations\nType: CONCEPT", + "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "Name: hierarchical tree\nType: MODEL_OR_ARCHITECTURE", + "Name: document layout blocks\nType: MATERIAL", + "Name: bookindex\nType: PRODUCT" + ], + "26": [ + "Name: evidence\nType: TASK_OR_PROBLEM", + "Name: retrieval workflows\nType: TASK_OR_PROBLEM", + "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "Name: queries\nType: TASK_OR_PROBLEM", + "Name: agent based retrieval\nType: TASK_OR_PROBLEM", + "Name: documents\nType: DATASET_OR_CORPUS" + ], + "27": [ + "Name: complex document qa tasks\nType: TASK_OR_PROBLEM", + "Name: state of the art performance\nType: EVALUATION_METRIC", + "Name: bookrag\nType: PRODUCT", + "Name: existing baselines\nType: PRODUCT", + "Name: multiple benchmarks\nType: BENCHMARK", + "Name: extensive experiments\nType: EVENT", + "Name: competitive efficiency\nType: EVALUATION_METRIC" + ], + "28": [ + "Name: 2\nType: NUMBER" + ], + "29": [ + "Name: experimental results\nType: UNKNOWN", + "Name: section 2\nType: SECTION_TITLE", + "Name: ift\nType: METHOD_OR_TECHNIQUE", + "Name: section 4\nType: SECTION_TITLE", + "Name: section 6\nType: SECTION_TITLE", + "Name: structured execution\nType: TASK_OR_PROBLEM", + "Name: bookrag\nType: PRODUCT", + "Name: conclusion\nType: UNKNOWN", + "Name: section 3\nType: SECTION_TITLE", + "Name: rag\nType: METHOD_OR_TECHNIQUE", + "Name: section 7\nType: SECTION_TITLE", + "Name: query classification\nType: METHOD_OR_TECHNIQUE", + "Name: related work\nType: UNKNOWN", + "Name: bookindex\nType: PRODUCT", + "Name: section 5\nType: SECTION_TITLE", + "Name: operators\nType: METHOD_OR_TECHNIQUE" + ], + "30": [ + "Name: hierarchical document structures\nType: TASK_OR_PROBLEM", + "Name: 2 related work\nType: SECTION_TITLE", + "Name: retrieval-augmented generation\nType: METHOD_OR_TECHNIQUE" + ], + "31": [ + "Name: related works\nType: 
SECTION_TITLE", + "Name: llm\nType: TECHNOLOGY", + "Name: rag approaches\nType: TECHNOLOGY", + "Name: document analysis\nType: RESEARCH_FIELD" + ], + "32": [ + "Name: join\nType: TECHNOLOGY", + "Name: web documents\nType: PRODUCT", + "Name: agentic framework\nType: METHOD_OR_TECHNIQUE", + "Name: lotus\nType: SOFTWARE", + "Name: filter\nType: TECHNOLOGY", + "Name: information extraction\nType: TASK_OR_PROBLEM", + "Name: semi structured web documents\nType: PRODUCT", + "Name: layout\nType: CONCEPT", + "Name: raw text\nType: FILE_TYPE", + "Name: llm\nType: TECHNOLOGY", + "Name: semantic operators\nType: TECHNOLOGY", + "Name: document pages\nType: IMAGE", + "Name: visual information\nType: CONCEPT", + "Name: html\nType: FILE_TYPE", + "Name: sql\nType: PROGRAMMING_LANGUAGE", + "Name: evaporate\nType: SOFTWARE", + "Name: relational tables\nType: PRODUCT", + "Name: docetl\nType: SOFTWARE", + "Name: pdf\nType: FILE_TYPE", + "Name: structured databases\nType: PRODUCT", + "Name: predicates\nType: TECHNOLOGY", + "Name: manual annotation\nType: TASK_OR_PROBLEM", + "Name: unstructured text corpora\nType: UNKNOWN" + ], + "33": [ + "Name: generation fidelity\nType: EVALUATION_METRIC", + "Name: sql rewrite\nType: TASK_OR_PROBLEM", + "Name: autonomous agents\nType: TECHNOLOGY", + "Name: open ended question answering\nType: TASK_OR_PROBLEM", + "Name: hallucination\nType: TASK_OR_PROBLEM", + "Name: recent survey of graph based rag methods\nType: PUBLICATION_VENUE", + "Name: graph structures\nType: TECHNOLOGY", + "Name: external knowledge bases\nType: TECHNOLOGY", + "Name: rag approaches\nType: METHOD_OR_TECHNIQUE", + "Name: agentic rag paradigm\nType: METHOD_OR_TECHNIQUE", + "Name: reasoning robustness\nType: EVALUATION_METRIC", + "Name: data cleaning\nType: TASK_OR_PROBLEM", + "Name: rag\nType: METHOD_OR_TECHNIQUE", + "Name: rag pipeline\nType: TASK_OR_PROBLEM", + "Name: overall retrieval performance\nType: EVALUATION_METRIC", + "Name: documents\nType: UNKNOWN", + "Name: programming 
context\nType: TASK_OR_PROBLEM", + "Name: naive rag technique\nType: METHOD_OR_TECHNIQUE", + "Name: llms\nType: TECHNOLOGY" + ], + "34": [ + "Name: 3 preliminaries\nType: SECTION_TITLE" + ], + "35": [ + "Name: complex document qa\nType: TASK_OR_PROBLEM", + "Name: ift\nType: SCIENTIFIC_THEORY", + "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "Name: rag systems\nType: TECHNOLOGY", + "Name: research problem\nType: TASK_OR_PROBLEM", + "Name: general workflow\nType: TASK_OR_PROBLEM", + "Name: section\nType: SECTION_TITLE" + ], + "36": [ + "Name: 3.1 problem formulation\nType: SECTION_TITLE", + "Name: complex document qa\nType: TASK_OR_PROBLEM" + ], + "37": [ + "Name: logical chapter hierarchy\nType: TASK_OR_PROBLEM", + "Name: n\nType: PARAMETER_OR_VARIABLE", + "Name: d\nType: PARAMETER_OR_VARIABLE", + "Name: a\nType: PARAMETER_OR_VARIABLE", + "Name: m\nType: PARAMETER_OR_VARIABLE", + "Name: method s\nType: METHOD_OR_TECHNIQUE", + "Name: content blocks\nType: DATASET_OR_CORPUS", + "Name: equation 1\nType: EQUATION_OR_FORMULA", + "Name: text segment\nType: DATASET_OR_CORPUS", + "Name: user query\nType: TASK_OR_PROBLEM", + "Name: references 5 11 33\nType: PUBLICATION_VENUE", + "Name: q\nType: PARAMETER_OR_VARIABLE", + "Name: section header\nType: DATASET_OR_CORPUS", + "Name: question answering\nType: TASK_OR_PROBLEM", + "Name: evidence blocks\nType: DATASET_OR_CORPUS", + "Name: table\nType: DATASET_OR_CORPUS", + "Name: pages\nType: MEASUREMENT", + "Name: b\nType: PARAMETER_OR_VARIABLE", + "Name: e\nType: PARAMETER_OR_VARIABLE", + "Name: document\nType: PRODUCT", + "Name: image\nType: DATASET_OR_CORPUS", + "Name: p\nType: PARAMETER_OR_VARIABLE", + "Name: answer\nType: TASK_OR_PROBLEM" + ], + "38": [ + "Name: 3\nType: MEASUREMENT" + ], + "39": [ + "Name: formula (1)\nType: EQUATION_OR_FORMULA" + ], + "40": [ + "Name: s\nType: PERSON", + "Name: d\nType: TASK_OR_PROBLEM" + ], + "41": [ + "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "Name: 
3.2 information foraging theory\nType: SECTION_TITLE" + ], + "42": [ + "Name: information scent\nType: CONCEPT", + "Name: keywords\nType: CONCEPT", + "Name: sections\nType: CONCEPT", + "Name: animal foraging\nType: TASK_OR_PROBLEM", + "Name: icons\nType: CONCEPT", + "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "Name: handbooks\nType: PRODUCT", + "Name: reference 42\nType: PUBLICATION_VENUE", + "Name: information patches\nType: CONCEPT" + ], + "43": [ + "Name: information scent\nType: CONCEPT", + "Name: key terms\nType: CONCEPT", + "Name: large technical handbook\nType: BOOK", + "Name: experts\nType: PERSON", + "Name: precise knowledge\nType: CONCEPT", + "Name: problem\nType: TASK_OR_PROBLEM", + "Name: final answer\nType: CONCEPT", + "Name: information patches\nType: CONCEPT", + "Name: diverse content\nType: CONCEPT" + ], + "44": [ + "Name: 3.3 rag workflow\nType: SECTION_TITLE", + "Name: rag systems\nType: TECHNOLOGY" + ], + "45": [ + "Name: llm\nType: SOFTWARE", + "Name: user query\nType: TASK_OR_PROBLEM", + "Name: unstructured corpus data\nType: DATASET_OR_CORPUS", + "Name: kg\nType: SOFTWARE", + "Name: vector databases\nType: SOFTWARE", + "Name: retrieval augmented generation\nType: TASK_OR_PROBLEM", + "Name: offline indexing phase\nType: TASK_OR_PROBLEM", + "Name: online retrieval phase\nType: TASK_OR_PROBLEM", + "Name: document\nType: TASK_OR_PROBLEM", + "Name: document s native tree topology\nType: TASK_OR_PROBLEM", + "Name: text chunks\nType: DATASET_OR_CORPUS", + "Name: subgraphs\nType: DATASET_OR_CORPUS" + ], + "46": [ + "Name: bookindex\nType: MODEL_OR_ARCHITECTURE", + "Name: hierarchical tree\nType: METHOD_OR_TECHNIQUE", + "Name: graph\nType: TECHNOLOGY", + "Name: 4 bookindex\nType: SECTION_TITLE" + ], + "47": [ + "Name: document\nType: PRODUCT", + "Name: entity relations\nType: CONCEPT", + "Name: fine grained entity knowledge\nType: CONCEPT", + "Name: tree construction\nType: METHOD_OR_TECHNIQUE", + "Name: hierarchical nodes\nType: 
CONCEPT", + "Name: gradient based entity resolution method\nType: METHOD_OR_TECHNIQUE", + "Name: graph construction\nType: METHOD_OR_TECHNIQUE", + "Name: bookindex\nType: PRODUCT", + "Name: logical hierarchy\nType: CONCEPT" + ], + "48": [ + "Name: kg construction\nType: TASK_OR_PROBLEM", + "Name: gradient based entity resolution\nType: METHOD_OR_TECHNIQUE", + "Name: tree construction\nType: TASK_OR_PROBLEM", + "Name: layout parsing\nType: METHOD_OR_TECHNIQUE", + "Name: figure 2\nType: IMAGE", + "Name: graph construction\nType: TASK_OR_PROBLEM", + "Name: bookindex construction process\nType: TASK_OR_PROBLEM", + "Name: section filtering\nType: METHOD_OR_TECHNIQUE" + ], + "49": [ + "Name: tree construction\nType: TASK_OR_PROBLEM", + "Name: layout parsing\nType: METHOD_OR_TECHNIQUE", + "Name: level: 2 type: section\nType: PARAMETER_OR_VARIABLE", + "Name: bookindex construction\nType: IMAGE", + "Name: tree node\nType: HARDWARE", + "Name: title: moe layer\nType: SECTION_TITLE", + "Name: relation\nType: DATASET_OR_CORPUS", + "Name: section filtering\nType: METHOD_OR_TECHNIQUE", + "Name: merge\nType: TASK_OR_PROBLEM", + "Name: title: experiment\nType: SECTION_TITLE", + "Name: kg construction\nType: METHOD_OR_TECHNIQUE", + "Name: level: none type: text\nType: PARAMETER_OR_VARIABLE", + "Name: similarity\nType: EVALUATION_METRIC", + "Name: graph construction\nType: TASK_OR_PROBLEM", + "Name: bookindex\nType: PRODUCT", + "Name: entity\nType: DATASET_OR_CORPUS", + "Name: gradient-based entity resolution\nType: METHOD_OR_TECHNIQUE", + "Name: gt-link\nType: SOFTWARE", + "Name: image cref='#/texts/52'\nType: UNKNOWN", + "Name: title: method\nType: SECTION_TITLE" + ], + "50": [ + "Name: bookindex\nType: MODEL_OR_ARCHITECTURE", + "Name: graph construction\nType: METHOD_OR_TECHNIQUE", + "Name: 4.1 overview of bookindex\nType: SECTION_TITLE", + "Name: tree construction\nType: METHOD_OR_TECHNIQUE" + ], + "51": [ + "Name: information scent\nType: CONCEPT", + "Name: \nType: UNKNOWN", + 
"Name: knowledge graph\nType: SOFTWARE", + "Name: document\nType: PRODUCT", + "Name: m v\nType: PARAMETER_OR_VARIABLE", + "Name: graph tree link\nType: METHOD_OR_TECHNIQUE", + "Name: n\nType: PARAMETER_OR_VARIABLE", + "Name: e t\nType: PARAMETER_OR_VARIABLE", + "Name: p\nType: PARAMETER_OR_VARIABLE", + "Name: titles\nType: SECTION_TITLE", + "Name: tables\nType: TABLE", + "Name: e g\nType: PARAMETER_OR_VARIABLE", + "Name: navigation\nType: UNKNOWN", + "Name: v\nType: PARAMETER_OR_VARIABLE", + "Name: bookindex\nType: PRODUCT", + "Name: information patches\nType: CONCEPT", + "Name: tree structure\nType: TASK_OR_PROBLEM", + "Name: sections\nType: SECTION_TITLE" + ], + "52": [ + "Name: graph component\nType: SOFTWARE", + "Name: tree component\nType: SOFTWARE", + "Name: content blocks\nType: PRODUCT", + "Name: document\nType: PRODUCT", + "Name: leaf nodes\nType: PRODUCT", + "Name: gt link\nType: TECHNOLOGY", + "Name: tables\nType: PRODUCT", + "Name: images\nType: PRODUCT", + "Name: text\nType: PRODUCT", + "Name: figure 2\nType: IMAGE", + "Name: section nodes\nType: PRODUCT", + "Name: bookindex\nType: PRODUCT", + "Name: logical hierarchy\nType: CONCEPT", + "Name: semantic entities\nType: CONCEPT" + ], + "53": [ + "Name: 4.2 tree construction\nType: SECTION_TITLE", + "Name: tree construction\nType: METHOD_OR_TECHNIQUE" + ], + "54": [ + "Name: task or problem\nType: UNKNOWN", + "Name: t\nType: TASK_OR_PROBLEM", + "Name: robust layout parsing\nType: METHOD_OR_TECHNIQUE", + "Name: intelligent section filtering\nType: METHOD_OR_TECHNIQUE", + "Name: raw document\nType: PRODUCT" + ], + "55": [ + "Name: document d\nType: TASK_OR_PROBLEM", + "Name: 4.2.1 layout parsing\nType: SECTION_TITLE", + "Name: layout analysis\nType: METHOD_OR_TECHNIQUE", + "Name: content blocks\nType: DATASET_OR_CORPUS", + "Name: recognition models\nType: MODEL_OR_ARCHITECTURE" + ], + "56": [ + "Name: primitive\nType: CONCEPT", + "Name: the output\nType: TASK_OR_PROBLEM" + ], + "57": [ + "Name: title\nType: 
SECTION_TITLE", + "Name: text\nType: SECTION_TITLE", + "Name: b title\nType: DATASET_OR_CORPUS", + "Name: \nType: UNKNOWN", + "Name: layout parsing\nType: METHOD_OR_TECHNIQUE", + "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "Name: 4 2 2\nType: SECTION_TITLE", + "Name: table\nType: TABLE", + "Name: none\nType: SECTION_TITLE", + "Name: l\nType: PARAMETER_OR_VARIABLE", + "Name: image\nType: IMAGE", + "Name: f\nType: PARAMETER_OR_VARIABLE", + "Name: b\nType: DATASET_OR_CORPUS", + "Name: section filtering\nType: TASK_OR_PROBLEM", + "Name: c\nType: PARAMETER_OR_VARIABLE", + "Name: 1\nType: MEASUREMENT" + ], + "58": [ + "Name: parent child nesting relationships\nType: TASK_OR_PROBLEM", + "Name: re classification\nType: METHOD_OR_TECHNIQUE", + "Name: document order\nType: PARAMETER_OR_VARIABLE", + "Name: final node type\nType: PARAMETER_OR_VARIABLE", + "Name: content\nType: PARAMETER_OR_VARIABLE", + "Name: edge set\nType: TASK_OR_PROBLEM", + "Name: hierarchical levels\nType: PARAMETER_OR_VARIABLE", + "Name: text\nType: PRODUCT", + "Name: section\nType: PRODUCT", + "Name: table\nType: PRODUCT", + "Name: filtering\nType: METHOD_OR_TECHNIQUE", + "Name: node set\nType: TASK_OR_PROBLEM", + "Name: tree\nType: TASK_OR_PROBLEM", + "Name: image\nType: PRODUCT", + "Name: node\nType: UNKNOWN" + ], + "59": [ + "Name: section filtering phase\nType: TASK_OR_PROBLEM", + "Name: title text table\nType: PRODUCT", + "Name: 14\nType: MEASUREMENT", + "Name: level\nType: PARAMETER_OR_VARIABLE", + "Name: layout parsing phase\nType: TASK_OR_PROBLEM", + "Name: 20\nType: MEASUREMENT", + "Name: final tree structure\nType: TASK_OR_PROBLEM", + "Name: text node\nType: SECTION_TITLE", + "Name: document order\nType: PARAMETER_OR_VARIABLE", + "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "Name: fontsize\nType: PARAMETER_OR_VARIABLE", + "Name: none\nType: MEASUREMENT", + "Name: image\nType: IMAGE", + "Name: figure 2\nType: IMAGE", + "Name: section nodes\nType: SECTION_TITLE", + "Name: moe layer\nType: 
SECTION_TITLE", + "Name: 2\nType: MEASUREMENT", + "Name: method\nType: SECTION_TITLE", + "Name: experiment\nType: SECTION_TITLE" + ], + "60": [ + "Name: 4\nType: MEASUREMENT" + ], + "61": [ + "Name: 4.3 graph construction\nType: SECTION_TITLE", + "Name: graph construction\nType: METHOD_OR_TECHNIQUE", + "Name: gradient-based entity resolution\nType: METHOD_OR_TECHNIQUE" + ], + "62": [ + "Name: tree nodes\nType: TASK_OR_PROBLEM", + "Name: knowledge graph g\nType: TASK_OR_PROBLEM", + "Name: tree t\nType: TASK_OR_PROBLEM" + ], + "63": [ + "Name: llm\nType: SOFTWARE", + "Name: 4.3.1 kg construction\nType: SECTION_TITLE", + "Name: knowledge graph\nType: DATASET_OR_CORPUS", + "Name: tree t\nType: MODEL_OR_ARCHITECTURE", + "Name: mapping m\nType: EQUATION_OR_FORMULA", + "Name: image\nType: IMAGE", + "Name: vision language model\nType: SOFTWARE" + ], + "64": [ + "Name: v table\nType: PRODUCT", + "Name: logical types\nType: CONCEPT", + "Name: structural semantics\nType: CONCEPT", + "Name: formula\nType: PRODUCT", + "Name: header\nType: PRODUCT", + "Name: vertex\nType: CONCEPT", + "Name: row\nType: PRODUCT", + "Name: table\nType: PRODUCT", + "Name: column\nType: PRODUCT", + "Name: node\nType: CONCEPT", + "Name: containedin\nType: RELATIONSHIP_TYPE" + ], + "65": [ + "Name: 4.3.2 gradient-based entity resolution\nType: SECTION_TITLE", + "Name: entity resolution\nType: TASK_OR_PROBLEM", + "Name: gradient-based entity resolution\nType: METHOD_OR_TECHNIQUE" + ], + "66": [ + "Name: o n 2\nType: MEASUREMENT", + "Name: a c\nType: TASK_OR_PROBLEM", + "Name: b c\nType: TASK_OR_PROBLEM", + "Name: er methods\nType: TASK_OR_PROBLEM", + "Name: b\nType: TASK_OR_PROBLEM", + "Name: c\nType: TASK_OR_PROBLEM", + "Name: a b\nType: TASK_OR_PROBLEM", + "Name: a\nType: TASK_OR_PROBLEM", + "Name: 12\nType: PUBLICATION_VENUE", + "Name: dirty er\nType: TASK_OR_PROBLEM", + "Name: llms\nType: TECHNOLOGY" + ], + "67": [ + "Name: database\nType: SOFTWARE", + "Name: repeated lookup task\nType: 
TASK_OR_PROBLEM", + "Name: scoring patterns\nType: EVALUATION_METRIC", + "Name: entity\nType: PARAMETER_OR_VARIABLE", + "Name: clean er\nType: TASK_OR_PROBLEM", + "Name: incremental process\nType: UNKNOWN", + "Name: top k most relevant candidates\nType: EVALUATION_METRIC", + "Name: quadratic batch problem\nType: TASK_OR_PROBLEM", + "Name: gradient based er method\nType: TECHNOLOGY" + ], + "68": [ + "Name: 5\nType: MEASUREMENT" + ], + "69": [ + "Name: gradient based entity resolution\nType: METHOD_OR_TECHNIQUE", + "Name: algorithm 1\nType: TASK_OR_PROBLEM" + ], + "70": [ + "Name: v\nType: TASK_OR_PROBLEM", + "Name: r\nType: MODEL_OR_ARCHITECTURE", + "Name: g\nType: TASK_OR_PROBLEM", + "Name: top k\nType: PARAMETER_OR_VARIABLE", + "Name: db\nType: DATASET_OR_CORPUS", + "Name: threshold of gradient g\nType: PARAMETER_OR_VARIABLE", + "Name: g\nType: PARAMETER_OR_VARIABLE", + "Name: n\nType: TASK_OR_PROBLEM", + "Name: entity vector database db\nType: DATASET_OR_CORPUS", + "Name: vector search number top k\nType: PARAMETER_OR_VARIABLE", + "Name: kg g\nType: TASK_OR_PROBLEM", + "Name: rerank model r\nType: MODEL_OR_ARCHITECTURE", + "Name: kg\nType: TASK_OR_PROBLEM", + "Name: new entity v n\nType: TASK_OR_PROBLEM" + ], + "71": [ + "Name: score\nType: UNKNOWN", + "Name: r\nType: UNKNOWN", + "Name: e c\nType: UNKNOWN", + "Name: s 0\nType: UNKNOWN", + "Name: s\nType: UNKNOWN", + "Name: v cn\nType: UNKNOWN", + "Name: sort\nType: UNKNOWN", + "Name: gradient select\nType: UNKNOWN", + "Name: c\nType: UNKNOWN", + "Name: sel\nType: UNKNOWN", + "Name: search\nType: UNKNOWN", + "Name: db\nType: UNKNOWN", + "Name: v n\nType: UNKNOWN", + "Name: e\nType: UNKNOWN", + "Name: top k\nType: UNKNOWN", + "Name: vector search\nType: UNKNOWN" + ], + "72": [ + "Name: existing entities\nType: TASK_OR_PROBLEM", + "Name: new entity\nType: TASK_OR_PROBLEM", + "Name: discriminative pattern\nType: PARAMETER_OR_VARIABLE", + "Name: case a\nType: TASK_OR_PROBLEM", + "Name: gradient\nType: 
PARAMETER_OR_VARIABLE", + "Name: relevance scores\nType: EVALUATION_METRIC" + ], + "73": [ + "Name: irrelevant entities\nType: TASK_OR_PROBLEM", + "Name: gradient\nType: MEASUREMENT", + "Name: \nType: UNKNOWN", + "Name: alias\nType: CONCEPT", + "Name: reranker\nType: TECHNOLOGY", + "Name: existing entity\nType: TASK_OR_PROBLEM", + "Name: equivalent aliases\nType: CONCEPT", + "Name: true match\nType: CONCEPT", + "Name: case b\nType: TASK_OR_PROBLEM", + "Name: scores\nType: EVALUATION_METRIC" + ], + "74": [ + "Name: similar entities\nType: DATASET_OR_CORPUS", + "Name: high relevance set\nType: DATASET_OR_CORPUS", + "Name: case a\nType: TASK_OR_PROBLEM", + "Name: case b\nType: TASK_OR_PROBLEM", + "Name: gradient based er algorithm\nType: TECHNOLOGY", + "Name: llm\nType: TECHNOLOGY" + ], + "75": [ + "Name: lines 7 8\nType: SECTION_TITLE", + "Name: v sel\nType: TASK_OR_PROBLEM", + "Name: line 8\nType: SECTION_TITLE", + "Name: g\nType: PARAMETER_OR_VARIABLE", + "Name: lines 12 14\nType: SECTION_TITLE", + "Name: case b\nType: TASK_OR_PROBLEM", + "Name: sel\nType: TASK_OR_PROBLEM", + "Name: lines 9 14\nType: SECTION_TITLE", + "Name: score\nType: PARAMETER_OR_VARIABLE", + "Name: v c\nType: PARAMETER_OR_VARIABLE", + "Name: llm\nType: SOFTWARE", + "Name: g\nType: TASK_OR_PROBLEM", + "Name: case a\nType: TASK_OR_PROBLEM", + "Name: line 13\nType: SECTION_TITLE", + "Name: line 9 10\nType: SECTION_TITLE", + "Name: v n\nType: TASK_OR_PROBLEM", + "Name: lines 5 8\nType: SECTION_TITLE", + "Name: line 4\nType: SECTION_TITLE", + "Name: s\nType: TASK_OR_PROBLEM", + "Name: line 15\nType: SECTION_TITLE", + "Name: lines 1 3\nType: SECTION_TITLE", + "Name: db\nType: TASK_OR_PROBLEM", + "Name: e c\nType: TASK_OR_PROBLEM", + "Name: algorithm 1\nType: TASK_OR_PROBLEM", + "Name: r\nType: TASK_OR_PROBLEM" + ], + "76": [ + "Name: orange line\nType: IMAGE", + "Name: e 5\nType: TASK_OR_PROBLEM", + "Name: similarity curve\nType: IMAGE", + "Name: gradient based selection process\nType: 
METHOD_OR_TECHNIQUE", + "Name: e 6\nType: TASK_OR_PROBLEM", + "Name: e 7\nType: TASK_OR_PROBLEM", + "Name: figure 2\nType: IMAGE", + "Name: consolidated information\nType: CONCEPT", + "Name: unique high confidence match\nType: CONCEPT", + "Name: e 9\nType: TASK_OR_PROBLEM", + "Name: kg\nType: TASK_OR_PROBLEM", + "Name: e 8\nType: TASK_OR_PROBLEM" + ], + "77": [ + "Name: origin tree node\nType: HARDWARE", + "Name: entity resolution\nType: TASK_OR_PROBLEM", + "Name: canonical entity\nType: CONCEPT", + "Name: gt link\nType: TECHNOLOGY", + "Name: kg construction phase\nType: TASK_OR_PROBLEM", + "Name: g\nType: CONCEPT", + "Name: t\nType: CONCEPT", + "Name: n\nType: PARAMETER_OR_VARIABLE", + "Name: p n\nType: MATHEMATICAL_CONCEPT", + "Name: v n\nType: PARAMETER_OR_VARIABLE", + "Name: mapping m\nType: EQUATION_OR_FORMULA", + "Name: v sel\nType: PARAMETER_OR_VARIABLE", + "Name: v\nType: PARAMETER_OR_VARIABLE", + "Name: bookindex\nType: PRODUCT", + "Name: m\nType: UNKNOWN", + "Name: v i\nType: PARAMETER_OR_VARIABLE" + ], + "78": [ + "Name: 5 agent-based retrieval\nType: SECTION_TITLE", + "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "Name: agent-based query method\nType: METHOD_OR_TECHNIQUE" + ], + "79": [ + "Name: ift\nType: METHOD_OR_TECHNIQUE", + "Name: agent based planning\nType: METHOD_OR_TECHNIQUE", + "Name: modal type filtering\nType: METHOD_OR_TECHNIQUE", + "Name: bookindex\nType: DATABASE", + "Name: multi hop reasoning\nType: METHOD_OR_TECHNIQUE", + "Name: generation\nType: METHOD_OR_TECHNIQUE", + "Name: semantic selection\nType: METHOD_OR_TECHNIQUE", + "Name: structured execution\nType: METHOD_OR_TECHNIQUE", + "Name: real world document queries\nType: UNKNOWN", + "Name: bookrag\nType: SOFTWARE" + ], + "80": [ + "Name: 5.1 overall workflow\nType: SECTION_TITLE" + ], + "81": [ + "Name: three stage pipeline\nType: METHOD_OR_TECHNIQUE", + "Name: agent based retrieval\nType: TASK_OR_PROBLEM", + "Name: figure 3\nType: IMAGE" + ], + "82": [ + "Name: 
generation\nType: TASK_OR_PROBLEM", + "Name: rnns\nType: MODEL_OR_ARCHITECTURE", + "Name: bookindex\nType: DATASET_OR_CORPUS", + "Name: agent based planning\nType: TASK_OR_PROBLEM", + "Name: transformer\nType: MODEL_OR_ARCHITECTURE", + "Name: query classification\nType: METHOD_OR_TECHNIQUE", + "Name: operators plan\nType: TASK_OR_PROBLEM", + "Name: classification plan\nType: METHOD_OR_TECHNIQUE", + "Name: retrieval\nType: TASK_OR_PROBLEM", + "Name: bookrag\nType: SOFTWARE" + ], + "83": [ + "Name: generation processes\nType: TASK_OR_PROBLEM", + "Name: planning\nType: TASK_OR_PROBLEM", + "Name: generation\nType: TASK_OR_PROBLEM", + "Name: figure 3\nType: IMAGE", + "Name: agent based planning\nType: TASK_OR_PROBLEM", + "Name: workflow\nType: TASK_OR_PROBLEM", + "Name: agent based retrieval\nType: METHOD_OR_TECHNIQUE", + "Name: retrieval\nType: TASK_OR_PROBLEM", + "Name: bookrag\nType: SOFTWARE" + ], + "84": [ + "Name: cref='#/texts/89'\nType: IMAGE", + "Name: generation process\nType: METHOD_OR_TECHNIQUE", + "Name: retrieval process\nType: METHOD_OR_TECHNIQUE", + "Name: agent-based planning\nType: METHOD_OR_TECHNIQUE", + "Name: question\nType: TASK_OR_PROBLEM", + "Name: answer\nType: TASK_OR_PROBLEM" + ], + "85": [ + "Name: modal type\nType: PARAMETER_OR_VARIABLE", + "Name: information blocks\nType: DATASET_OR_CORPUS", + "Name: bookindex\nType: DATASET_OR_CORPUS", + "Name: operator plan\nType: TASK_OR_PROBLEM", + "Name: scent filter based retrieval\nType: METHOD_OR_TECHNIQUE", + "Name: m\nType: PARAMETER_OR_VARIABLE", + "Name: g\nType: PARAMETER_OR_VARIABLE", + "Name: relevant entities\nType: DATASET_OR_CORPUS", + "Name: t\nType: PARAMETER_OR_VARIABLE", + "Name: retrieval process\nType: TASK_OR_PROBLEM", + "Name: bookrag\nType: SOFTWARE" + ], + "86": [ + "Name: fragmented pieces of evidence\nType: DATASET_OR_CORPUS", + "Name: analysis merging\nType: TASK_OR_PROBLEM", + "Name: retrieved information\nType: DATASET_OR_CORPUS", + "Name: coherent response\nType: PRODUCT", 
+ "Name: generation process\nType: TASK_OR_PROBLEM" + ], + "87": [ + "Name: 5.2 agent-based planning\nType: SECTION_TITLE", + "Name: agent-based planning\nType: METHOD_OR_TECHNIQUE" + ], + "88": [ + "Name: execution pipelines\nType: TASK_OR_PROBLEM", + "Name: bookindex\nType: DATASET_OR_CORPUS", + "Name: selector\nType: MODEL_OR_ARCHITECTURE", + "Name: bookrag\nType: PRODUCT", + "Name: formulator\nType: MODEL_OR_ARCHITECTURE", + "Name: synthesizer\nType: MODEL_OR_ARCHITECTURE", + "Name: agent\nType: TASK_OR_PROBLEM", + "Name: reasoner\nType: MODEL_OR_ARCHITECTURE", + "Name: g\nType: PARAMETER_OR_VARIABLE", + "Name: m\nType: PARAMETER_OR_VARIABLE", + "Name: query categories\nType: TASK_OR_PROBLEM", + "Name: adjustable parameters\nType: PARAMETER_OR_VARIABLE", + "Name: t\nType: PARAMETER_OR_VARIABLE" + ], + "89": [ + "Name: table 2\nType: TABLE", + "Name: bookrag\nType: PRODUCT" + ], + "90": [ + "Name: table: cref='#/texts/95'...\nType: TABLE" + ], + "91": [ + "Name: 6\nType: MEASUREMENT" + ], + "92": [ + "Name: operator set\nType: TASK_OR_PROBLEM" + ], + "93": [ + "Name: step by step operator execution\nType: METHOD_OR_TECHNIQUE", + "Name: execution trace\nType: TASK_OR_PROBLEM", + "Name: agent based planning\nType: METHOD_OR_TECHNIQUE", + "Name: selector\nType: MODEL_OR_ARCHITECTURE", + "Name: formulator\nType: MODEL_OR_ARCHITECTURE", + "Name: synthesizer\nType: MODEL_OR_ARCHITECTURE", + "Name: figure 4\nType: IMAGE", + "Name: reasoner\nType: MODEL_OR_ARCHITECTURE", + "Name: mmlongbench dataset\nType: DATASET_OR_CORPUS", + "Name: bookrag operator library\nType: SOFTWARE", + "Name: single hop\nType: TASK_OR_PROBLEM", + "Name: operator\nType: MODEL_OR_ARCHITECTURE" + ], + "94": [ + "Name: filter\nType: TASK_OR_PROBLEM", + "Name: selector\nType: SYSTEM_COMPONENT", + "Name: mercedes-benz e-class sedan\nType: VEHICLE", + "Name: select\nType: TASK_OR_PROBLEM", + "Name: method and its descendants\nType: SECTION_TITLE", + "Name: execution example\nType: SECTION_TITLE", + 
"Name: entities\nType: DATASET_OR_CORPUS", + "Name: simple query...\nType: TASK_OR_PROBLEM", + "Name: decompose\nType: METHOD_OR_TECHNIQUE", + "Name: graph\nType: DATA_STRUCTURE", + "Name: reasoner\nType: SYSTEM_COMPONENT", + "Name: method\nType: METHOD_OR_TECHNIQUE", + "Name: operator-set\nType: IMAGE", + "Name: sub-queries\nType: TASK_OR_PROBLEM", + "Name: q: what is the type of car in the ranking prompt example?\nType: TASK_OR_PROBLEM", + "Name: car\nType: PRODUCT", + "Name: image cref='#/texts/98'\nType: UNKNOWN", + "Name: planning\nType: TASK_OR_PROBLEM", + "Name: skyline\nType: TASK_OR_PROBLEM", + "Name: reduce\nType: TASK_OR_PROBLEM", + "Name: text\nType: DATA_STRUCTURE", + "Name: ranking prompt\nType: BOOK", + "Name: reason\nType: TASK_OR_PROBLEM", + "Name: map\nType: TASK_OR_PROBLEM", + "Name: formulator\nType: SYSTEM_COMPONENT", + "Name: s:\nType: PARAMETER_OR_VARIABLE", + "Name: operator plan\nType: TASK_OR_PROBLEM", + "Name: synthesizer\nType: SYSTEM_COMPONENT", + "Name: extract\nType: TASK_OR_PROBLEM", + "Name: a: based on the provided information...\nType: TASK_OR_PROBLEM" + ], + "95": [ + "Name: query classification\nType: TASK_OR_PROBLEM", + "Name: operator plan\nType: PRODUCT" + ], + "96": [ + "Name: query classification\nType: TASK_OR_PROBLEM", + "Name: solution strategy\nType: CONCEPT", + "Name: single hop\nType: EVENT", + "Name: scent based retrieval\nType: METHOD_OR_TECHNIQUE", + "Name: intrinsic complexity\nType: CONCEPT", + "Name: additional operators\nType: SOFTWARE", + "Name: multi hop\nType: EVENT", + "Name: global aggregation\nType: EVENT", + "Name: filter aggregation\nType: METHOD_OR_TECHNIQUE", + "Name: document\nType: OBJECT", + "Name: agent strategy selection\nType: TASK_OR_PROBLEM", + "Name: table 2\nType: TABLE", + "Name: filtering conditions\nType: CONCEPT", + "Name: operational demands\nType: CONCEPT", + "Name: bookrag\nType: SOFTWARE" + ], + "97": [ + "Name: table 3\nType: TABLE", + "Name: o\nType: TASK_OR_PROBLEM", + "Name: 
agent\nType: TASK_OR_PROBLEM", + "Name: figure 4\nType: IMAGE", + "Name: query categories\nType: TASK_OR_PROBLEM", + "Name: figure 4 a\nType: IMAGE", + "Name: bookindex operators\nType: TASK_OR_PROBLEM", + "Name: classification\nType: METHOD_OR_TECHNIQUE", + "Name: bookindex\nType: PRODUCT" + ], + "98": [ + "Name: formulator\nType: TASK_OR_PROBLEM", + "Name: kg\nType: SOFTWARE", + "Name: sub queries\nType: TASK_OR_PROBLEM", + "Name: query text\nType: TASK_OR_PROBLEM", + "Name: pdec\nType: PARAMETER_OR_VARIABLE", + "Name: extract\nType: METHOD_OR_TECHNIQUE", + "Name: eq\nType: TASK_OR_PROBLEM", + "Name: complex query\nType: TASK_OR_PROBLEM", + "Name: pext\nType: PARAMETER_OR_VARIABLE", + "Name: entities\nType: TASK_OR_PROBLEM", + "Name: llm\nType: TECHNOLOGY", + "Name: decompose\nType: METHOD_OR_TECHNIQUE", + "Name: qs\nType: TASK_OR_PROBLEM" + ], + "99": [ + "Name: formula (2)\nType: EQUATION_OR_FORMULA" + ], + "100": [ + "Name: formula (3)\nType: EQUATION_OR_FORMULA" + ], + "101": [ + "Name: prompt\nType: SOFTWARE", + "Name: q\nType: PARAMETER_OR_VARIABLE", + "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "Name: extraction\nType: TASK_OR_PROBLEM", + "Name: decomposition\nType: TASK_OR_PROBLEM", + "Name: p ext\nType: SOFTWARE", + "Name: p dec\nType: SOFTWARE" + ], + "102": [ + "Name: nodes\nType: PARAMETER_OR_VARIABLE", + "Name: n\nType: PARAMETER_OR_VARIABLE", + "Name: e t\nType: PARAMETER_OR_VARIABLE", + "Name: c n\nType: PARAMETER_OR_VARIABLE", + "Name: modal types\nType: CONCEPT", + "Name: page ranges\nType: CONCEPT", + "Name: n f\nType: PARAMETER_OR_VARIABLE", + "Name: plan\nType: TASK_OR_PROBLEM", + "Name: selector\nType: TECHNOLOGY", + "Name: filter range\nType: TECHNOLOGY", + "Name: edges\nType: PARAMETER_OR_VARIABLE", + "Name: filter modal\nType: TECHNOLOGY", + "Name: bookindex\nType: PRODUCT", + "Name: tree\nType: TASK_OR_PROBLEM", + "Name: c\nType: PARAMETER_OR_VARIABLE" + ], + "103": [ + "Name: formula (4)\nType: EQUATION_OR_FORMULA" + ], + "104": [ + 
"Name: descendant\nType: TASK_OR_PROBLEM", + "Name: gt link\nType: TECHNOLOGY", + "Name: subtree\nType: TASK_OR_PROBLEM", + "Name: document\nType: TASK_OR_PROBLEM", + "Name: n\nType: TASK_OR_PROBLEM", + "Name: e q\nType: TASK_OR_PROBLEM", + "Name: select by section\nType: TECHNOLOGY", + "Name: section node\nType: TASK_OR_PROBLEM", + "Name: depth\nType: MEASUREMENT", + "Name: select by entity\nType: TECHNOLOGY", + "Name: n s\nType: TASK_OR_PROBLEM", + "Name: llm\nType: TECHNOLOGY", + "Name: s target\nType: TASK_OR_PROBLEM" + ], + "105": [ + "Name: formula (5)\nType: EQUATION_OR_FORMULA" + ], + "106": [ + "Name: 20\nType: PUBLICATION_VENUE", + "Name: 7\nType: EQUATION_OR_FORMULA", + "Name: graph reasoning\nType: METHOD_OR_TECHNIQUE", + "Name: tree node importance scores vector\nType: PARAMETER_OR_VARIABLE", + "Name: 6\nType: EQUATION_OR_FORMULA", + "Name: entity importance vector\nType: PARAMETER_OR_VARIABLE", + "Name: selected tree nodes\nType: UNKNOWN", + "Name: reasoner\nType: TASK_OR_PROBLEM", + "Name: subgraph\nType: TASK_OR_PROBLEM", + "Name: selected nodes\nType: TASK_OR_PROBLEM", + "Name: entity\nType: TASK_OR_PROBLEM", + "Name: pagerank algorithm\nType: METHOD_OR_TECHNIQUE", + "Name: gt link matrix\nType: SOFTWARE" + ], + "107": [ + "Name: formula (6)\nType: EQUATION_OR_FORMULA" + ], + "108": [ + "Name: formula (7)\nType: EQUATION_OR_FORMULA" + ], + "109": [ + "Name: \nType: UNKNOWN", + "Name: tree node\nType: TASK_OR_PROBLEM", + "Name: nodes\nType: TASK_OR_PROBLEM", + "Name: text ranker\nType: SOFTWARE", + "Name: scoring dimensions\nType: PARAMETER_OR_VARIABLE", + "Name: skyline operator\nType: METHOD_OR_TECHNIQUE", + "Name: relevance score\nType: EVALUATION_METRIC", + "Name: skyline ranker\nType: SOFTWARE", + "Name: query\nType: TASK_OR_PROBLEM" + ], + "110": [ + "Name: 7\nType: NUMBER" + ], + "111": [ + "Name: partial answers\nType: PRODUCT", + "Name: reduce\nType: TASK_OR_PROBLEM", + "Name: analysis\nType: TASK_OR_PROBLEM", + "Name: retrieved information 
segments\nType: DATASET_OR_CORPUS", + "Name: synthesizer\nType: TASK_OR_PROBLEM", + "Name: retrieved evidence\nType: DATASET_OR_CORPUS", + "Name: multiple sources\nType: DATASET_OR_CORPUS", + "Name: partial responses\nType: PRODUCT", + "Name: content generation\nType: TASK_OR_PROBLEM", + "Name: map\nType: TASK_OR_PROBLEM", + "Name: final coherent answer\nType: PRODUCT" + ], + "112": [ + "Name: \nType: UNKNOWN", + "Name: equation 8\nType: EQUATION_OR_FORMULA", + "Name: operators\nType: TASK_OR_PROBLEM", + "Name: operator plan\nType: TASK_OR_PROBLEM", + "Name: category\nType: TASK_OR_PROBLEM", + "Name: library\nType: ORGANIZATION", + "Name: parameters\nType: PARAMETER_OR_VARIABLE", + "Name: agent plan\nType: METHOD_OR_TECHNIQUE", + "Name: 1\nType: TASK_OR_PROBLEM", + "Name: agent\nType: PERSON", + "Name: query\nType: TASK_OR_PROBLEM" + ], + "113": [ + "Name: formula (8)\nType: EQUATION_OR_FORMULA" + ], + "114": [ + "Name: category\nType: CONCEPT", + "Name: the plan\nType: TASK_OR_PROBLEM", + "Name: workflow\nType: METHOD_OR_TECHNIQUE" + ], + "115": [ + "Name: generation\nType: TASK_OR_PROBLEM", + "Name: standard reasoning\nType: TASK_OR_PROBLEM", + "Name: single hop\nType: TASK_OR_PROBLEM", + "Name: p std\nType: EQUATION_OR_FORMULA", + "Name: section based\nType: METHOD_OR_TECHNIQUE", + "Name: scent based\nType: METHOD_OR_TECHNIQUE", + "Name: entity\nType: TASK_OR_PROBLEM", + "Name: agent\nType: PERSON" + ], + "116": [ + "Name: formula (9)\nType: EQUATION_OR_FORMULA" + ], + "117": [ + "Name: formula (10)\nType: EQUATION_OR_FORMULA" + ], + "118": [ + "Name: ps\nType: MODEL_OR_ARCHITECTURE", + "Name: single hop workflow\nType: METHOD_OR_TECHNIQUE", + "Name: agent\nType: PERSON", + "Name: complex\nType: TASK_OR_PROBLEM" + ], + "119": [ + "Name: formula (11)\nType: EQUATION_OR_FORMULA" + ], + "120": [ + "Name: global aggregation\nType: TASK_OR_PROBLEM" + ], + "121": [ + "Name: formula (12)\nType: EQUATION_OR_FORMULA" + ], + "122": [ + "Name: modal filter\nType: 
TECHNOLOGY", + "Name: \nType: UNKNOWN", + "Name: nested composition\nType: TASK_OR_PROBLEM", + "Name: range filter\nType: TECHNOLOGY" + ], + "123": [ + "Name: retrieval process\nType: METHOD_OR_TECHNIQUE", + "Name: generation\nType: TASK_OR_PROBLEM", + "Name: 5.3 structured execution\nType: SECTION_TITLE", + "Name: ift principles\nType: METHOD_OR_TECHNIQUE" + ], + "124": [ + "Name: synthesizer\nType: SOFTWARE", + "Name: selector\nType: SOFTWARE", + "Name: information patches\nType: TASK_OR_PROBLEM", + "Name: reasoner\nType: SOFTWARE", + "Name: document space\nType: TASK_OR_PROBLEM", + "Name: sensemaking\nType: TASK_OR_PROBLEM", + "Name: answer\nType: TASK_OR_PROBLEM", + "Name: computational resources\nType: TASK_OR_PROBLEM", + "Name: relevant scopes\nType: TASK_OR_PROBLEM", + "Name: workflow\nType: TASK_OR_PROBLEM", + "Name: information foraging theory\nType: SCIENTIFIC_THEORY", + "Name: processed evidence\nType: TASK_OR_PROBLEM", + "Name: cost of attention\nType: TASK_OR_PROBLEM", + "Name: concrete operations\nType: TASK_OR_PROBLEM", + "Name: p\nType: TASK_OR_PROBLEM", + "Name: abstract textual queries\nType: TASK_OR_PROBLEM", + "Name: high value data patches\nType: TASK_OR_PROBLEM", + "Name: bookrag\nType: SOFTWARE" + ], + "125": [ + "Name: node set n\nType: DATASET_OR_CORPUS", + "Name: equation 13\nType: EQUATION_OR_FORMULA", + "Name: selector operators\nType: SOFTWARE", + "Name: ift\nType: METHOD_OR_TECHNIQUE", + "Name: focused node subset ns\nType: DATASET_OR_CORPUS", + "Name: information scents\nType: CONCEPT", + "Name: scent filter based retrieval\nType: TASK_OR_PROBLEM", + "Name: params sel\nType: PARAMETER_OR_VARIABLE", + "Name: patches\nType: PRODUCT", + "Name: question\nType: TASK_OR_PROBLEM", + "Name: explicit filter constraints\nType: METHOD_OR_TECHNIQUE" + ], + "126": [ + "Name: formula (13)\nType: EQUATION_OR_FORMULA" + ], + "127": [ + "Name: foraging cost\nType: MEASUREMENT", + "Name: skyline operator\nType: TASK_OR_PROBLEM", + "Name: reasoner 
operators\nType: TASK_OR_PROBLEM", + "Name: fixed top retrieval\nType: METHOD_OR_TECHNIQUE", + "Name: pre selection\nType: METHOD_OR_TECHNIQUE", + "Name: nodes\nType: UNKNOWN", + "Name: n r\nType: PARAMETER_OR_VARIABLE", + "Name: n s\nType: PARAMETER_OR_VARIABLE", + "Name: equation 14\nType: EQUATION_OR_FORMULA", + "Name: graph topology\nType: PARAMETER_OR_VARIABLE", + "Name: semantic relevance\nType: PARAMETER_OR_VARIABLE", + "Name: noise\nType: CONCEPT", + "Name: final retrieval set\nType: UNKNOWN", + "Name: pareto frontier\nType: CONCEPT", + "Name: t n\nType: PARAMETER_OR_VARIABLE", + "Name: s g n s\nType: PARAMETER_OR_VARIABLE", + "Name: skyline ranker\nType: TASK_OR_PROBLEM" + ], + "128": [ + "Name: formula (14)\nType: EQUATION_OR_FORMULA" + ], + "129": [ + "Name: synthesizer\nType: SOFTWARE", + "Name: analysis merging generation\nType: TASK_OR_PROBLEM", + "Name: q\nType: PARAMETER_OR_VARIABLE", + "Name: a\nType: PARAMETER_OR_VARIABLE", + "Name: n\nType: PARAMETER_OR_VARIABLE", + "Name: 15\nType: EQUATION_OR_FORMULA" + ], + "130": [ + "Name: formula (15)\nType: EQUATION_OR_FORMULA" + ], + "131": [ + "Name: bookrag\nType: PRODUCT", + "Name: table 3\nType: TABLE" + ], + "132": [ + "Name: table: cref='#/texts/136'...\nType: TABLE", + "Name: cref\nType: EQUATION_OR_FORMULA" + ], + "133": [ + "Name: 8\nType: MEASUREMENT" + ], + "134": [ + "Name: global filter\nType: TASK_OR_PROBLEM", + "Name: sub problems\nType: TASK_OR_PROBLEM", + "Name: intermediate insights\nType: TASK_OR_PROBLEM", + "Name: evidence blocks\nType: TASK_OR_PROBLEM", + "Name: statistical counts\nType: TASK_OR_PROBLEM", + "Name: partial results\nType: TASK_OR_PROBLEM", + "Name: detailed content extraction\nType: TASK_OR_PROBLEM", + "Name: final response\nType: TASK_OR_PROBLEM", + "Name: answers to decomposed sub queries\nType: TASK_OR_PROBLEM", + "Name: reduce operator\nType: TASK_OR_PROBLEM", + "Name: map operator\nType: TASK_OR_PROBLEM", + "Name: high level reasoning synthesis\nType: 
TASK_OR_PROBLEM", + "Name: decompose\nType: TASK_OR_PROBLEM" + ], + "135": [ + "Name: figure 4 b\nType: IMAGE", + "Name: select by entity\nType: METHOD_OR_TECHNIQUE", + "Name: reduce\nType: METHOD_OR_TECHNIQUE", + "Name: agent\nType: PERSON", + "Name: extract\nType: METHOD_OR_TECHNIQUE", + "Name: planning phase\nType: TASK_OR_PROBLEM", + "Name: single hop\nType: TASK_OR_PROBLEM", + "Name: skyline filtering\nType: METHOD_OR_TECHNIQUE", + "Name: car\nType: PRODUCT", + "Name: reasoning\nType: METHOD_OR_TECHNIQUE", + "Name: answer\nType: TASK_OR_PROBLEM", + "Name: ranking prompt example\nType: TASK_OR_PROBLEM" + ], + "136": [ + "Name: 6 experiments\nType: SECTION_TITLE", + "Name: experiments\nType: TASK_OR_PROBLEM" + ], + "137": [ + "Name: efficiency\nType: EVALUATION_METRIC", + "Name: bookrag\nType: PRODUCT", + "Name: baseline methods\nType: METHOD_OR_TECHNIQUE", + "Name: document qa tasks\nType: TASK_OR_PROBLEM", + "Name: accuracy\nType: EVALUATION_METRIC" + ], + "138": [ + "Name: 6.1 setup\nType: SECTION_TITLE" + ], + "139": [ + "Name: experiments\nType: TASK_OR_PROBLEM", + "Name: datasets\nType: DATASET_OR_CORPUS", + "Name: exact match\nType: EVALUATION_METRIC", + "Name: f1 score\nType: EVALUATION_METRIC", + "Name: em\nType: EVALUATION_METRIC", + "Name: table 4\nType: TABLE", + "Name: our\nType: ORGANIZATION", + "Name: f1\nType: EVALUATION_METRIC" + ], + "140": [ + "Name: texts/143\nType: SECTION_TITLE", + "Name: table: cref='#/texts/143'...\nType: TABLE" + ], + "141": [ + "Name: guidebooks\nType: PRODUCT", + "Name: scientific papers\nType: PRODUCT", + "Name: figures\nType: IMAGE", + "Name: tables\nType: TABLE", + "Name: qasper\nType: DATASET_OR_CORPUS", + "Name: financial reports\nType: PRODUCT", + "Name: rag systems\nType: SOFTWARE", + "Name: qa pairs\nType: TASK_OR_PROBLEM", + "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "Name: industry files\nType: PRODUCT", + "Name: global level questions\nType: TASK_OR_PROBLEM", + "Name: wikipedia pages\nType: LOCATION", + 
"Name: human annotators\nType: PERSON", + "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "Name: table 4\nType: TABLE", + "Name: html type documents\nType: PRODUCT", + "Name: 20\nType: PERCENTAGE", + "Name: m3docvqa\nType: DATASET_OR_CORPUS", + "Name: complex document qa tasks\nType: TASK_OR_PROBLEM" + ], + "142": [ + "Name: wikipedia\nType: ORGANIZATION", + "Name: https www wikipedia org\nType: LOCATION" + ], + "144": [ + "Name: exact match\nType: EVALUATION_METRIC", + "Name: titles\nType: TABLE", + "Name: ground truth\nType: CONCEPT", + "Name: tables\nType: TABLE", + "Name: qasper\nType: DATASET_OR_CORPUS", + "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "Name: accuracy\nType: EVALUATION_METRIC", + "Name: query\nType: TASK_OR_PROBLEM", + "Name: page numbers\nType: UNKNOWN", + "Name: token usage\nType: EVALUATION_METRIC", + "Name: metadata\nType: CONCEPT", + "Name: token based f1 score\nType: EVALUATION_METRIC", + "Name: evidence statements\nType: UNKNOWN", + "Name: retrieval recall\nType: EVALUATION_METRIC", + "Name: modality\nType: CONCEPT", + "Name: images\nType: TABLE", + "Name: qa\nType: TASK_OR_PROBLEM", + "Name: candidate blocks\nType: TABLE", + "Name: pdf blocks\nType: TABLE", + "Name: pdf parsing\nType: METHOD_OR_TECHNIQUE", + "Name: texts\nType: TABLE", + "Name: time cost\nType: EVALUATION_METRIC", + "Name: response phase\nType: TIME", + "Name: formulas\nType: TABLE" + ], + "145": [ + "Name: our experiments\nType: EVENT", + "Name: three model configurations\nType: MODEL_OR_ARCHITECTURE", + "Name: baselines\nType: TASK_OR_PROBLEM" + ], + "146": [ + "Name: bm25\nType: MODEL_OR_ARCHITECTURE", + "Name: semantic chunking\nType: METHOD_OR_TECHNIQUE", + "Name: document analysis\nType: TASK_OR_PROBLEM", + "Name: raw text\nType: MATERIAL", + "Name: conventional rag\nType: TASK_OR_PROBLEM", + "Name: layout vanilla\nType: MODEL_OR_ARCHITECTURE", + "Name: vanilla rag\nType: MODEL_OR_ARCHITECTURE", + "Name: document layout analysis\nType: METHOD_OR_TECHNIQUE", + "Name: 
segments\nType: MEASUREMENT" + ], + "147": [ + "Name: graph based rag\nType: TECHNOLOGY", + "Name: local search methods\nType: METHOD_OR_TECHNIQUE", + "Name: retrieval\nType: TASK_OR_PROBLEM", + "Name: graphrag local\nType: TECHNOLOGY", + "Name: graph data\nType: TECHNOLOGY", + "Name: global search methods\nType: METHOD_OR_TECHNIQUE", + "Name: raptor\nType: TECHNOLOGY", + "Name: graphrag global\nType: TECHNOLOGY", + "Name: graphrag\nType: TECHNOLOGY", + "Name: documents\nType: PRODUCT" + ], + "148": [ + "Name: treetraverse\nType: METHOD_OR_TECHNIQUE", + "Name: mm vanilla\nType: PRODUCT", + "Name: page 19\nType: PUBLICATION_VENUE", + "Name: personalized pagerank\nType: METHOD_OR_TECHNIQUE", + "Name: layoutsegmentedrag\nType: METHOD_OR_TECHNIQUE", + "Name: pageindex\nType: PRODUCT", + "Name: page 47\nType: PUBLICATION_VENUE", + "Name: page 39\nType: PUBLICATION_VENUE", + "Name: page 20\nType: PUBLICATION_VENUE", + "Name: hipporag\nType: METHOD_OR_ARCHITECTURE", + "Name: graphranker\nType: METHOD_OR_TECHNIQUE", + "Name: llm\nType: TECHNOLOGY", + "Name: docetl\nType: SOFTWARE" + ], + "149": [ + "Name: detailed configurations\nType: TASK_OR_PROBLEM", + "Name: github com sam234990 bookrag\nType: LOCATION", + "Name: baseline methods\nType: UNKNOWN", + "Name: state of theart\nType: METHOD_OR_TECHNIQUE", + "Name: robust document layout parsing\nType: TASK_OR_PROBLEM", + "Name: bookrag\nType: PRODUCT", + "Name: fair comparison\nType: TASK_OR_PROBLEM", + "Name: implementation details\nType: UNKNOWN", + "Name: mineru\nType: SOFTWARE", + "Name: technical report\nType: PUBLICATION_VENUE", + "Name: prompts\nType: TASK_OR_PROBLEM", + "Name: qwen family\nType: MODEL_OR_ARCHITECTURE", + "Name: appendix\nType: SECTION_TITLE", + "Name: 0 6\nType: MEASUREMENT", + "Name: gradient g\nType: PARAMETER_OR_VARIABLE" + ], + "150": [ + "Name: 6.2 overall results\nType: SECTION_TITLE" + ], + "151": [ + "Name: query efficiency\nType: TASK_OR_PROBLEM", + "Name: bookrag\nType: PRODUCT", + "Name: 
state of the art baselines\nType: PRODUCT", + "Name: qa\nType: TASK_OR_PROBLEM", + "Name: evaluation\nType: EVENT", + "Name: retrieval\nType: TASK_OR_PROBLEM" + ], + "152": [ + "Name: generation\nType: TASK_OR_PROBLEM", + "Name: vanilla rag\nType: PRODUCT", + "Name: tree graph bookindex\nType: PRODUCT", + "Name: exact match\nType: EVALUATION_METRIC", + "Name: queries\nType: CONCEPT", + "Name: tree traverse\nType: PRODUCT", + "Name: static query workflow\nType: TASK_OR_PROBLEM", + "Name: hierarchical navigation\nType: METHOD_OR_TECHNIQUE", + "Name: top performing baseline\nType: PRODUCT", + "Name: bookrag\nType: PRODUCT", + "Name: graphranker\nType: PRODUCT", + "Name: irrelevant scopes\nType: CONCEPT", + "Name: existing baselines\nType: PRODUCT", + "Name: baselines\nType: PRODUCT", + "Name: graph based reasoning\nType: METHOD_OR_TECHNIQUE", + "Name: context fragmentation\nType: TASK_OR_PROBLEM", + "Name: layout vanilla\nType: PRODUCT", + "Name: qa performance\nType: TASK_OR_PROBLEM", + "Name: agent based planning\nType: PRODUCT", + "Name: m3docvqa\nType: DATASET_OR_CORPUS", + "Name: 18 0\nType: PERCENTAGE", + "Name: table 5\nType: TABLE", + "Name: cross sectional context\nType: CONCEPT", + "Name: retrieval\nType: TASK_OR_PROBLEM", + "Name: workflows\nType: CONCEPT" + ], + "153": [ + "Name: different methods\nType: METHOD_OR_TECHNIQUE", + "Name: bold\nType: COLOR", + "Name: complex document qa tasks\nType: TASK_OR_PROBLEM", + "Name: underlined\nType: SHAPE", + "Name: second best results\nType: EVALUATION_METRIC", + "Name: table 5\nType: TABLE", + "Name: best results\nType: EVALUATION_METRIC", + "Name: performance comparison\nType: TASK_OR_PROBLEM", + "Name: datasets\nType: DATASET_OR_CORPUS" + ], + "154": [ + "Name: cref\nType: PARAMETER_OR_VARIABLE", + "Name: table: cref='#/texts/156'...\nType: TABLE" + ], + "155": [ + "Name: table 6\nType: TABLE", + "Name: retrieval recall\nType: EVALUATION_METRIC", + "Name: layout based methods\nType: METHOD_OR_TECHNIQUE" + ], + 
"156": [ + "Name: cref='#/texts/158'\nType: TABLE" + ], + "157": [ + "Name: 9 87\nType: MEASUREMENT", + "Name: reasoner\nType: SOFTWARE", + "Name: 44 5\nType: PERCENTAGE", + "Name: ift inspired selector reasoner workflow\nType: METHOD_OR_TECHNIQUE", + "Name: layout based baselines\nType: PRODUCT", + "Name: skyline ranker\nType: SOFTWARE", + "Name: 10\nType: MEASUREMENT", + "Name: query\nType: TASK_OR_PROBLEM", + "Name: selector\nType: SOFTWARE", + "Name: bookrag\nType: PRODUCT", + "Name: three datasets\nType: DATASET_OR_CORPUS", + "Name: graphranker\nType: PRODUCT", + "Name: 71 2\nType: PERCENTAGE", + "Name: 6 86\nType: MEASUREMENT", + "Name: retrieval recall\nType: EVALUATION_METRIC", + "Name: retrieval performance\nType: TASK_OR_PROBLEM", + "Name: ground truth layout blocks\nType: DATASET_OR_CORPUS", + "Name: information patch\nType: TASK_OR_PROBLEM", + "Name: m3docvqa\nType: DATASET_OR_CORPUS", + "Name: standard top k setting\nType: METHOD_OR_TECHNIQUE", + "Name: candidate size\nType: PARAMETER_OR_VARIABLE", + "Name: agent based planning\nType: METHOD_OR_TECHNIQUE", + "Name: 8 6\nType: MEASUREMENT" + ], + "158": [ + "Name: query efficiency\nType: EVALUATION_METRIC", + "Name: figure 5\nType: IMAGE" + ], + "159": [ + "Name: mm-vanilla\nType: METHOD_OR_TECHNIQUE", + "Name: query time\nType: EVALUATION_METRIC", + "Name: bookrag\nType: METHOD_OR_TECHNIQUE", + "Name: token (m)\nType: MEASUREMENT", + "Name: qasper\nType: DATASET_OR_CORPUS", + "Name: graphranker\nType: METHOD_OR_TECHNIQUE", + "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "Name: time (s)\nType: MEASUREMENT", + "Name: raptor\nType: METHOD_OR_TECHNIQUE", + "Name: token cost\nType: EVALUATION_METRIC", + "Name: tree-traverse\nType: METHOD_OR_TECHNIQUE", + "Name: layout + vanilla\nType: METHOD_OR_TECHNIQUE", + "Name: figure 5\nType: IMAGE", + "Name: graphrag-global\nType: METHOD_OR_TECHNIQUE", + "Name: vanilla rag\nType: METHOD_OR_TECHNIQUE", + "Name: image cref='#/texts/161'\nType: UNKNOWN", + "Name: 
bm25\nType: METHOD_OR_TECHNIQUE", + "Name: docetl\nType: SOFTWARE", + "Name: m3docvqa\nType: DATASET_OR_CORPUS", + "Name: graphrag-local\nType: METHOD_OR_TECHNIQUE" + ], + "160": [ + "Name: vlm\nType: TECHNOLOGY", + "Name: 5 million\nType: MEASUREMENT", + "Name: order of magnitude\nType: MEASUREMENT", + "Name: bookrag\nType: PRODUCT", + "Name: text based rag approaches\nType: TECHNOLOGY", + "Name: graph based rag methods\nType: TECHNOLOGY", + "Name: 53 million tokens\nType: MEASUREMENT", + "Name: figure 5\nType: IMAGE", + "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "Name: docetl\nType: PRODUCT", + "Name: 2\nType: MEASUREMENT" + ], + "161": [ + "Name: 10\nType: MEASUREMENT" + ], + "162": [ + "Name: 6.3 detailed analysis\nType: SECTION_TITLE" + ], + "163": [ + "Name: case study\nType: TASK_OR_PROBLEM", + "Name: entity resolution method\nType: METHOD_OR_TECHNIQUE", + "Name: bookrag\nType: PRODUCT", + "Name: error analysis\nType: METHOD_OR_TECHNIQUE", + "Name: qa performance\nType: EVALUATION_METRIC", + "Name: gradient based er\nType: METHOD_OR_TECHNIQUE", + "Name: ablation study\nType: METHOD_OR_TECHNIQUE", + "Name: query types\nType: TASK_OR_PROBLEM" + ], + "164": [ + "Name: bookrag\nType: PRODUCT", + "Name: ablation study\nType: TASK_OR_PROBLEM" + ], + "165": [ + "Name: same name entities\nType: TASK_OR_PROBLEM", + "Name: gradient er\nType: METHOD_OR_TECHNIQUE", + "Name: w o gradient er\nType: TASK_OR_PROBLEM", + "Name: basic er\nType: METHOD_OR_TECHNIQUE" + ], + "166": [ + "Name: static standard workflow\nType: TASK_OR_PROBLEM", + "Name: planning\nType: TASK_OR_PROBLEM", + "Name: queries\nType: TASK_OR_PROBLEM", + "Name: agent based planning\nType: METHOD_OR_TECHNIQUE" + ], + "167": [ + "Name: candidate nodes\nType: TASK_OR_PROBLEM", + "Name: reasoners\nType: TECHNOLOGY", + "Name: selector\nType: TECHNOLOGY", + "Name: selector operators\nType: TECHNOLOGY" + ], + "168": [ + "Name: skyline ranker\nType: SOFTWARE", + "Name: graph reasoning\nType: TECHNOLOGY" + ], 
+ "169": [ + "Name: text reasoning\nType: TASK_OR_PROBLEM", + "Name: skyline ranker\nType: SOFTWARE" + ], + "170": [ + "Name: exact match\nType: EVALUATION_METRIC", + "Name: bookrag\nType: PRODUCT", + "Name: f1 score\nType: EVALUATION_METRIC", + "Name: table 7\nType: TABLE", + "Name: em\nType: EVALUATION_METRIC", + "Name: qa\nType: TASK_OR_PROBLEM", + "Name: f1\nType: EVALUATION_METRIC" + ], + "171": [ + "Name: table: cref='#/texts/220'...\nType: TABLE", + "Name: cref\nType: PARAMETER_OR_VARIABLE" + ], + "172": [ + "Name: performance loss\nType: EVALUATION_METRIC", + "Name: selector\nType: METHOD_OR_TECHNIQUE", + "Name: w o selector variant\nType: TASK_OR_PROBLEM", + "Name: static workflow\nType: METHOD_OR_TECHNIQUE", + "Name: dynamic skyline filtering strategy\nType: METHOD_OR_TECHNIQUE", + "Name: qasper\nType: DATASET_OR_CORPUS", + "Name: tokens\nType: MEASUREMENT", + "Name: accuracy\nType: EVALUATION_METRIC", + "Name: bookrag\nType: PRODUCT", + "Name: table 7\nType: TABLE", + "Name: narrow then reason strategy\nType: METHOD_OR_TECHNIQUE", + "Name: w o gradient er variant\nType: TASK_OR_PROBLEM", + "Name: multi dimensional reasoning\nType: METHOD_OR_TECHNIQUE", + "Name: planning mechanism\nType: METHOD_OR_TECHNIQUE", + "Name: retrieval performance\nType: EVALUATION_METRIC", + "Name: performance degradation\nType: EVALUATION_METRIC", + "Name: computational cost\nType: MEASUREMENT", + "Name: gradient er\nType: METHOD_OR_TECHNIQUE", + "Name: agent based planning\nType: METHOD_OR_TECHNIQUE", + "Name: kg\nType: DATASET_OR_CORPUS", + "Name: queries\nType: TASK_OR_PROBLEM", + "Name: ift inspired selection mechanism\nType: METHOD_OR_TECHNIQUE" + ], + "173": [ + "Name: 11\nType: NUMBER" + ], + "174": [ + "Name: 3 6e 3\nType: MEASUREMENT", + "Name: absolute values\nType: MEASUREMENT", + "Name: figure 6\nType: IMAGE", + "Name: density values\nType: MEASUREMENT", + "Name: graph statistics\nType: TASK_OR_PROBLEM", + "Name: basic setting\nType: TASK_OR_PROBLEM" + ], + "175": [ 
+ "Name: diameter\nType: PARAMETER_OR_VARIABLE", + "Name: basic\nType: MODEL_OR_ARCHITECTURE", + "Name: 3.6e-3\nType: MEASUREMENT", + "Name: qasper\nType: DATASET_OR_CORPUS", + "Name: cref='#/texts/224'\nType: IMAGE", + "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "Name: 15.0\nType: MEASUREMENT", + "Name: density\nType: PARAMETER_OR_VARIABLE", + "Name: # cc\nType: PARAMETER_OR_VARIABLE", + "Name: 169\nType: MEASUREMENT", + "Name: 531\nType: MEASUREMENT", + "Name: ratio\nType: EVALUATION_METRIC", + "Name: 5.4e-3\nType: MEASUREMENT", + "Name: figure (b)\nType: SECTION_TITLE", + "Name: 1327\nType: MEASUREMENT", + "Name: gradient-based er\nType: MODEL_OR_ARCHITECTURE", + "Name: 106\nType: MEASUREMENT", + "Name: figure (a)\nType: SECTION_TITLE", + "Name: # entity\nType: PARAMETER_OR_VARIABLE", + "Name: 14.8\nType: MEASUREMENT" + ], + "176": [ + "Name: gradient based entity resolution\nType: TASK_OR_PROBLEM", + "Name: basic kg construction\nType: TASK_OR_PROBLEM", + "Name: number of connected components\nType: EVALUATION_METRIC", + "Name: datasets\nType: DATASET_OR_CORPUS", + "Name: figure 6\nType: IMAGE", + "Name: basic baseline\nType: BENCHMARK", + "Name: many graph based methods\nType: ORGANIZATION", + "Name: graph reasoning\nType: TASK_OR_PROBLEM", + "Name: entity count\nType: EVALUATION_METRIC", + "Name: diameter of the largest connected component\nType: EVALUATION_METRIC", + "Name: er module\nType: METHOD_OR_TECHNIQUE", + "Name: 20\nType: PERCENTAGE", + "Name: 12\nType: PERCENTAGE", + "Name: density\nType: EVALUATION_METRIC" + ], + "177": [ + "Name: figure 7\nType: IMAGE", + "Name: global\nType: TASK_OR_PROBLEM", + "Name: blue bars\nType: IMAGE", + "Name: red bars\nType: IMAGE", + "Name: exact match\nType: EVALUATION_METRIC", + "Name: multi hop\nType: TASK_OR_PROBLEM", + "Name: f1 score\nType: EVALUATION_METRIC", + "Name: single hop\nType: TASK_OR_PROBLEM", + "Name: qa\nType: TASK_OR_PROBLEM", + "Name: qasper\nType: DATASET_OR_CORPUS", + "Name: 
mmlongbench\nType: DATASET_OR_CORPUS", + "Name: accuracy\nType: EVALUATION_METRIC", + "Name: query types\nType: TASK_OR_PROBLEM" + ], + "178": [ + "Name: (a) mmlongbench\nType: DATASET_OR_CORPUS", + "Name: global\nType: TASK_OR_PROBLEM", + "Name: multi\nType: TASK_OR_PROBLEM", + "Name: f1-score\nType: EVALUATION_METRIC", + "Name: (b) qasper\nType: DATASET_OR_CORPUS", + "Name: em / accuracy\nType: EVALUATION_METRIC", + "Name: single\nType: TASK_OR_PROBLEM", + "Name: cref='#/texts/259'\nType: IMAGE", + "Name: score\nType: PARAMETER_OR_VARIABLE" + ], + "179": [ + "Name: figure 7\nType: IMAGE", + "Name: global aggregation\nType: TASK_OR_PROBLEM", + "Name: bookrag\nType: PRODUCT", + "Name: retrieving\nType: METHOD_OR_TECHNIQUE", + "Name: single hop\nType: TASK_OR_PROBLEM", + "Name: multihop\nType: TASK_OR_PROBLEM", + "Name: reasoning\nType: METHOD_OR_TECHNIQUE", + "Name: disjoint pieces of evidence\nType: DATASET_OR_CORPUS", + "Name: agent based planning strategy\nType: METHOD_OR_TECHNIQUE", + "Name: qa performance\nType: TASK_OR_PROBLEM", + "Name: query types\nType: TASK_OR_PROBLEM" + ], + "180": [ + "Name: four types\nType: MEASUREMENT", + "Name: 200 sampled queries\nType: MEASUREMENT", + "Name: error response analysis\nType: TASK_OR_PROBLEM", + "Name: bookrag\nType: PRODUCT", + "Name: figure 9\nType: IMAGE" + ], + "181": [ + "Name: gray text\nType: COLOR", + "Name: case study\nType: EVENT", + "Name: cyan text\nType: COLOR", + "Name: query types\nType: TASK_OR_PROBLEM", + "Name: figure 8\nType: IMAGE", + "Name: internal process\nType: TASK_OR_PROBLEM", + "Name: qasper\nType: DATASET_OR_CORPUS", + "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "Name: bookrag\nType: SOFTWARE" + ], + "182": [ + "Name: cross-entropy\nType: EVALUATION_METRIC", + "Name: single-hop case from qasper\nType: SECTION_TITLE", + "Name: lstm with elmo system\nType: MODEL_OR_ARCHITECTURE", + "Name: agent-based planning\nType: METHOD_OR_TECHNIQUE", + "Name: table 1\nType: TABLE", + "Name: 
reduce\nType: SOFTWARE", + "Name: qasper\nType: DATASET_OR_CORPUS", + "Name: decompose operator\nType: SOFTWARE", + "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "Name: graph_reasoning\nType: TASK_OR_PROBLEM", + "Name: bookrag response of different query types\nType: IMAGE", + "Name: discount factor\nType: PARAMETER_OR_VARIABLE", + "Name: text_reasoning\nType: TASK_OR_PROBLEM", + "Name: skyline_ranker\nType: SOFTWARE", + "Name: global aggregation case from mmlongbench\nType: SECTION_TITLE", + "Name: diacritic swapping\nType: METHOD_OR_TECHNIQUE", + "Name: multi-hop case from qasper\nType: SECTION_TITLE", + "Name: interpretable system\nType: MODEL_OR_ARCHITECTURE", + "Name: select_by_entity operator\nType: SOFTWARE", + "Name: filter_range\nType: SOFTWARE", + "Name: filter operators\nType: SOFTWARE", + "Name: image cref='#/texts/282'\nType: UNKNOWN", + "Name: binary reward system\nType: TECHNOLOGY", + "Name: lstm-elmo net\nType: MODEL_OR_ARCHITECTURE", + "Name: filter_modal\nType: SOFTWARE" + ], + "183": [ + "Name: qasper\nType: DATASET_OR_CORPUS", + "Name: mmlongbench\nType: DATASET_OR_CORPUS", + "Name: 200\nType: MEASUREMENT", + "Name: error analysis\nType: TASK_OR_PROBLEM", + "Name: figure 9\nType: IMAGE" + ], + "184": [ + "Name: (a) mmlongbench\nType: DATASET_OR_CORPUS", + "Name: correct (79)\nType: EVALUATION_METRIC", + "Name: plan error (27)\nType: TASK_OR_PROBLEM", + "Name: all queries (200)\nType: MEASUREMENT", + "Name: correct (117)\nType: EVALUATION_METRIC", + "Name: cref='#/texts/348'\nType: IMAGE", + "Name: successful parsing (194)\nType: MEASUREMENT", + "Name: retrieval error (26)\nType: TASK_OR_PROBLEM", + "Name: (b) qasper\nType: DATASET_OR_CORPUS", + "Name: successful parsing (193)\nType: MEASUREMENT", + "Name: generation error (30)\nType: TASK_OR_PROBLEM", + "Name: plan error (20)\nType: TASK_OR_PROBLEM", + "Name: parsing error (6)\nType: TASK_OR_PROBLEM", + "Name: retrieval error (52)\nType: TASK_OR_PROBLEM", + "Name: generation error (36)\nType: 
TASK_OR_PROBLEM", + "Name: parsing error (7)\nType: TASK_OR_PROBLEM" + ], + "185": [ + "Name: cohesive final answer\nType: TASK_OR_PROBLEM", + "Name: generation\nType: TASK_OR_PROBLEM", + "Name: pdf parsing\nType: TASK_OR_PROBLEM", + "Name: multimodal evidence\nType: TASK_OR_PROBLEM", + "Name: retrieval error\nType: TASK_OR_PROBLEM", + "Name: single hop queries\nType: TASK_OR_PROBLEM", + "Name: multi hop sub tasks\nType: TASK_OR_PROBLEM", + "Name: qualitative analysis\nType: METHOD_OR_TECHNIQUE", + "Name: model\nType: TASK_OR_PROBLEM", + "Name: fragmentation\nType: TASK_OR_PROBLEM", + "Name: generation error\nType: TASK_OR_PROBLEM", + "Name: plan\nType: TASK_OR_PROBLEM", + "Name: planner\nType: TASK_OR_PROBLEM", + "Name: disjointed retrieval paths\nType: TASK_OR_PROBLEM", + "Name: plan error\nType: TASK_OR_PROBLEM", + "Name: results\nType: TASK_OR_PROBLEM", + "Name: scattered sub responses\nType: TASK_OR_PROBLEM", + "Name: retrieval\nType: TASK_OR_PROBLEM" + ], + "186": [ + "Name: 24\nType: MEASUREMENT", + "Name: case study\nType: TASK_OR_PROBLEM", + "Name: search spaces\nType: TASK_OR_PROBLEM", + "Name: relevant evidence\nType: TASK_OR_PROBLEM", + "Name: multi hop\nType: TASK_OR_PROBLEM", + "Name: figure 8\nType: IMAGE", + "Name: bookrag\nType: PRODUCT", + "Name: noise\nType: TASK_OR_PROBLEM", + "Name: precise answer generation\nType: TASK_OR_PROBLEM", + "Name: single hop\nType: TASK_OR_PROBLEM", + "Name: answering workflow\nType: TASK_OR_PROBLEM", + "Name: filter\nType: METHOD_OR_TECHNIQUE", + "Name: 134\nType: MEASUREMENT", + "Name: select\nType: METHOD_OR_TECHNIQUE", + "Name: global queries\nType: TASK_OR_PROBLEM", + "Name: decompose\nType: METHOD_OR_TECHNIQUE" + ], + "187": [ + "Name: 7 conclusion\nType: SECTION_TITLE" + ], + "188": [ + "Name: reasoning operators\nType: SOFTWARE", + "Name: agent based method\nType: METHOD_OR_TECHNIQUE", + "Name: answer accuracy\nType: EVALUATION_METRIC", + "Name: paper\nType: PUBLICATION_VENUE", + "Name: knowledge 
extraction\nType: TASK_OR_PROBLEM", + "Name: retrieval operators\nType: SOFTWARE", + "Name: intelligent querying\nType: TASK_OR_PROBLEM", + "Name: bookrag\nType: PRODUCT", + "Name: benchmarks\nType: BENCHMARK", + "Name: tree graph index\nType: TECHNOLOGY", + "Name: existing baselines\nType: PRODUCT", + "Name: document native database system\nType: PRODUCT", + "Name: book index\nType: PRODUCT", + "Name: data formatting\nType: TASK_OR_PROBLEM", + "Name: retrieval precision\nType: EVALUATION_METRIC" + ], + "189": [ + "Name: 12\nType: MEASUREMENT" + ], + "190": [ + "Name: references\nType: SECTION_TITLE" + ], + "191": [ + "Name: christopher r\nType: PERSON", + "Name: simran arora\nType: PERSON", + "Name: heterogeneous data lakes\nType: DATASET_OR_CORPUS", + "Name: structured views\nType: PRODUCT", + "Name: 92 105\nType: MEASUREMENT", + "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "Name: 2023\nType: DATE", + "Name: andrew hojel\nType: PERSON", + "Name: language models\nType: TECHNOLOGY", + "Name: 17\nType: MEASUREMENT", + "Name: sabri eyuboglu\nType: PERSON", + "Name: avanika narayan\nType: PERSON", + "Name: simple systems\nType: PRODUCT", + "Name: brandon yang\nType: PERSON", + "Name: immanuel trummer\nType: PERSON", + "Name: 2\nType: MEASUREMENT", + "Name: vldb endowment\nType: ORGANIZATION" + ], + "192": [ + "Name: 2024\nType: DATE", + "Name: yizhong wang\nType: PERSON", + "Name: international conference on learning representations\nType: PUBLICATION_VENUE", + "Name: self rag\nType: MODEL_OR_ARCHITECTURE", + "Name: et al\nType: PERSON", + "Name: iclr\nType: PUBLICATION_VENUE", + "Name: zeqiu wu\nType: PERSON", + "Name: akari asai\nType: PERSON" + ], + "193": [ + "Name: learning to retrieve generate and critique through self reflection\nType: METHOD_OR_TECHNIQUE", + "Name: arxiv preprint arxiv 2310 11511\nType: PUBLICATION_VENUE", + "Name: 2023\nType: DATE", + "Name: yizhong wang\nType: PERSON", + "Name: self rag\nType: MODEL_OR_ARCHITECTURE", 
+ "Name: zeqiu wu\nType: PERSON", + "Name: arxiv\nType: ORGANIZATION", + "Name: akari asai\nType: PERSON", + "Name: avirup sil\nType: PERSON", + "Name: hannaneh hajishirzi\nType: PERSON" + ], + "194": [ + "Name: preprint\nType: FILE_TYPE", + "Name: keqin chen\nType: PERSON", + "Name: qwen2 5 vl\nType: MODEL_OR_ARCHITECTURE", + "Name: shijie wang\nType: PERSON", + "Name: sibo song\nType: PERSON", + "Name: peng wang\nType: PERSON", + "Name: et al\nType: PERSON", + "Name: jun tang\nType: PERSON", + "Name: technical report\nType: PUBLICATION_VENUE", + "Name: jialin wang\nType: PERSON", + "Name: wenbin ge\nType: PERSON", + "Name: xuejing liu\nType: PERSON", + "Name: 2025\nType: DATE", + "Name: shuai bai\nType: PERSON", + "Name: arxiv 2502 13923\nType: FILE_TYPE", + "Name: qwen2 5 vl technical report\nType: PUBLICATION_VENUE", + "Name: arxiv\nType: PUBLICATION_VENUE", + "Name: kai dang\nType: PERSON" + ], + "195": [ + "Name: challenges\nType: TASK_OR_PROBLEM", + "Name: preprint\nType: FILE_TYPE", + "Name: question answering\nType: TASK_OR_PROBLEM", + "Name: yoan chabot\nType: PERSON", + "Name: survey on question answering over visually rich documents methods challenges and trends\nType: BOOK", + "Name: methods\nType: METHOD_OR_TECHNIQUE", + "Name: arxiv 2501 02235\nType: FILE_TYPE", + "Name: camille barboule\nType: PERSON", + "Name: benjamin piwowarski\nType: PERSON", + "Name: 2025\nType: DATE", + "Name: visually rich documents\nType: DATASET_OR_CORPUS", + "Name: trends\nType: RESEARCH_FIELD", + "Name: arxiv\nType: PUBLICATION_VENUE" + ], + "196": [ + "Name: zengyi gao\nType: PERSON", + "Name: jianliang xu\nType: PERSON", + "Name: xike xie\nType: PERSON", + "Name: proc vldb endow\nType: PUBLICATION_VENUE", + "Name: 10\nType: MEASUREMENT", + "Name: modularizing\nType: METHOD_OR_TECHNIQUE", + "Name: graph based retrieval augmented generation\nType: METHOD_OR_TECHNIQUE", + "Name: https doi org 10 14778 3748191 3748194\nType: URL", + "Name: 18\nType: MEASUREMENT", + "Name: 
design space exploration\nType: TASK_OR_PROBLEM", + "Name: yukun cao\nType: PERSON", + "Name: june 2025\nType: DATE", + "Name: 2025\nType: DATE", + "Name: lego graphrag\nType: PRODUCT", + "Name: 3269 3283\nType: MEASUREMENT", + "Name: zhiyang li\nType: PERSON", + "Name: s kevin zhou\nType: PERSON" + ], + "197": [ + "Name: 11\nType: MEASUREMENT", + "Name: jiajun li\nType: PERSON", + "Name: lei cao\nType: PERSON", + "Name: guoren wang\nType: PERSON", + "Name: proceedings of the vldb endowment\nType: PUBLICATION_VENUE", + "Name: 3695 3707\nType: MEASUREMENT", + "Name: budget aware structural table extraction\nType: TASK_OR_PROBLEM", + "Name: chengliang chai\nType: PERSON", + "Name: 18\nType: MEASUREMENT", + "Name: unstructured documents\nType: DATASET_OR_CORPUS", + "Name: yuhao deng\nType: PERSON", + "Name: yuanhao zhong\nType: PERSON", + "Name: 2025\nType: DATE", + "Name: doctopus\nType: PRODUCT", + "Name: ye yuan\nType: PERSON" + ], + "198": [ + "Name: arxiv preprint arxiv 2010 02559\nType: PUBLICATION_VENUE", + "Name: ilias chalkidis\nType: PERSON", + "Name: legal bert\nType: MODEL_OR_ARCHITECTURE", + "Name: 2020\nType: DATE", + "Name: law school\nType: LOCATION", + "Name: ion androutsopoulos\nType: PERSON", + "Name: manos fergadiotis\nType: PERSON", + "Name: prodromos malakasiotis\nType: PERSON", + "Name: muppets\nType: PRODUCT", + "Name: nikolaos aletras\nType: PERSON" + ], + "199": [ + "Name: 3\nType: MEASUREMENT", + "Name: yeye he\nType: PERSON", + "Name: 2024\nType: DATE", + "Name: haidong zhang\nType: PERSON", + "Name: sibei chen\nType: PERSON", + "Name: dongmei zhang\nType: PERSON", + "Name: auto formula\nType: PRODUCT", + "Name: contrastive learning\nType: METHOD_OR_TECHNIQUE", + "Name: 1 27\nType: MEASUREMENT", + "Name: surajit chaudhuri\nType: PERSON", + "Name: spreadsheets\nType: PRODUCT", + "Name: proceedings of the acm on management of data\nType: PUBLICATION_VENUE", + "Name: table representations\nType: DATASET_OR_CORPUS", + "Name: ju fan\nType: 
PERSON", + "Name: formulas\nType: PRODUCT", + "Name: song ge\nType: PERSON", + "Name: weiwei cui\nType: PERSON", + "Name: 2\nType: MEASUREMENT" + ], + "200": [ + "Name: data preparation\nType: TASK_OR_PROBLEM", + "Name: human generated pipelines\nType: METHOD_OR_TECHNIQUE", + "Name: xiaoyong du\nType: PERSON", + "Name: 2023\nType: DATE", + "Name: haipipe\nType: PRODUCT", + "Name: chengliang chai\nType: PERSON", + "Name: sibei chen\nType: PERSON", + "Name: 1 26\nType: MEASUREMENT", + "Name: acm\nType: ORGANIZATION", + "Name: guoliang li\nType: PERSON", + "Name: proceedings of the acm on management of data\nType: PUBLICATION_VENUE", + "Name: ju fan\nType: PERSON", + "Name: xuemi yan\nType: PERSON", + "Name: machine generated pipelines\nType: METHOD_OR_TECHNIQUE", + "Name: 1\nType: MEASUREMENT", + "Name: nan tang\nType: PERSON" + ], + "201": [ + "Name: jaemin cho\nType: PERSON", + "Name: multi modal retrieval\nType: METHOD_OR_TECHNIQUE", + "Name: arxiv 2411 04952\nType: FILE_TYPE", + "Name: yujie he\nType: PERSON", + "Name: 2024\nType: DATE", + "Name: multi page multidocument understanding\nType: TASK_OR_PROBLEM", + "Name: ozan irsoy\nType: PERSON", + "Name: debanjan mahata\nType: PERSON", + "Name: m3docrag\nType: PRODUCT", + "Name: arxiv preprint\nType: PUBLICATION_VENUE", + "Name: arxiv\nType: PUBLICATION_VENUE", + "Name: mohit bansal\nType: PERSON" + ], + "202": [ + "Name: 6\nType: MEASUREMENT", + "Name: vassilis christophides\nType: PERSON", + "Name: george papadakis\nType: PERSON", + "Name: vasilis efthymiou\nType: PERSON", + "Name: 53\nType: MEASUREMENT", + "Name: an overview of end to end entity resolution for big data\nType: BOOK", + "Name: 2020\nType: DATE", + "Name: 1 42\nType: MEASUREMENT", + "Name: end to end entity resolution\nType: TASK_OR_PROBLEM", + "Name: csur\nType: PUBLICATION_VENUE", + "Name: big data\nType: DATASET_OR_CORPUS", + "Name: kostas stefanidis\nType: PERSON", + "Name: themis palpanas\nType: PERSON", + "Name: acm computing surveys\nType: 
PUBLICATION_VENUE" + ], + "203": [ + "Name: evan rosen\nType: PERSON", + "Name: inderjit dhillon\nType: PERSON", + "Name: eric bieber\nType: PERSON", + "Name: gemini 2 5 pushing the frontier with advanced reasoning multimodality long context and next generation agentic capabilities\nType: BOOK", + "Name: marcel blistein\nType: PERSON", + "Name: arxiv\nType: PUBLICATION_VENUE", + "Name: next generation agentic capabilities\nType: TASK_OR_PROBLEM", + "Name: ori ram\nType: PERSON", + "Name: 2025\nType: DATE", + "Name: multimodality\nType: TASK_OR_PROBLEM", + "Name: arxiv 2507 06261\nType: FILE_TYPE", + "Name: mike schaekermann\nType: PERSON", + "Name: noveen sachdeva\nType: PERSON", + "Name: ice pasupat\nType: PERSON", + "Name: gemini 2 5\nType: PRODUCT", + "Name: et al\nType: PERSON", + "Name: long context\nType: TASK_OR_PROBLEM", + "Name: arxiv preprint\nType: FILE_TYPE", + "Name: advanced reasoning\nType: TASK_OR_PROBLEM", + "Name: dan zhang\nType: PERSON", + "Name: gheorghe comanici\nType: PERSON" + ], + "204": [ + "Name: noah a smith\nType: PERSON", + "Name: pradeep dasigi\nType: PERSON", + "Name: kyle lo\nType: PERSON", + "Name: matt gardner\nType: PERSON", + "Name: arxiv preprint arxiv 2105 03011\nType: PUBLICATION_VENUE", + "Name: 2021\nType: DATE", + "Name: research papers\nType: DATASET_OR_CORPUS", + "Name: a dataset of information seeking questions and answers anchored in research papers\nType: PRODUCT", + "Name: arman cohan\nType: PERSON", + "Name: iz beltagy\nType: PERSON", + "Name: answers\nType: TASK_OR_PROBLEM", + "Name: information seeking questions\nType: TASK_OR_PROBLEM" + ], + "205": [ + "Name: arxiv preprint arxiv 2302 09051\nType: PUBLICATION_VENUE", + "Name: emmanuel bruno\nType: PERSON", + "Name: elisabeth murisasco\nType: PERSON", + "Name: xavier daull\nType: PERSON", + "Name: 2023\nType: DATE", + "Name: patrice bellot\nType: PERSON", + "Name: arxiv\nType: ORGANIZATION", + "Name: complex qa and language models hybrid architectures 
survey\nType: BOOK", + "Name: 2302 09051\nType: FILE_TYPE", + "Name: vincent martin\nType: PERSON" + ], + "206": [ + "Name: jonathan larson\nType: PERSON", + "Name: from local to global a graph rag approach to query focused summarization\nType: BOOK", + "Name: ha trinh\nType: PERSON", + "Name: apurva mody\nType: PERSON", + "Name: global\nType: CONCEPT", + "Name: 2024\nType: DATE", + "Name: query focused summarization\nType: TASK_OR_PROBLEM", + "Name: darren edge\nType: PERSON", + "Name: steven truitt\nType: PERSON", + "Name: arxiv 2404 16130\nType: PUBLICATION_VENUE", + "Name: alex chao\nType: PERSON", + "Name: joshua bradley\nType: PERSON", + "Name: newman cheng\nType: PERSON", + "Name: local\nType: CONCEPT", + "Name: graph rag\nType: TECHNOLOGY", + "Name: arxiv\nType: PUBLICATION_VENUE" + ], + "207": [ + "Name: xinyu gao\nType: PERSON", + "Name: jinliu pan\nType: PERSON", + "Name: jiawei sun\nType: PERSON", + "Name: yunfan gao\nType: PERSON", + "Name: 2023\nType: DATE", + "Name: retrieval augmented generation\nType: TECHNOLOGY", + "Name: kangxiang jia\nType: PERSON", + "Name: haofen wang\nType: PERSON", + "Name: yuxi bi\nType: PERSON", + "Name: yun xiong\nType: PERSON", + "Name: large language models\nType: TECHNOLOGY", + "Name: arxiv\nType: ORGANIZATION", + "Name: retrieval augmented generation for large language models a survey\nType: BOOK", + "Name: 2312 10997\nType: FILE_TYPE", + "Name: arxiv preprint arxiv 2312 10997\nType: PUBLICATION_VENUE", + "Name: yi dai\nType: PERSON" + ], + "208": [ + "Name: simple\nType: CONCEPT", + "Name: lightrag\nType: PRODUCT", + "Name: 2024\nType: DATE", + "Name: chao huang\nType: PERSON", + "Name: fast\nType: CONCEPT", + "Name: retrieval augmented generation\nType: TECHNOLOGY", + "Name: tu ao\nType: PERSON", + "Name: zirui guo\nType: PERSON", + "Name: lianghao xia\nType: PERSON", + "Name: yanhua yu\nType: PERSON", + "Name: arxiv e prints\nType: PUBLICATION_VENUE", + "Name: arxiv2410\nType: FILE_TYPE" + ], + "209": [ + "Name: 
arxiv 2405 14831\nType: FILE_TYPE", + "Name: neurobiologically inspired long term memory\nType: TASK_OR_PROBLEM", + "Name: yu su\nType: PERSON", + "Name: 2024\nType: DATE", + "Name: michihiro yasunaga\nType: PERSON", + "Name: yu gu\nType: PERSON", + "Name: bernal jim nez guti rrez\nType: PERSON", + "Name: hipporag\nType: MODEL_OR_ARCHITECTURE", + "Name: large language models\nType: PRODUCT", + "Name: yiheng shu\nType: PERSON", + "Name: arxiv\nType: PUBLICATION_VENUE" + ], + "210": [ + "Name: topic sensitive pagerank\nType: TECHNOLOGY", + "Name: 517 526\nType: MEASUREMENT", + "Name: world wide web\nType: TECHNOLOGY", + "Name: taher h haveliwala\nType: PERSON", + "Name: 11th international conference on world wide web\nType: EVENT", + "Name: 2002\nType: DATE" + ], + "211": [ + "Name: yann lecun\nType: PERSON", + "Name: retrieval augmented generation\nType: METHOD_OR_TECHNIQUE", + "Name: 2024\nType: DATE", + "Name: question answering\nType: TASK_OR_PROBLEM", + "Name: arxiv preprint\nType: FILE_TYPE", + "Name: xiaoxin he\nType: PERSON", + "Name: yijun tian\nType: PERSON", + "Name: xavier bresson\nType: PERSON", + "Name: textual graph understanding\nType: TASK_OR_PROBLEM", + "Name: g retriever\nType: MODEL_OR_ARCHITECTURE", + "Name: yifei sun\nType: PERSON", + "Name: arxiv 2402 07630\nType: PUBLICATION_VENUE", + "Name: nitesh v chawla\nType: PERSON", + "Name: arxiv\nType: PUBLICATION_VENUE", + "Name: bryan hooi\nType: PERSON", + "Name: thomas laurent\nType: PERSON" + ], + "212": [ + "Name: retrieval augmented language model\nType: MODEL_OR_ARCHITECTURE", + "Name: natural language processing\nType: RESEARCH_FIELD", + "Name: arxiv 2404 19543\nType: PRODUCT", + "Name: 2024\nType: DATE", + "Name: rag and rau a survey on retrieval augmented language model in natural language processing\nType: BOOK", + "Name: yucheng hu\nType: PERSON", + "Name: yuxing lu\nType: PERSON", + "Name: arxiv\nType: PUBLICATION_VENUE" + ], + "213": [ + "Name: retrieval augmented large language 
models\nType: MODEL_OR_ARCHITECTURE", + "Name: soyeong jeong\nType: PERSON", + "Name: 2024\nType: DATE", + "Name: et al\nType: PERSON", + "Name: adaptive rag\nType: MODEL_OR_ARCHITECTURE", + "Name: arxiv 2403 14403\nType: PUBLICATION_VENUE", + "Name: learning\nType: METHOD_OR_TECHNIQUE", + "Name: question complexity\nType: TASK_OR_PROBLEM", + "Name: arxiv\nType: PUBLICATION_VENUE", + "Name: jinheon baek\nType: PERSON" + ], + "214": [ + "Name: 13\nType: NUMBER" + ], + "215": [ + "Name: table: node 215...\nType: TABLE" + ], + "216": [ + "Name: maria lomeli\nType: PERSON", + "Name: timo schick\nType: PERSON", + "Name: nicola cancedda\nType: PERSON", + "Name: 2024\nType: DATE", + "Name: roberto dess\nType: PERSON", + "Name: luke zettlemoyer\nType: PERSON", + "Name: eric hambro\nType: PERSON", + "Name: jane dwivedi yu\nType: PERSON", + "Name: roberta raileanu\nType: PERSON", + "Name: thomas scialom\nType: PERSON" + ], + "217": [ + "Name: table: node 217...\nType: TABLE" + ], + "218": [ + "Name: table: node 218...\nType: TABLE" + ], + "219": [ + "Name: 14\nType: MEASUREMENT" + ], + "220": [ + "Name: a experimental details\nType: SECTION_TITLE" + ], + "221": [ + "Name: accuracy\nType: EVALUATION_METRIC", + "Name: a.1 evaluation metrics\nType: SECTION_TITLE" + ], + "222": [ + "Name: calculation procedures\nType: METHOD_OR_TECHNIQUE", + "Name: definitions\nType: CONCEPT", + "Name: main experiments\nType: EVENT", + "Name: metrics\nType: EVALUATION_METRIC" + ], + "223": [ + "Name: ground truth labels\nType: PRODUCT", + "Name: natural language responses\nType: PRODUCT", + "Name: the answer is\nType: PRODUCT", + "Name: a 1 1 answer extraction and normalization\nType: SECTION_TITLE", + "Name: standard rag models\nType: TECHNOLOGY", + "Name: 12 5\nType: MEASUREMENT", + "Name: option a\nType: PRODUCT" + ], + "224": [ + "Name: key information\nType: CONCEPT", + "Name: y hat\nType: PARAMETER_OR_VARIABLE", + "Name: span extraction\nType: TASK_OR_PROBLEM", + "Name: removing 
punctuation\nType: METHOD_OR_TECHNIQUE", + "Name: ground truth\nType: CONCEPT", + "Name: official evaluation protocols\nType: TASK_OR_PROBLEM", + "Name: lowercasing\nType: METHOD_OR_TECHNIQUE", + "Name: y gold\nType: PARAMETER_OR_VARIABLE", + "Name: n\nType: METHOD_OR_TECHNIQUE", + "Name: equation 16\nType: EQUATION_OR_FORMULA", + "Name: llm based extraction step\nType: METHOD_OR_TECHNIQUE", + "Name: y raw\nType: PARAMETER_OR_VARIABLE", + "Name: instruction\nType: PARAMETER_OR_VARIABLE", + "Name: key entity\nType: CONCEPT", + "Name: llmextract\nType: SOFTWARE", + "Name: rag system\nType: SYSTEM" + ], + "225": [ + "Name: formula (16)\nType: EQUATION_OR_FORMULA" + ], + "226": [ + "Name: ground truth (y_gold)\nType: PARAMETER_OR_VARIABLE", + "Name: substring inclusion relation\nType: METHOD_OR_TECHNIQUE", + "Name: qa performance metrics\nType: EVALUATION_METRIC", + "Name: a.1.2 qa performance metrics\nType: SECTION_TITLE", + "Name: model response (y_raw)\nType: PARAMETER_OR_VARIABLE", + "Name: accuracy\nType: EVALUATION_METRIC" + ], + "227": [ + "Name: 3\nType: PUBLICATION_VENUE", + "Name: 34\nType: PUBLICATION_VENUE", + "Name: model s generated response\nType: PRODUCT", + "Name: prior works\nType: PUBLICATION_VENUE", + "Name: normalized gold answer\nType: DATASET_OR_CORPUS", + "Name: strict exact match\nType: EVALUATION_METRIC", + "Name: soft match metric\nType: EVALUATION_METRIC", + "Name: accuracy inclusion based\nType: EVALUATION_METRIC", + "Name: 46\nType: PUBLICATION_VENUE", + "Name: llm\nType: TECHNOLOGY" + ], + "228": [ + "Name: formula (17)\nType: EQUATION_OR_FORMULA" + ], + "229": [ + "Name: accuracy\nType: EVALUATION_METRIC", + "Name: exact match\nType: EVALUATION_METRIC" + ], + "230": [ + "Name: formula (18)\nType: EQUATION_OR_FORMULA" + ], + "231": [ + "Name: token level f1 score\nType: EVALUATION_METRIC", + "Name: p\nType: PARAMETER_OR_VARIABLE", + "Name: f1 score\nType: EVALUATION_METRIC", + "Name: equation 19\nType: EQUATION_OR_FORMULA", + "Name: 
f1\nType: PARAMETER_OR_VARIABLE", + "Name: r\nType: PARAMETER_OR_VARIABLE" + ], + "232": [ + "Name: formula (19)\nType: EQUATION_OR_FORMULA" + ], + "233": [ + "Name: 15\nType: MEASUREMENT" + ], + "234": [ + "Name: retrieval quality\nType: EVALUATION_METRIC", + "Name: a.1.3 retrieval recall\nType: SECTION_TITLE", + "Name: b_ret\nType: PARAMETER_OR_VARIABLE", + "Name: recall_ret\nType: EVALUATION_METRIC", + "Name: pdf blocks\nType: DATASET_OR_CORPUS", + "Name: query q\nType: PARAMETER_OR_VARIABLE", + "Name: b_gold\nType: PARAMETER_OR_VARIABLE" + ], + "235": [ + "Name: formula (20)\nType: EQUATION_OR_FORMULA" + ], + "236": [ + "Name: recall\nType: EVALUATION_METRIC", + "Name: ground truth block\nType: TASK_OR_PROBLEM", + "Name: 0\nType: NUMBER", + "Name: candidate pool\nType: DATASET_OR_CORPUS", + "Name: pdf\nType: FILE_TYPE" + ], + "237": [ + "Name: a.2 implementation details\nType: SECTION_TITLE" + ], + "238": [ + "Name: robust document layout parsing\nType: TASK_OR_PROBLEM", + "Name: 1024gb\nType: MEASUREMENT", + "Name: fair comparison\nType: CONCEPT", + "Name: sam234990\nType: PERSON", + "Name: source code\nType: PRODUCT", + "Name: qwen family\nType: MODEL_OR_ARCHITECTURE", + "Name: ground truth images\nType: IMAGE", + "Name: https github com sam234990 bookrag\nType: LOCATION", + "Name: vlm\nType: MODEL_OR_ARCHITECTURE", + "Name: reference 63\nType: PUBLICATION_VENUE", + "Name: 10\nType: MEASUREMENT", + "Name: 500 tokens\nType: MEASUREMENT", + "Name: gme qwen2 vl 2b instruct\nType: MODEL_OR_ARCHITECTURE", + "Name: 24 gb\nType: MEASUREMENT", + "Name: python\nType: PROGRAMMING_LANGUAGE", + "Name: baseline methods\nType: TASK_OR_PROBLEM", + "Name: linux\nType: SOFTWARE", + "Name: performance deficits\nType: CONCEPT", + "Name: bookrag\nType: PRODUCT", + "Name: retrieval ranking\nType: METHOD_OR_TECHNIQUE", + "Name: reranking\nType: TASK_OR_PROBLEM", + "Name: reference 52\nType: PUBLICATION_VENUE", + "Name: github repository\nType: LOCATION", + "Name: candidate 
pool\nType: TASK_OR_PROBLEM", + "Name: document chunking\nType: METHOD_OR_TECHNIQUE", + "Name: multi modal embedding\nType: TASK_OR_PROBLEM", + "Name: reference 4\nType: PUBLICATION_VENUE", + "Name: 8b counterpart\nType: MEASUREMENT", + "Name: intel xeon 2 0ghz cpu\nType: HARDWARE", + "Name: llm\nType: MODEL_OR_ARCHITECTURE", + "Name: qwen3 embedding 0 6b\nType: MODEL_OR_ARCHITECTURE", + "Name: high performance server\nType: LOCATION", + "Name: implementation configurations\nType: PRODUCT", + "Name: qwen3 reranker 4b\nType: MODEL_OR_ARCHITECTURE", + "Name: nvidia geforce rtx a5000\nType: HARDWARE", + "Name: reference 64\nType: PUBLICATION_VENUE", + "Name: mineru\nType: SOFTWARE", + "Name: text embedding\nType: TASK_OR_PROBLEM", + "Name: efficiency\nType: CONCEPT", + "Name: effectiveness\nType: CONCEPT", + "Name: reproducibility\nType: CONCEPT", + "Name: embedding models\nType: MODEL_OR_ARCHITECTURE", + "Name: qwen2 5vl 30b\nType: MODEL_OR_ARCHITECTURE", + "Name: 10b parameter scale\nType: MEASUREMENT", + "Name: reference 60\nType: PUBLICATION_VENUE", + "Name: qwen3 8b\nType: MODEL_OR_ARCHITECTURE", + "Name: 30b version\nType: MEASUREMENT", + "Name: sequential processing mode\nType: TASK_OR_PROBLEM" + ], + "239": [ + "Name: prompts\nType: METHOD_OR_TECHNIQUE", + "Name: a.3 prompts\nType: SECTION_TITLE" + ], + "240": [ + "Name: agent based query classification\nType: TASK_OR_PROBLEM", + "Name: figure 10\nType: IMAGE", + "Name: figure 11\nType: IMAGE", + "Name: prompts\nType: PRODUCT", + "Name: figure 12\nType: IMAGE", + "Name: filter operator generation\nType: TASK_OR_PROBLEM", + "Name: question decomposition\nType: TASK_OR_PROBLEM", + "Name: graph construction phase\nType: TASK_OR_PROBLEM", + "Name: figure 13\nType: IMAGE", + "Name: entity resolution judgment\nType: TASK_OR_PROBLEM" + ], + "241": [ + "Name: complex\nType: TASK_OR_PROBLEM", + "Name: global\nType: TASK_OR_PROBLEM", + "Name: simple\nType: TASK_OR_PROBLEM", + "Name: json object\nType: FILE_TYPE", + 
"Name: expert query analyzer\nType: PERSON", + "Name: user\nType: PERSON" + ], + "242": [ + "Name: category definitions\nType: SECTION_TITLE" + ], + "243": [ + "Name: table\nType: SECTION_TITLE", + "Name: contiguous location\nType: UNKNOWN", + "Name: single hop\nType: TASK_OR_PROBLEM", + "Name: single\nType: UNKNOWN", + "Name: information\nType: CONCEPT", + "Name: document\nType: CONCEPT", + "Name: question\nType: TASK_OR_PROBLEM", + "Name: paragraph\nType: SECTION_TITLE", + "Name: figure\nType: SECTION_TITLE" + ], + "245": [ + "Name: figure 2\nType: IMAGE" + ], + "246": [ + "Name: latinos\nType: NATIONALITY", + "Name: economic upward mobility\nType: TASK_OR_PROBLEM", + "Name: children\nType: PERSON", + "Name: 5\nType: PERCENTAGE" + ], + "247": [ + "Name: multi hop\nType: TASK_OR_PROBLEM" + ], + "249": [ + "Name: personality vector\nType: TASK_OR_PROBLEM" + ], + "250": [ + "Name: aggregation operation\nType: UNKNOWN", + "Name: global\nType: TASK_OR_PROBLEM", + "Name: items\nType: UNKNOWN", + "Name: structural filter\nType: METHOD_OR_TECHNIQUE", + "Name: counting\nType: METHOD_OR_TECHNIQUE", + "Name: listing\nType: METHOD_OR_TECHNIQUE", + "Name: summarizing\nType: METHOD_OR_TECHNIQUE" + ], + "251": [ + "Name: example\nType: TASK_OR_PROBLEM", + "Name: table\nType: PRODUCT", + "Name: global\nType: CONCEPT" + ], + "252": [ + "Name: user query\nType: TASK_OR_PROBLEM" + ], + "253": [ + "Name: figure 10\nType: IMAGE", + "Name: query classification\nType: TASK_OR_PROBLEM" + ], + "254": [ + "Name: 16\nType: MEASUREMENT" + ], + "255": [ + "Name: user a2gbifl43u1lkj\nType: PERSON", + "Name: type\nType: SECTION_TITLE", + "Name: personality vector\nType: PRODUCT", + "Name: complex question\nType: TASK_OR_PROBLEM", + "Name: sub questions\nType: SECTION_TITLE", + "Name: example 2\nType: EVENT", + "Name: retrieval sub question\nType: TASK_OR_PROBLEM", + "Name: json object\nType: FILE_TYPE", + "Name: color\nType: COLOR", + "Name: soft labeled personality embedding matrix\nType: 
PRODUCT", + "Name: user query\nType: TASK_OR_PROBLEM", + "Name: query decomposition expert\nType: PROFESSION", + "Name: synthesis question\nType: TASK_OR_PROBLEM", + "Name: report\nType: BOOK", + "Name: population\nType: MEASUREMENT", + "Name: latinos interviewed by cellphone\nType: PERSON", + "Name: question\nType: SECTION_TITLE", + "Name: receptiviti score\nType: EVALUATION_METRIC", + "Name: example 1\nType: EVENT", + "Name: survey\nType: EVENT", + "Name: foreign born latinos\nType: PERSON", + "Name: simple atomic sub questions\nType: TASK_OR_PROBLEM" + ], + "256": [ + "Name: figure 11\nType: IMAGE", + "Name: query decomposition\nType: TASK_OR_PROBLEM" + ], + "257": [ + "Name: 17\nType: NUMBER" + ], + "258": [ + "Name: 3\nType: MEASUREMENT", + "Name: data augmentation\nType: METHOD_OR_TECHNIQUE", + "Name: methodology\nType: SECTION_TITLE", + "Name: figures\nType: IMAGE", + "Name: user\nType: PERSON", + "Name: appendices\nType: SECTION_TITLE", + "Name: count\nType: TASK_OR_PROBLEM", + "Name: 10\nType: MEASUREMENT", + "Name: assistant\nType: PERSON", + "Name: summarize\nType: TASK_OR_PROBLEM", + "Name: section\nType: SECTION_TITLE", + "Name: null\nType: TASK_OR_PROBLEM", + "Name: json object\nType: FILE_TYPE", + "Name: filters\nType: TASK_OR_PROBLEM", + "Name: table\nType: TABLE", + "Name: chapter\nType: SECTION_TITLE", + "Name: 3 10\nType: MEASUREMENT", + "Name: list\nType: TASK_OR_PROBLEM", + "Name: global query\nType: TASK_OR_PROBLEM", + "Name: page\nType: MEASUREMENT", + "Name: discussion\nType: TASK_OR_PROBLEM", + "Name: ai assistant\nType: PERSON", + "Name: image\nType: IMAGE", + "Name: report\nType: BOOK", + "Name: references\nType: SECTION_TITLE", + "Name: operation\nType: TASK_OR_PROBLEM", + "Name: paper\nType: BOOK", + "Name: analyze\nType: TASK_OR_PROBLEM" + ], + "259": [ + "Name: figure 12\nType: IMAGE", + "Name: filter operator generation\nType: TASK_OR_PROBLEM" + ], + "260": [ + "Name: 18\nType: NUMBER" + ], + "262": [ + "Name: id\nType: 
PARAMETER_OR_VARIABLE", + "Name: candidate entities\nType: TASK_OR_PROBLEM", + "Name: new entity\nType: TASK_OR_PROBLEM", + "Name: knowledge graph\nType: TASK_OR_PROBLEM", + "Name: 1\nType: VALUE", + "Name: json object\nType: FILE_TYPE", + "Name: knowledge base\nType: TASK_OR_PROBLEM", + "Name: entity resolution adjudicator\nType: PERSON", + "Name: text\nType: DATASET_OR_CORPUS", + "Name: explanation\nType: TASK_OR_PROBLEM" + ], + "265": [ + "Name: new entity\nType: TASK_OR_PROBLEM" + ], + "266": [ + "Name: field by field adjudication\nType: TASK_OR_PROBLEM" + ], + "267": [ + "Name: large language model\nType: TECHNOLOGY", + "Name: distinct concepts\nType: CONCEPT", + "Name: alias\nType: CONCEPT", + "Name: high importance\nType: CONCEPT", + "Name: event detection\nType: TASK_OR_PROBLEM", + "Name: entity name\nType: TASK_OR_PROBLEM", + "Name: llm\nType: TECHNOLOGY", + "Name: named entity recognition\nType: TASK_OR_PROBLEM" + ], + "268": [ + "Name: entity type\nType: TASK_OR_PROBLEM" + ], + "269": [ + "Name: contextual importance\nType: CONCEPT", + "Name: description\nType: CONCEPT" + ], + "270": [ + "Name: be strict and conservative\nType: TASK_OR_PROBLEM" + ], + "272": [ + "Name: apple inc\nType: ORGANIZATION", + "Name: apple\nType: PRODUCT" + ], + "273": [ + "Name: when in doubt\nType: TASK_OR_PROBLEM", + "Name: 1\nType: UNKNOWN" + ], + "274": [ + "Name: new entity\nType: TASK_OR_PROBLEM" + ], + "275": [ + "Name: output\nType: UNKNOWN", + "Name: json\nType: FILE_TYPE" + ], + "276": [ + "Name: id\nType: PARAMETER_OR_VARIABLE", + "Name: exact match\nType: TASK_OR_PROBLEM", + "Name: select id\nType: PARAMETER_OR_VARIABLE", + "Name: 1\nType: MONEY", + "Name: integer\nType: MEASUREMENT", + "Name: candidate\nType: TASK_OR_PROBLEM" + ], + "277": [ + "Name: explanation\nType: TASK_OR_PROBLEM" + ], + "281": [ + "Name: select id\nType: PARAMETER_OR_VARIABLE", + "Name: example 2\nType: TASK_OR_PROBLEM", + "Name: example 1\nType: TASK_OR_PROBLEM", + "Name: explanation\nType: 
PARAMETER_OR_VARIABLE" + ], + "282": [ + "Name: integer\nType: MEASUREMENT", + "Name: selection task\nType: TASK_OR_PROBLEM" + ], + "284": [ + "Name: figure 13\nType: IMAGE", + "Name: examples\nType: DATASET_OR_CORPUS", + "Name: entity resolution\nType: TASK_OR_PROBLEM", + "Name: prompt\nType: SOFTWARE" + ], + "285": [ + "Name: 19\nType: NUMBER" + ] + }, + "variant": "basic" +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_1.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_1.json new file mode 100644 index 0000000..40079dc --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_1.json @@ -0,0 +1,87 @@ +{ + "entities": [ + { + "entity_name": "bookrag: a hierarchical structure-aware index-based approach for retrieval-augmented generation on complex documents", + "entity_type": "SECTION_TITLE", + "description": "As the primary title of the document, this section introduces BookRAG, a novel approach designed to handle complex documents by utilizing hierarchical structure awareness and index-based mechanisms within a Retrieval-Augmented Generation framework.", + "source_ids": [ + 1 + ] + }, + { + "entity_name": "bookrag", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "The specific name of the proposed model or architecture introduced in the document.", + "source_ids": [ + 1 + ] + }, + { + "entity_name": "hierarchical structure-aware index-based approach", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The core methodology employed by BookRAG, focusing on leveraging document hierarchy and indexing strategies.", + "source_ids": [ + 1 + ] + }, + { + "entity_name": "retrieval-augmented generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "The broader AI task domain addressed by the proposed approach, involving combining retrieval systems with generative models.", + "source_ids": [ + 1 + ] + }, + { + "entity_name": "complex documents", + "entity_type": "DATASET_OR_CORPUS", 
+ "description": "The target data type or corpus category that the system is specifically designed to process.", + "source_ids": [ + 1 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "bookrag: a hierarchical structure-aware index-based approach for retrieval-augmented generation on complex documents", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'BookRAG' is the primary subject defined in the main title.", + "source_ids": [ + 1 + ] + }, + { + "src_entity_name": "hierarchical structure-aware index-based approach", + "tgt_entity_name": "bookrag: a hierarchical structure-aware index-based approach for retrieval-augmented generation on complex documents", + "relation_name": "", + "weight": 10.0, + "description": "The methodological approach is a key component described in the main title.", + "source_ids": [ + 1 + ] + }, + { + "src_entity_name": "retrieval-augmented generation", + "tgt_entity_name": "bookrag: a hierarchical structure-aware index-based approach for retrieval-augmented generation on complex documents", + "relation_name": "", + "weight": 10.0, + "description": "The application domain or task is a central theme of the main title.", + "source_ids": [ + 1 + ] + }, + { + "src_entity_name": "complex documents", + "tgt_entity_name": "bookrag: a hierarchical structure-aware index-based approach for retrieval-augmented generation on complex documents", + "relation_name": "", + "weight": 10.0, + "description": "The target data scope is explicitly mentioned as a focus area in the main title.", + "source_ids": [ + 1 + ] + } + ], + "node_idx": 1 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_10.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_10.json new file mode 100644 index 0000000..930f4c0 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_10.json @@ -0,0 +1,87 @@ +{ + "entities": [ + { + "entity_name": "creative commons by 
nc nd 4 0 international license", + "entity_type": "LAW", + "description": "creative commons by nc nd 4 0 international license is the specific license under which this work is distributed", + "source_ids": [ + 10 + ] + }, + { + "entity_name": "vldb endowment", + "entity_type": "ORGANIZATION", + "description": "vldb endowment is the organization that holds the publication rights for this work", + "source_ids": [ + 10 + ] + }, + { + "entity_name": "info vldb org", + "entity_type": "EMAIL", + "description": "info vldb org is the email address provided for obtaining permission for uses beyond the license", + "source_ids": [ + 10 + ] + }, + { + "entity_name": "creative commons", + "entity_type": "ORGANIZATION", + "description": "creative commons is the organization that created the by nc nd 4 0 international license", + "source_ids": [ + 10 + ] + }, + { + "entity_name": "owner author s", + "entity_type": "PERSON", + "description": "owner author s refers to the individuals or entities holding the copyright for the work", + "source_ids": [ + 10 + ] + } + ], + "relations": [ + { + "src_entity_name": "creative commons by nc nd 4 0 international license", + "tgt_entity_name": "vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "the work licensed under the creative commons by nc nd 4 0 international license has its publication rights licensed to the vldb endowment", + "source_ids": [ + 10 + ] + }, + { + "src_entity_name": "creative commons", + "tgt_entity_name": "creative commons by nc nd 4 0 international license", + "relation_name": "", + "weight": 9.0, + "description": "creative commons is the creator of the by nc nd 4 0 international license", + "source_ids": [ + 10 + ] + }, + { + "src_entity_name": "owner author s", + "tgt_entity_name": "creative commons by nc nd 4 0 international license", + "relation_name": "", + "weight": 8.0, + "description": "the owner author s hold the copyright for the work which is licensed under the creative commons by nc 
nd 4 0 international license", + "source_ids": [ + 10 + ] + }, + { + "src_entity_name": "owner author s", + "tgt_entity_name": "vldb endowment", + "relation_name": "", + "weight": 7.0, + "description": "the owner author s hold the copyright while the vldb endowment is licensed the publication rights", + "source_ids": [ + 10 + ] + } + ], + "node_idx": 10 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_100.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_100.json new file mode 100644 index 0000000..0c16f6f --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_100.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (3)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the output of an LLM function as a set of elements. LaTeX: 𝐸 𝑞 = LLM ( 𝑃 𝐸𝑥𝑡 , 𝑞 ) = { 𝑒 1 , 𝑒 2 , . . . , 𝑒 𝑚 } (3)", + "source_ids": [ + 100 + ] + } + ], + "relations": [], + "node_idx": 100 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_101.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_101.json new file mode 100644 index 0000000..ee92740 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_101.json @@ -0,0 +1,143 @@ +{ + "entities": [ + { + "entity_name": "q", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "q is the original user query mentioned in the text", + "source_ids": [ + 101 + ] + }, + { + "entity_name": "p dec", + "entity_type": "SOFTWARE", + "description": "p dec represents a prompt used to guide the llm for the decomposition task", + "source_ids": [ + 101 + ] + }, + { + "entity_name": "p ext", + "entity_type": "SOFTWARE", + "description": "p ext represents a prompt used to guide the llm for the extraction task", + "source_ids": [ + 101 + ] + }, + { + "entity_name": "llm", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "llm is the model being guided by the prompts for 
decomposition and extraction tasks", + "source_ids": [ + 101 + ] + }, + { + "entity_name": "decomposition", + "entity_type": "TASK_OR_PROBLEM", + "description": "decomposition is a task for which the prompt p dec is used to guide the llm", + "source_ids": [ + 101 + ] + }, + { + "entity_name": "extraction", + "entity_type": "TASK_OR_PROBLEM", + "description": "extraction is a task for which the prompt p ext is used to guide the llm", + "source_ids": [ + 101 + ] + }, + { + "entity_name": "prompt", + "entity_type": "SOFTWARE", + "description": "prompts are instructions used to guide the llm for specific tasks", + "source_ids": [ + 101 + ] + } + ], + "relations": [ + { + "src_entity_name": "p dec", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "p dec is used to guide the llm for the decomposition task", + "source_ids": [ + 101 + ] + }, + { + "src_entity_name": "p ext", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "p ext is used to guide the llm for the extraction task", + "source_ids": [ + 101 + ] + }, + { + "src_entity_name": "q", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 7.0, + "description": "q is the original user query that the llm processes", + "source_ids": [ + 101 + ] + }, + { + "src_entity_name": "p dec", + "tgt_entity_name": "decomposition", + "relation_name": "", + "weight": 10.0, + "description": "p dec is the specific prompt used to guide the llm for the decomposition task", + "source_ids": [ + 101 + ] + }, + { + "src_entity_name": "p ext", + "tgt_entity_name": "extraction", + "relation_name": "", + "weight": 10.0, + "description": "p ext is the specific prompt used to guide the llm for the extraction task", + "source_ids": [ + 101 + ] + }, + { + "src_entity_name": "prompt", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 8.0, + "description": "prompts are used to guide the llm", + "source_ids": [ + 101 + ] + }, + { + "src_entity_name": 
"decomposition", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "the llm performs the decomposition task", + "source_ids": [ + 101 + ] + }, + { + "src_entity_name": "extraction", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "the llm performs the extraction task", + "source_ids": [ + 101 + ] + } + ], + "node_idx": 101 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_102.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_102.json new file mode 100644 index 0000000..2a3b92e --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_102.json @@ -0,0 +1,297 @@ +{ + "entities": [ + { + "entity_name": "selector", + "entity_type": "TECHNOLOGY", + "description": "selector is an operator that filters or selects specific content ranges from the bookindex", + "source_ids": [ + 102 + ] + }, + { + "entity_name": "bookindex", + "entity_type": "PRODUCT", + "description": "bookindex is the source of content ranges that the selector operators filter", + "source_ids": [ + 102 + ] + }, + { + "entity_name": "filter modal", + "entity_type": "TECHNOLOGY", + "description": "filter modal is an operator that applies explicit constraints to the bookindex", + "source_ids": [ + 102 + ] + }, + { + "entity_name": "filter range", + "entity_type": "TECHNOLOGY", + "description": "filter range is an operator that applies explicit constraints to the bookindex", + "source_ids": [ + 102 + ] + }, + { + "entity_name": "c", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "c represents explicit constraints such as modal types and page ranges generated during a plan", + "source_ids": [ + 102 + ] + }, + { + "entity_name": "tree", + "entity_type": "TASK_OR_PROBLEM", + "description": "tree is the data structure t n e t on which the operators operate", + "source_ids": [ + 102 + ] + }, + { + "entity_name": "n", + "entity_type": "PARAMETER_OR_VARIABLE", + 
"description": "n represents the set of nodes in the tree", + "source_ids": [ + 102 + ] + }, + { + "entity_name": "e t", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "e t represents the set of edges in the tree", + "source_ids": [ + 102 + ] + }, + { + "entity_name": "n f", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "n f is the filtered subset of nodes produced by the operators", + "source_ids": [ + 102 + ] + }, + { + "entity_name": "c n", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "c n is a predicate that holds true for each node in the filtered subset", + "source_ids": [ + 102 + ] + }, + { + "entity_name": "modal types", + "entity_type": "CONCEPT", + "description": "modal types are a specific type of explicit constraint c mentioned in the text", + "source_ids": [ + 102 + ] + }, + { + "entity_name": "page ranges", + "entity_type": "CONCEPT", + "description": "page ranges are a specific type of explicit constraint c mentioned in the text", + "source_ids": [ + 102 + ] + }, + { + "entity_name": "plan", + "entity_type": "TASK_OR_PROBLEM", + "description": "the plan is the process during which explicit constraints c are generated", + "source_ids": [ + 102 + ] + }, + { + "entity_name": "nodes", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "nodes are the individual elements within the tree t that are evaluated by the predicate", + "source_ids": [ + 102 + ] + }, + { + "entity_name": "edges", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "edges are the connections within the tree t denoted as e t", + "source_ids": [ + 102 + ] + } + ], + "relations": [ + { + "src_entity_name": "selector", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "selector operators filter content ranges directly from the bookindex", + "source_ids": [ + 102 + ] + }, + { + "src_entity_name": "filter modal", + "tgt_entity_name": "c", + "relation_name": "", + "weight": 8.0, + "description": 
"filter modal applies the explicit constraints c generated during the plan", + "source_ids": [ + 102 + ] + }, + { + "src_entity_name": "filter range", + "tgt_entity_name": "c", + "relation_name": "", + "weight": 8.0, + "description": "filter range applies the explicit constraints c generated during the plan", + "source_ids": [ + 102 + ] + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "tree", + "relation_name": "", + "weight": 9.0, + "description": "the selector operators operate on the tree t n e t to produce a filtered subset", + "source_ids": [ + 102 + ] + }, + { + "src_entity_name": "filter modal", + "tgt_entity_name": "tree", + "relation_name": "", + "weight": 8.0, + "description": "filter modal operates on the tree to produce a filtered subset", + "source_ids": [ + 102 + ] + }, + { + "src_entity_name": "filter range", + "tgt_entity_name": "tree", + "relation_name": "", + "weight": 8.0, + "description": "filter range operates on the tree to produce a filtered subset", + "source_ids": [ + 102 + ] + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "n f", + "relation_name": "", + "weight": 9.0, + "description": "the selector operators produce the filtered subset n f", + "source_ids": [ + 102 + ] + }, + { + "src_entity_name": "filter modal", + "tgt_entity_name": "n f", + "relation_name": "", + "weight": 8.0, + "description": "filter modal contributes to the production of the filtered subset n f", + "source_ids": [ + 102 + ] + }, + { + "src_entity_name": "filter range", + "tgt_entity_name": "n f", + "relation_name": "", + "weight": 8.0, + "description": "filter range contributes to the production of the filtered subset n f", + "source_ids": [ + 102 + ] + }, + { + "src_entity_name": "n f", + "tgt_entity_name": "c n", + "relation_name": "", + "weight": 9.0, + "description": "the filtered subset n f consists of nodes where the predicate c n holds true", + "source_ids": [ + 102 + ] + }, + { + "src_entity_name": "c", + "tgt_entity_name": "modal 
types", + "relation_name": "", + "weight": 10.0, + "description": "modal types are examples of the explicit constraints c", + "source_ids": [ + 102 + ] + }, + { + "src_entity_name": "c", + "tgt_entity_name": "page ranges", + "relation_name": "", + "weight": 10.0, + "description": "page ranges are examples of the explicit constraints c", + "source_ids": [ + 102 + ] + }, + { + "src_entity_name": "c", + "tgt_entity_name": "plan", + "relation_name": "", + "weight": 9.0, + "description": "the constraints c are generated during the plan", + "source_ids": [ + 102 + ] + }, + { + "src_entity_name": "tree", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 10.0, + "description": "the tree t is composed of the set of nodes n", + "source_ids": [ + 102 + ] + }, + { + "src_entity_name": "tree", + "tgt_entity_name": "edges", + "relation_name": "", + "weight": 10.0, + "description": "the tree t is composed of the set of edges e t", + "source_ids": [ + 102 + ] + }, + { + "src_entity_name": "n f", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 9.0, + "description": "the filtered subset n f is a subset of the nodes n", + "source_ids": [ + 102 + ] + }, + { + "src_entity_name": "c n", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 9.0, + "description": "the predicate c n is evaluated for each node in the set", + "source_ids": [ + 102 + ] + } + ], + "node_idx": 102 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_103.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_103.json new file mode 100644 index 0000000..f5e08a2 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_103.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (4)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the set N_f as a subset of N based on condition C. 
LaTeX: 𝑁 𝑓 = { 𝑛 ∈ 𝑁 | 𝐶 𝑛 ( )} (4)", + "source_ids": [ + 103 + ] + } + ], + "relations": [], + "node_idx": 103 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_104.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_104.json new file mode 100644 index 0000000..8c88080 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_104.json @@ -0,0 +1,251 @@ +{ + "entities": [ + { + "entity_name": "select by entity", + "entity_type": "TECHNOLOGY", + "description": "select by entity is a method that targets contiguous document segments by retrieving subtrees rooted at specific section nodes", + "source_ids": [ + 104 + ] + }, + { + "entity_name": "select by section", + "entity_type": "TECHNOLOGY", + "description": "select by section is a method that targets contiguous document segments by retrieving subtrees rooted at specific section nodes", + "source_ids": [ + 104 + ] + }, + { + "entity_name": "gt link", + "entity_type": "TECHNOLOGY", + "description": "gt link is a mechanism used to link sections to entities", + "source_ids": [ + 104 + ] + }, + { + "entity_name": "llm", + "entity_type": "TECHNOLOGY", + "description": "llm is a system used to select sections in the described process", + "source_ids": [ + 104 + ] + }, + { + "entity_name": "s target", + "entity_type": "TASK_OR_PROBLEM", + "description": "s target represents a set of target section nodes at a specified depth", + "source_ids": [ + 104 + ] + }, + { + "entity_name": "n", + "entity_type": "TASK_OR_PROBLEM", + "description": "n represents the set of nodes in the document structure", + "source_ids": [ + 104 + ] + }, + { + "entity_name": "e q", + "entity_type": "TASK_OR_PROBLEM", + "description": "e q represents the entities linked to sections via gt link", + "source_ids": [ + 104 + ] + }, + { + "entity_name": "n s", + "entity_type": "TASK_OR_PROBLEM", + "description": "n s represents the selected node set formed by retrieving descendants of target 
sections", + "source_ids": [ + 104 + ] + }, + { + "entity_name": "document", + "entity_type": "TASK_OR_PROBLEM", + "description": "document is the text being processed by the select by entity and select by section methods", + "source_ids": [ + 104 + ] + }, + { + "entity_name": "subtree", + "entity_type": "TASK_OR_PROBLEM", + "description": "subtree refers to the data structure rooted at specific section nodes that is retrieved by the methods", + "source_ids": [ + 104 + ] + }, + { + "entity_name": "section node", + "entity_type": "TASK_OR_PROBLEM", + "description": "section node is a specific node within the document structure that serves as a root for subtrees", + "source_ids": [ + 104 + ] + }, + { + "entity_name": "depth", + "entity_type": "MEASUREMENT", + "description": "depth is a specified parameter determining the level of the target section nodes", + "source_ids": [ + 104 + ] + }, + { + "entity_name": "descendant", + "entity_type": "TASK_OR_PROBLEM", + "description": "descendant refers to the nodes below the target section nodes that are retrieved to form the selected node set", + "source_ids": [ + 104 + ] + } + ], + "relations": [ + { + "src_entity_name": "select by entity", + "tgt_entity_name": "s target", + "relation_name": "", + "weight": 9.0, + "description": "select by entity identifies a set of target section nodes s target as part of its process", + "source_ids": [ + 104 + ] + }, + { + "src_entity_name": "select by section", + "tgt_entity_name": "s target", + "relation_name": "", + "weight": 9.0, + "description": "select by section identifies a set of target section nodes s target as part of its process", + "source_ids": [ + 104 + ] + }, + { + "src_entity_name": "s target", + "tgt_entity_name": "e q", + "relation_name": "", + "weight": 8.0, + "description": "s target consists of sections linked to entities e q via gt link", + "source_ids": [ + 104 + ] + }, + { + "src_entity_name": "s target", + "tgt_entity_name": "llm", + "relation_name": "", + 
"weight": 8.0, + "description": "s target includes sections selected by the llm", + "source_ids": [ + 104 + ] + }, + { + "src_entity_name": "s target", + "tgt_entity_name": "n s", + "relation_name": "", + "weight": 9.0, + "description": "n s is formed by retrieving all descendants of the target section nodes s target", + "source_ids": [ + 104 + ] + }, + { + "src_entity_name": "gt link", + "tgt_entity_name": "e q", + "relation_name": "", + "weight": 9.0, + "description": "gt link is the mechanism used to link sections to entities e q", + "source_ids": [ + 104 + ] + }, + { + "src_entity_name": "select by entity", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 9.0, + "description": "select by entity targets contiguous segments within the document", + "source_ids": [ + 104 + ] + }, + { + "src_entity_name": "select by section", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 9.0, + "description": "select by section targets contiguous segments within the document", + "source_ids": [ + 104 + ] + }, + { + "src_entity_name": "select by entity", + "tgt_entity_name": "subtree", + "relation_name": "", + "weight": 9.0, + "description": "select by entity retrieves subtrees rooted at specific section nodes", + "source_ids": [ + 104 + ] + }, + { + "src_entity_name": "select by section", + "tgt_entity_name": "subtree", + "relation_name": "", + "weight": 9.0, + "description": "select by section retrieves subtrees rooted at specific section nodes", + "source_ids": [ + 104 + ] + }, + { + "src_entity_name": "s target", + "tgt_entity_name": "section node", + "relation_name": "", + "weight": 10.0, + "description": "s target consists of specific section nodes", + "source_ids": [ + 104 + ] + }, + { + "src_entity_name": "s target", + "tgt_entity_name": "depth", + "relation_name": "", + "weight": 8.0, + "description": "s target is defined at a specified depth", + "source_ids": [ + 104 + ] + }, + { + "src_entity_name": "n s", + "tgt_entity_name": 
"descendant", + "relation_name": "", + "weight": 9.0, + "description": "n s is formed by retrieving all descendants of the target sections", + "source_ids": [ + 104 + ] + }, + { + "src_entity_name": "gt link", + "tgt_entity_name": "section node", + "relation_name": "", + "weight": 8.0, + "description": "gt link links sections nodes to entities", + "source_ids": [ + 104 + ] + } + ], + "node_idx": 104 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_105.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_105.json new file mode 100644 index 0000000..6a7911d --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_105.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (5)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the variable N_s as a glyph value for an element s in set S within a target subtree. LaTeX: 𝑁 𝑠 = GLYPH<216> 𝑠 ∈ 𝑆 target Subtree ( 𝑠 ) (5)", + "source_ids": [ + 105 + ] + } + ], + "relations": [], + "node_idx": 105 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_106.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_106.json new file mode 100644 index 0000000..240a5ce --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_106.json @@ -0,0 +1,191 @@ +{ + "entities": [ + { + "entity_name": "reasoner", + "entity_type": "TASK_OR_PROBLEM", + "description": "reasoner is described as a component that analyzes and refines selected tree nodes", + "source_ids": [ + 106 + ] + }, + { + "entity_name": "graph reasoning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "graph reasoning is a method that performs multi hop inference on a subgraph starting from an entity", + "source_ids": [ + 106 + ] + }, + { + "entity_name": "pagerank algorithm", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "pagerank algorithm is used to compute an entity importance vector over a 
subgraph", + "source_ids": [ + 106 + ] + }, + { + "entity_name": "gt link matrix", + "entity_type": "SOFTWARE", + "description": "gt link matrix is a matrix used to map entity scores to tree nodes to derive importance scores", + "source_ids": [ + 106 + ] + }, + { + "entity_name": "entity importance vector", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "entity importance vector is a vector computed over a subgraph representing the importance of entities", + "source_ids": [ + 106 + ] + }, + { + "entity_name": "tree node importance scores vector", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "tree node importance scores vector is the final vector derived by mapping entity scores to tree nodes", + "source_ids": [ + 106 + ] + }, + { + "entity_name": "subgraph", + "entity_type": "TASK_OR_PROBLEM", + "description": "subgraph is a portion of a graph extracted from selected nodes on which inference is performed", + "source_ids": [ + 106 + ] + }, + { + "entity_name": "entity", + "entity_type": "TASK_OR_PROBLEM", + "description": "entity is the starting point for the multi hop inference process in graph reasoning", + "source_ids": [ + 106 + ] + }, + { + "entity_name": "selected nodes", + "entity_type": "TASK_OR_PROBLEM", + "description": "selected nodes are the nodes from which a subgraph is extracted for graph reasoning", + "source_ids": [ + 106 + ] + }, + { + "entity_name": "20", + "entity_type": "PUBLICATION_VENUE", + "description": "20 is a citation reference associated with the pagerank algorithm", + "source_ids": [ + 106 + ] + }, + { + "entity_name": "6", + "entity_type": "EQUATION_OR_FORMULA", + "description": "6 is the label for the equation defining the entity importance vector", + "source_ids": [ + 106 + ] + }, + { + "entity_name": "7", + "entity_type": "EQUATION_OR_FORMULA", + "description": "7 is the label for the equation defining the tree node importance scores vector", + "source_ids": [ + 106 + ] + }, + { + "entity_name": "selected 
tree nodes", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 106 + ] + } + ], + "relations": [ + { + "src_entity_name": "reasoner", + "tgt_entity_name": "selected tree nodes", + "relation_name": "", + "weight": 9.0, + "description": "reasoner analyzes and refines selected tree nodes", + "source_ids": [ + 106 + ] + }, + { + "src_entity_name": "graph reasoning", + "tgt_entity_name": "subgraph", + "relation_name": "", + "weight": 10.0, + "description": "graph reasoning performs multi hop inference on a subgraph", + "source_ids": [ + 106 + ] + }, + { + "src_entity_name": "graph reasoning", + "tgt_entity_name": "entity", + "relation_name": "", + "weight": 9.0, + "description": "graph reasoning starts its inference process from an entity", + "source_ids": [ + 106 + ] + }, + { + "src_entity_name": "graph reasoning", + "tgt_entity_name": "pagerank algorithm", + "relation_name": "", + "weight": 10.0, + "description": "graph reasoning uses the pagerank algorithm to compute the entity importance vector", + "source_ids": [ + 106 + ] + }, + { + "src_entity_name": "pagerank algorithm", + "tgt_entity_name": "entity importance vector", + "relation_name": "", + "weight": 10.0, + "description": "the pagerank algorithm computes the entity importance vector", + "source_ids": [ + 106 + ] + }, + { + "src_entity_name": "entity importance vector", + "tgt_entity_name": "gt link matrix", + "relation_name": "", + "weight": 9.0, + "description": "the entity importance vector is mapped to tree nodes via the gt link matrix", + "source_ids": [ + 106 + ] + }, + { + "src_entity_name": "gt link matrix", + "tgt_entity_name": "tree node importance scores vector", + "relation_name": "", + "weight": 9.0, + "description": "the gt link matrix is used to derive the tree node importance scores vector", + "source_ids": [ + 106 + ] + }, + { + "src_entity_name": "subgraph", + "tgt_entity_name": "selected nodes", + "relation_name": "", + "weight": 10.0, + "description": "the subgraph is 
extracted from selected nodes", + "source_ids": [ + 106 + ] + } + ], + "node_idx": 106 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_107.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_107.json new file mode 100644 index 0000000..b72d060 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_107.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (6)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the PageRank of a graph G with respect to a vector e'. LaTeX: 𝐼 𝐺 = PageRank ( 𝐺 , 𝑒 ' ) (6)", + "source_ids": [ + 107 + ] + } + ], + "relations": [], + "node_idx": 107 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_108.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_108.json new file mode 100644 index 0000000..188f06a --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_108.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (7)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the product of S and G as equal to the product of I, G, and M. 
LaTeX: 𝑆 𝐺 = 𝐼 𝐺 × 𝑀 (7)", + "source_ids": [ + 108 + ] + } + ], + "relations": [], + "node_idx": 108 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_109.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_109.json new file mode 100644 index 0000000..64dfab0 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_109.json @@ -0,0 +1,209 @@ +{ + "entities": [ + { + "entity_name": "text ranker", + "entity_type": "SOFTWARE", + "description": "text ranker is a system that evaluates the semantic relevance of a tree node s content to a query", + "source_ids": [ + 109 + ] + }, + { + "entity_name": "skyline ranker", + "entity_type": "SOFTWARE", + "description": "skyline ranker is a system that employs the skyline operator to filter nodes based on multiple criteria", + "source_ids": [ + 109 + ] + }, + { + "entity_name": "skyline operator", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the skyline operator is a method used by skyline ranker to filter nodes based on scoring dimensions", + "source_ids": [ + 109 + ] + }, + { + "entity_name": "query", + "entity_type": "TASK_OR_PROBLEM", + "description": "the query is the input for which semantic relevance is evaluated by text ranker", + "source_ids": [ + 109 + ] + }, + { + "entity_name": "relevance score", + "entity_type": "EVALUATION_METRIC", + "description": "the relevance score is a metric assigned to each node to indicate its semantic relevance to the query", + "source_ids": [ + 109 + ] + }, + { + "entity_name": "tree node", + "entity_type": "TASK_OR_PROBLEM", + "description": "the tree node is the content unit being evaluated for relevance and filtered based on scoring dimensions", + "source_ids": [ + 109 + ] + }, + { + "entity_name": "nodes", + "entity_type": "TASK_OR_PROBLEM", + "description": "nodes are the data elements being evaluated for relevance and filtered by the ranking systems", + "source_ids": [ + 109 + ] + }, + { + "entity_name": 
"scoring dimensions", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "scoring dimensions are the specified criteria used to determine if nodes are dominated by others", + "source_ids": [ + 109 + ] + }, + { + "entity_name": "", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 109 + ] + } + ], + "relations": [ + { + "src_entity_name": "text ranker", + "tgt_entity_name": "query", + "relation_name": "", + "weight": 9.0, + "description": "text ranker evaluates the relevance of content specifically to the query", + "source_ids": [ + 109 + ] + }, + { + "src_entity_name": "text ranker", + "tgt_entity_name": "relevance score", + "relation_name": "", + "weight": 10.0, + "description": "text ranker assigns a relevance score to each tree node", + "source_ids": [ + 109 + ] + }, + { + "src_entity_name": "text ranker", + "tgt_entity_name": "tree node", + "relation_name": "", + "weight": 9.0, + "description": "text ranker evaluates the content of the tree node", + "source_ids": [ + 109 + ] + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "skyline operator", + "relation_name": "", + "weight": 10.0, + "description": "skyline ranker employs the skyline operator to perform its filtering function", + "source_ids": [ + 109 + ] + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "tree node", + "relation_name": "", + "weight": 9.0, + "description": "skyline ranker filters tree nodes based on scoring dimensions", + "source_ids": [ + 109 + ] + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "relevance score", + "relation_name": "", + "weight": 8.0, + "description": "skyline ranker uses relevance scores along with others to filter nodes", + "source_ids": [ + 109 + ] + }, + { + "src_entity_name": "skyline operator", + "tgt_entity_name": "tree node", + "relation_name": "", + "weight": 8.0, + "description": "the skyline operator is used to filter tree nodes", + "source_ids": [ + 109 + ] + }, + { + 
"src_entity_name": "text ranker", + "tgt_entity_name": "", + "relation_name": "", + "weight": 9.0, + "description": "text ranker uses the query to evaluate semantic relevance", + "source_ids": [ + 109 + ] + }, + { + "src_entity_name": "text ranker", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 9.0, + "description": "text ranker evaluates the content of the nodes", + "source_ids": [ + 109 + ] + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "", + "relation_name": "", + "weight": 9.0, + "description": "skyline ranker uses the criterion to filter nodes", + "source_ids": [ + 109 + ] + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 10.0, + "description": "skyline ranker filters the nodes based on the specified scoring dimensions", + "source_ids": [ + 109 + ] + }, + { + "src_entity_name": "skyline operator", + "tgt_entity_name": "", + "relation_name": "", + "weight": 8.0, + "description": "the skyline operator utilizes as a scoring dimension", + "source_ids": [ + 109 + ] + }, + { + "src_entity_name": "skyline operator", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 9.0, + "description": "the skyline operator filters the nodes", + "source_ids": [ + 109 + ] + } + ], + "node_idx": 109 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_11.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_11.json new file mode 100644 index 0000000..7cb4b93 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_11.json @@ -0,0 +1,87 @@ +{ + "entities": [ + { + "entity_name": "proceedings of the vldb endowment", + "entity_type": "PUBLICATION_VENUE", + "description": "proceedings of the vldb endowment is the name of the publication venue mentioned in the text", + "source_ids": [ + 11 + ] + }, + { + "entity_name": "vol 19", + "entity_type": "MEASUREMENT", + "description": "vol 19 refers to the volume number of 
the publication", + "source_ids": [ + 11 + ] + }, + { + "entity_name": "no 1", + "entity_type": "MEASUREMENT", + "description": "no 1 refers to the issue number of the publication", + "source_ids": [ + 11 + ] + }, + { + "entity_name": "issn 2150 8097", + "entity_type": "MEASUREMENT", + "description": "issn 2150 8097 is the international standard serial number assigned to the publication", + "source_ids": [ + 11 + ] + }, + { + "entity_name": "doi xx xx xxx xx", + "entity_type": "MEASUREMENT", + "description": "doi xx xx xxx xx is the digital object identifier assigned to the document", + "source_ids": [ + 11 + ] + } + ], + "relations": [ + { + "src_entity_name": "proceedings of the vldb endowment", + "tgt_entity_name": "vol 19", + "relation_name": "", + "weight": 9.0, + "description": "vol 19 is the volume associated with the proceedings of the vldb endowment", + "source_ids": [ + 11 + ] + }, + { + "src_entity_name": "proceedings of the vldb endowment", + "tgt_entity_name": "no 1", + "relation_name": "", + "weight": 9.0, + "description": "no 1 is the issue number associated with the proceedings of the vldb endowment", + "source_ids": [ + 11 + ] + }, + { + "src_entity_name": "proceedings of the vldb endowment", + "tgt_entity_name": "issn 2150 8097", + "relation_name": "", + "weight": 10.0, + "description": "issn 2150 8097 is the identifier for the proceedings of the vldb endowment", + "source_ids": [ + 11 + ] + }, + { + "src_entity_name": "proceedings of the vldb endowment", + "tgt_entity_name": "doi xx xx xxx xx", + "relation_name": "", + "weight": 9.0, + "description": "doi xx xx xxx xx is the identifier for the specific article within the proceedings of the vldb endowment", + "source_ids": [ + 11 + ] + } + ], + "node_idx": 11 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_110.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_110.json new file mode 100644 index 0000000..064b720 --- /dev/null +++ 
b/e2e_test_output/kg_extractor_res/kg_extractor_res_110.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "7", + "entity_type": "NUMBER", + "description": "7 is a number mentioned in the text", + "source_ids": [ + 110 + ] + } + ], + "relations": [], + "node_idx": 110 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_111.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_111.json new file mode 100644 index 0000000..e793697 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_111.json @@ -0,0 +1,215 @@ +{ + "entities": [ + { + "entity_name": "synthesizer", + "entity_type": "TASK_OR_PROBLEM", + "description": "synthesizer is described as an operator responsible for content generation", + "source_ids": [ + 111 + ] + }, + { + "entity_name": "map", + "entity_type": "TASK_OR_PROBLEM", + "description": "map is an operator that performs analysis on specific retrieved information segments to generate partial responses", + "source_ids": [ + 111 + ] + }, + { + "entity_name": "reduce", + "entity_type": "TASK_OR_PROBLEM", + "description": "reduce is an operator that synthesizes a final coherent answer by aggregating information from multiple sources", + "source_ids": [ + 111 + ] + }, + { + "entity_name": "content generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "content generation is the primary responsibility of the synthesizer operators mentioned in the text", + "source_ids": [ + 111 + ] + }, + { + "entity_name": "analysis", + "entity_type": "TASK_OR_PROBLEM", + "description": "analysis is the specific action performed by the map operator on retrieved information segments", + "source_ids": [ + 111 + ] + }, + { + "entity_name": "partial responses", + "entity_type": "PRODUCT", + "description": "partial responses are the output generated by the map operator from specific retrieved information segments", + "source_ids": [ + 111 + ] + }, + { + "entity_name": "final coherent answer", + 
"entity_type": "PRODUCT", + "description": "a final coherent answer is the result synthesized by the reduce operator by aggregating information from multiple sources", + "source_ids": [ + 111 + ] + }, + { + "entity_name": "retrieved information segments", + "entity_type": "DATASET_OR_CORPUS", + "description": "retrieved information segments are the specific data parts that the map operator analyzes", + "source_ids": [ + 111 + ] + }, + { + "entity_name": "multiple sources", + "entity_type": "DATASET_OR_CORPUS", + "description": "multiple sources refer to the various origins of information such as partial answers or retrieved evidence that the reduce operator aggregates", + "source_ids": [ + 111 + ] + }, + { + "entity_name": "partial answers", + "entity_type": "PRODUCT", + "description": "partial answers are one of the types of information collected from multiple sources by the reduce operator", + "source_ids": [ + 111 + ] + }, + { + "entity_name": "retrieved evidence", + "entity_type": "DATASET_OR_CORPUS", + "description": "retrieved evidence is one of the types of information collected from multiple sources by the reduce operator", + "source_ids": [ + 111 + ] + } + ], + "relations": [ + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "map", + "relation_name": "", + "weight": 8.0, + "description": "map is a specific type of operator within the broader category of synthesizer operators responsible for content generation", + "source_ids": [ + 111 + ] + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "reduce", + "relation_name": "", + "weight": 8.0, + "description": "reduce is a specific type of operator within the broader category of synthesizer operators responsible for content generation", + "source_ids": [ + 111 + ] + }, + { + "src_entity_name": "map", + "tgt_entity_name": "reduce", + "relation_name": "", + "weight": 9.0, + "description": "map and reduce are sequential or related steps in the process of generating a final coherent answer 
with map generating partial responses and reduce aggregating them", + "source_ids": [ + 111 + ] + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "content generation", + "relation_name": "", + "weight": 10.0, + "description": "synthesizer operators are responsible for the task of content generation", + "source_ids": [ + 111 + ] + }, + { + "src_entity_name": "map", + "tgt_entity_name": "analysis", + "relation_name": "", + "weight": 10.0, + "description": "the map operator performs the task of analysis on retrieved information segments", + "source_ids": [ + 111 + ] + }, + { + "src_entity_name": "map", + "tgt_entity_name": "partial responses", + "relation_name": "", + "weight": 10.0, + "description": "map generates partial responses as its output", + "source_ids": [ + 111 + ] + }, + { + "src_entity_name": "map", + "tgt_entity_name": "retrieved information segments", + "relation_name": "", + "weight": 10.0, + "description": "map performs analysis specifically on retrieved information segments", + "source_ids": [ + 111 + ] + }, + { + "src_entity_name": "reduce", + "tgt_entity_name": "final coherent answer", + "relation_name": "", + "weight": 10.0, + "description": "reduce synthesizes a final coherent answer as its output", + "source_ids": [ + 111 + ] + }, + { + "src_entity_name": "reduce", + "tgt_entity_name": "multiple sources", + "relation_name": "", + "weight": 10.0, + "description": "reduce aggregates information from multiple sources to create its output", + "source_ids": [ + 111 + ] + }, + { + "src_entity_name": "reduce", + "tgt_entity_name": "partial answers", + "relation_name": "", + "weight": 9.0, + "description": "reduce aggregates partial answers as part of its synthesis process", + "source_ids": [ + 111 + ] + }, + { + "src_entity_name": "reduce", + "tgt_entity_name": "retrieved evidence", + "relation_name": "", + "weight": 9.0, + "description": "reduce aggregates retrieved evidence as part of its synthesis process", + "source_ids": [ + 111 + 
] + }, + { + "src_entity_name": "partial responses", + "tgt_entity_name": "final coherent answer", + "relation_name": "", + "weight": 9.0, + "description": "partial responses generated by map are aggregated by reduce to form the final coherent answer", + "source_ids": [ + 111 + ] + } + ], + "node_idx": 111 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_112.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_112.json new file mode 100644 index 0000000..4173e4d --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_112.json @@ -0,0 +1,225 @@ +{ + "entities": [ + { + "entity_name": "operator plan", + "entity_type": "TASK_OR_PROBLEM", + "description": "operator plan is the final task of an agent to generate an executable plan after classifying a query", + "source_ids": [ + 112 + ] + }, + { + "entity_name": "agent", + "entity_type": "PERSON", + "description": "the agent is an entity that classifies queries and generates executable plans based on the classification", + "source_ids": [ + 112 + ] + }, + { + "entity_name": "query", + "entity_type": "TASK_OR_PROBLEM", + "description": "the query is the input that the agent classifies into a category to generate a plan", + "source_ids": [ + 112 + ] + }, + { + "entity_name": "category", + "entity_type": "TASK_OR_PROBLEM", + "description": "the category is the classification result of the query used by the agent", + "source_ids": [ + 112 + ] + }, + { + "entity_name": "library", + "entity_type": "ORGANIZATION", + "description": "the library is a collection of operators from which the agent selects a sequence", + "source_ids": [ + 112 + ] + }, + { + "entity_name": "operators", + "entity_type": "TASK_OR_PROBLEM", + "description": "operators are the specific sequence elements selected from the library to form the plan", + "source_ids": [ + 112 + ] + }, + { + "entity_name": "parameters", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "parameters are 
dynamically instantiated based on the query to configure the operators", + "source_ids": [ + 112 + ] + }, + { + "entity_name": "1", + "entity_type": "TASK_OR_PROBLEM", + "description": "1 represents the specific sequence of operators selected for the plan", + "source_ids": [ + 112 + ] + }, + { + "entity_name": "agent plan", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "agent plan is the specific formulation or function used to generate the plan from the query category and library", + "source_ids": [ + 112 + ] + }, + { + "entity_name": "equation 8", + "entity_type": "EQUATION_OR_FORMULA", + "description": "equation 8 is the mathematical formulation agent plan describing the plan generation process", + "source_ids": [ + 112 + ] + }, + { + "entity_name": "", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 112 + ] + } + ], + "relations": [ + { + "src_entity_name": "agent", + "tgt_entity_name": "operator plan", + "relation_name": "", + "weight": 10.0, + "description": "the agent s final task is to generate the operator plan", + "source_ids": [ + 112 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "query", + "relation_name": "", + "weight": 9.0, + "description": "the agent classifies the query into a category", + "source_ids": [ + 112 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "category", + "relation_name": "", + "weight": 9.0, + "description": "the agent uses the category derived from the query to generate the plan", + "source_ids": [ + 112 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "library", + "relation_name": "", + "weight": 8.0, + "description": "the agent selects operators from the library to form the plan", + "source_ids": [ + 112 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "operators", + "relation_name": "", + "weight": 9.0, + "description": "the agent selects a specific sequence of operators to create the plan", + "source_ids": [ + 112 + ] + }, + { + 
"src_entity_name": "query", + "tgt_entity_name": "parameters", + "relation_name": "", + "weight": 8.0, + "description": "parameters are dynamically instantiated based on the query", + "source_ids": [ + 112 + ] + }, + { + "src_entity_name": "operators", + "tgt_entity_name": "parameters", + "relation_name": "", + "weight": 8.0, + "description": "operators are configured with parameters dynamically instantiated based on the query", + "source_ids": [ + 112 + ] + }, + { + "src_entity_name": "", + "tgt_entity_name": "", + "relation_name": "", + "weight": 10.0, + "description": "the query is classified into the category", + "source_ids": [ + 112 + ] + }, + { + "src_entity_name": "", + "tgt_entity_name": "1", + "relation_name": "", + "weight": 10.0, + "description": "the sequence 1 is selected from the library", + "source_ids": [ + 112 + ] + }, + { + "src_entity_name": "agent plan", + "tgt_entity_name": "", + "relation_name": "", + "weight": 10.0, + "description": "the agent plan method defines the generation of the plan", + "source_ids": [ + 112 + ] + }, + { + "src_entity_name": "equation 8", + "tgt_entity_name": "", + "relation_name": "", + "weight": 10.0, + "description": "equation 8 defines the variable", + "source_ids": [ + 112 + ] + }, + { + "src_entity_name": "equation 8", + "tgt_entity_name": "agent plan", + "relation_name": "", + "weight": 10.0, + "description": "equation 8 utilizes the agent plan function", + "source_ids": [ + 112 + ] + }, + { + "src_entity_name": "parameters", + "tgt_entity_name": "operators", + "relation_name": "", + "weight": 8.0, + "description": "parameters are dynamically instantiated for the operators", + "source_ids": [ + 112 + ] + } + ], + "node_idx": 112 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_113.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_113.json new file mode 100644 index 0000000..b177c3c --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_113.json 
@@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (8)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the variable P as a function of Agent Plan with inputs q, c, and O. LaTeX: 𝑃 = Agent Plan ( 𝑞, 𝑐, O) (8)", + "source_ids": [ + 113 + ] + } + ], + "relations": [], + "node_idx": 113 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_114.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_114.json new file mode 100644 index 0000000..51854d7 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_114.json @@ -0,0 +1,51 @@ +{ + "entities": [ + { + "entity_name": "the plan", + "entity_type": "TASK_OR_PROBLEM", + "description": "the plan is a structured workflow tailored to each category", + "source_ids": [ + 114 + ] + }, + { + "entity_name": "workflow", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the workflow is a structured process followed by the plan", + "source_ids": [ + 114 + ] + }, + { + "entity_name": "category", + "entity_type": "CONCEPT", + "description": "category refers to the classifications to which the plan s workflow is tailored", + "source_ids": [ + 114 + ] + } + ], + "relations": [ + { + "src_entity_name": "the plan", + "tgt_entity_name": "workflow", + "relation_name": "", + "weight": 9.0, + "description": "the plan follows a structured workflow", + "source_ids": [ + 114 + ] + }, + { + "src_entity_name": "the plan", + "tgt_entity_name": "category", + "relation_name": "", + "weight": 8.0, + "description": "the plan s workflow is tailored to each category", + "source_ids": [ + 114 + ] + } + ], + "node_idx": 114 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_115.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_115.json new file mode 100644 index 0000000..006f7f1 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_115.json @@ -0,0 +1,171 @@ +{ + "entities": [ + { 
+ "entity_name": "single hop", + "entity_type": "TASK_OR_PROBLEM", + "description": "single hop is a task where an agent first attempts to extract an entity", + "source_ids": [ + 115 + ] + }, + { + "entity_name": "agent", + "entity_type": "PERSON", + "description": "the agent is an entity that attempts to extract an entity and executes selection strategies", + "source_ids": [ + 115 + ] + }, + { + "entity_name": "scent based", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "scent based is a selection strategy used by the agent if entity extraction is successful", + "source_ids": [ + 115 + ] + }, + { + "entity_name": "section based", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "section based is a fallback strategy used by the agent if entity extraction fails", + "source_ids": [ + 115 + ] + }, + { + "entity_name": "standard reasoning", + "entity_type": "TASK_OR_PROBLEM", + "description": "standard reasoning is a process that both selection paths proceed to", + "source_ids": [ + 115 + ] + }, + { + "entity_name": "generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "generation is a process that both selection paths proceed to", + "source_ids": [ + 115 + ] + }, + { + "entity_name": "p std", + "entity_type": "EQUATION_OR_FORMULA", + "description": "p std denotes the standard reasoning and generation process", + "source_ids": [ + 115 + ] + }, + { + "entity_name": "entity", + "entity_type": "TASK_OR_PROBLEM", + "description": "entity is the object that the agent attempts to extract in the single hop process", + "source_ids": [ + 115 + ] + } + ], + "relations": [ + { + "src_entity_name": "agent", + "tgt_entity_name": "single hop", + "relation_name": "", + "weight": 9.0, + "description": "the agent performs the single hop task by attempting to extract an entity", + "source_ids": [ + 115 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "scent based", + "relation_name": "", + "weight": 8.0, + "description": "the agent executes 
the scent based selection strategy if entity extraction is successful", + "source_ids": [ + 115 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "section based", + "relation_name": "", + "weight": 8.0, + "description": "the agent falls back to the section based strategy if entity extraction fails", + "source_ids": [ + 115 + ] + }, + { + "src_entity_name": "scent based", + "tgt_entity_name": "standard reasoning", + "relation_name": "", + "weight": 7.0, + "description": "the scent based path proceeds to standard reasoning and generation", + "source_ids": [ + 115 + ] + }, + { + "src_entity_name": "section based", + "tgt_entity_name": "standard reasoning", + "relation_name": "", + "weight": 7.0, + "description": "the section based path proceeds to standard reasoning and generation", + "source_ids": [ + 115 + ] + }, + { + "src_entity_name": "standard reasoning", + "tgt_entity_name": "generation", + "relation_name": "", + "weight": 8.0, + "description": "standard reasoning and generation are linked processes denoted as p std", + "source_ids": [ + 115 + ] + }, + { + "src_entity_name": "scent based", + "tgt_entity_name": "generation", + "relation_name": "", + "weight": 7.0, + "description": "the scent based path proceeds to generation as part of p std", + "source_ids": [ + 115 + ] + }, + { + "src_entity_name": "section based", + "tgt_entity_name": "generation", + "relation_name": "", + "weight": 7.0, + "description": "the section based path proceeds to generation as part of p std", + "source_ids": [ + 115 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "entity", + "relation_name": "", + "weight": 9.0, + "description": "the agent attempts to extract the entity as the first step of the single hop process", + "source_ids": [ + 115 + ] + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "entity", + "relation_name": "", + "weight": 8.0, + "description": "the single hop task involves the extraction of an entity", + "source_ids": [ + 
115 + ] + } + ], + "node_idx": 115 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_116.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_116.json new file mode 100644 index 0000000..26759db --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_116.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (9)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the variable P_s based on extraction success or failure conditions. LaTeX: 𝑃 s = ( Extract success - - - - -→ Select_by_Entity → 𝑃 std Extract fail - -→ Select_by_Section → 𝑃 std (9)", + "source_ids": [ + 116 + ] + } + ], + "relations": [], + "node_idx": 116 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_117.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_117.json new file mode 100644 index 0000000..10ea4da --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_117.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (10)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the standard probability P as a process involving graph and text inputs leading to a skyline reduction. 
LaTeX: 𝑃 std = ( Graph ∥ Text ) → Skyline → Reduce (10)", + "source_ids": [ + 117 + ] + } + ], + "relations": [], + "node_idx": 117 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_118.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_118.json new file mode 100644 index 0000000..8c9e74f --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_118.json @@ -0,0 +1,69 @@ +{ + "entities": [ + { + "entity_name": "single hop workflow", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "single hop workflow is a method denoted as ps used to solve sub problems", + "source_ids": [ + 118 + ] + }, + { + "entity_name": "ps", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "ps is the specific notation or identifier for the single hop workflow applied to sub problems", + "source_ids": [ + 118 + ] + }, + { + "entity_name": "agent", + "entity_type": "PERSON", + "description": "the agent is the entity performing the decomposition of the problem and the synthesis of results", + "source_ids": [ + 118 + ] + }, + { + "entity_name": "complex", + "entity_type": "TASK_OR_PROBLEM", + "description": "complex refers to the problem being decomposed by the agent", + "source_ids": [ + 118 + ] + } + ], + "relations": [ + { + "src_entity_name": "single hop workflow", + "tgt_entity_name": "ps", + "relation_name": "", + "weight": 10.0, + "description": "the single hop workflow is identified by the notation ps in the text", + "source_ids": [ + 118 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "complex", + "relation_name": "", + "weight": 9.0, + "description": "the agent decomposes the complex problem into sub problems", + "source_ids": [ + 118 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "single hop workflow", + "relation_name": "", + "weight": 10.0, + "description": "the agent applies the single hop workflow to each sub problem", + "source_ids": [ + 118 + ] + } + ], + "node_idx": 
118 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_119.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_119.json new file mode 100644 index 0000000..dc7f650 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_119.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (11)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation describing a decomposition process involving mapping and reduction. LaTeX: 𝑃 complex = Decompose → 𝑃 s → Map → Reduce (11)", + "source_ids": [ + 119 + ] + } + ], + "relations": [], + "node_idx": 119 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_12.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_12.json new file mode 100644 index 0000000..cb7ae98 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_12.json @@ -0,0 +1,79 @@ +{ + "entities": [ + { + "entity_name": "figure 1", + "entity_type": "IMAGE", + "description": "figure 1 is an image that presents a comparison of existing methods and bookrag for complex document qa", + "source_ids": [ + 12 + ] + }, + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a product or method being compared against existing methods for complex document qa", + "source_ids": [ + 12 + ] + }, + { + "entity_name": "existing methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "existing methods refers to current techniques used for complex document qa which are being compared to bookrag", + "source_ids": [ + 12 + ] + }, + { + "entity_name": "complex document qa", + "entity_type": "TASK_OR_PROBLEM", + "description": "complex document qa is the specific task or problem domain where the comparison between methods and bookrag is taking place", + "source_ids": [ + 12 + ] + } + ], + "relations": [ + { + "src_entity_name": "figure 1", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 
9.0, + "description": "figure 1 displays a comparison involving bookrag", + "source_ids": [ + 12 + ] + }, + { + "src_entity_name": "figure 1", + "tgt_entity_name": "existing methods", + "relation_name": "", + "weight": 9.0, + "description": "figure 1 displays a comparison involving existing methods", + "source_ids": [ + 12 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "complex document qa", + "relation_name": "", + "weight": 8.0, + "description": "bookrag is a solution or method applied to the task of complex document qa", + "source_ids": [ + 12 + ] + }, + { + "src_entity_name": "existing methods", + "tgt_entity_name": "complex document qa", + "relation_name": "", + "weight": 8.0, + "description": "existing methods are techniques used for the task of complex document qa", + "source_ids": [ + 12 + ] + } + ], + "node_idx": 12 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_120.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_120.json new file mode 100644 index 0000000..5c95d0f --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_120.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "global aggregation", + "entity_type": "TASK_OR_PROBLEM", + "description": "global aggregation is a workflow involving a sequence of filters followed by synthesis", + "source_ids": [ + 120 + ] + } + ], + "relations": [], + "node_idx": 120 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_121.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_121.json new file mode 100644 index 0000000..b7dbfeb --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_121.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (12)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the global probability P as a composition of filtering and mapping operations. 
LaTeX: 𝑃 global = GLYPH<214> ( Filter_Modal | Filter_Range ) → Map → Reduce (12)", + "source_ids": [ + 121 + ] + } + ], + "relations": [], + "node_idx": 121 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_122.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_122.json new file mode 100644 index 0000000..f97330e --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_122.json @@ -0,0 +1,99 @@ +{ + "entities": [ + { + "entity_name": "modal filter", + "entity_type": "TECHNOLOGY", + "description": "modal filter is a type of filter applied at each step of the nested composition", + "source_ids": [ + 122 + ] + }, + { + "entity_name": "range filter", + "entity_type": "TECHNOLOGY", + "description": "range filter is a type of filter applied at each step of the nested composition", + "source_ids": [ + 122 + ] + }, + { + "entity_name": "nested composition", + "entity_type": "TASK_OR_PROBLEM", + "description": "nested composition refers to the process of applying filters at each step", + "source_ids": [ + 122 + ] + }, + { + "entity_name": "", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 122 + ] + } + ], + "relations": [ + { + "src_entity_name": "", + "tgt_entity_name": "", + "relation_name": "", + "weight": 10.0, + "description": "the symbol denotes the nested composition of filters applying either a modal or range filter at each step", + "source_ids": [ + 122 + ] + }, + { + "src_entity_name": "", + "tgt_entity_name": "modal filter", + "relation_name": "", + "weight": 9.0, + "description": "the symbol denotes the application of a modal filter at each step", + "source_ids": [ + 122 + ] + }, + { + "src_entity_name": "", + "tgt_entity_name": "range filter", + "relation_name": "", + "weight": 9.0, + "description": "the symbol denotes the application of a range filter at each step", + "source_ids": [ + 122 + ] + }, + { + "src_entity_name": "", + "tgt_entity_name": "nested composition", + 
"relation_name": "", + "weight": 10.0, + "description": "the symbol represents the nested composition of filters", + "source_ids": [ + 122 + ] + }, + { + "src_entity_name": "modal filter", + "tgt_entity_name": "nested composition", + "relation_name": "", + "weight": 8.0, + "description": "modal filters are applied as part of the nested composition process", + "source_ids": [ + 122 + ] + }, + { + "src_entity_name": "range filter", + "tgt_entity_name": "nested composition", + "relation_name": "", + "weight": 8.0, + "description": "range filters are applied as part of the nested composition process", + "source_ids": [ + 122 + ] + } + ], + "node_idx": 122 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_123.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_123.json new file mode 100644 index 0000000..035b8a3 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_123.json @@ -0,0 +1,69 @@ +{ + "entities": [ + { + "entity_name": "5.3 structured execution", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Agent-Based Retrieval', this section details the retrieval process within the BookRAG framework, specifically focusing on operations executed under the principles of In-Context Few-Shot Training (IFT) and generation.", + "source_ids": [ + 123 + ] + }, + { + "entity_name": "retrieval process", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Refers to the specific mechanism for retrieving information from the BookIndex as described in section 5.3.", + "source_ids": [ + 123 + ] + }, + { + "entity_name": "ift principles", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Refers to the In-Context Few-Shot Training principles that guide the execution logic detailed in section 5.3.", + "source_ids": [ + 123 + ] + }, + { + "entity_name": "generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "Refers to the generative component integrated into the structured 
execution workflow mentioned in section 5.3.", + "source_ids": [ + 123 + ] + } + ], + "relations": [ + { + "src_entity_name": "retrieval process", + "tgt_entity_name": "5.3 structured execution", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Retrieval Process' is a primary topic of section 5.3.", + "source_ids": [ + 123 + ] + }, + { + "src_entity_name": "ift principles", + "tgt_entity_name": "5.3 structured execution", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'IFT Principles' is a primary topic of section 5.3.", + "source_ids": [ + 123 + ] + }, + { + "src_entity_name": "generation", + "tgt_entity_name": "5.3 structured execution", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Generation' is a primary topic of section 5.3.", + "source_ids": [ + 123 + ] + } + ], + "node_idx": 123 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_124.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_124.json new file mode 100644 index 0000000..3e99acf --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_124.json @@ -0,0 +1,391 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "SOFTWARE", + "description": "bookrag is a system that executes a generated workflow and embodies cognitive principles of information foraging theory", + "source_ids": [ + 124 + ] + }, + { + "entity_name": "information foraging theory", + "entity_type": "SCIENTIFIC_THEORY", + "description": "information foraging theory ift is the cognitive principle embodied by bookrag s execution phase", + "source_ids": [ + 124 + ] + }, + { + "entity_name": "selector", + "entity_type": "SOFTWARE", + "description": "selector is an operator in bookrag that navigates to information patches", + "source_ids": [ + 124 + ] + }, + { + "entity_name": "reasoner", + "entity_type": "SOFTWARE", + "description": "reasoner is an operator in bookrag that performs 
sensemaking within information patches", + "source_ids": [ + 124 + ] + }, + { + "entity_name": "synthesizer", + "entity_type": "SOFTWARE", + "description": "synthesizer is an operator in bookrag that generates the final answer based on processed evidence", + "source_ids": [ + 124 + ] + }, + { + "entity_name": "workflow", + "entity_type": "TASK_OR_PROBLEM", + "description": "workflow is the generated sequence of operations executed by bookrag", + "source_ids": [ + 124 + ] + }, + { + "entity_name": "p", + "entity_type": "TASK_OR_PROBLEM", + "description": "p represents the specific generated workflow executed by bookrag", + "source_ids": [ + 124 + ] + }, + { + "entity_name": "abstract textual queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "abstract textual queries are the input that bookrag translates into concrete operations", + "source_ids": [ + 124 + ] + }, + { + "entity_name": "concrete operations", + "entity_type": "TASK_OR_PROBLEM", + "description": "concrete operations are the result of translating abstract textual queries within bookrag", + "source_ids": [ + 124 + ] + }, + { + "entity_name": "information patches", + "entity_type": "TASK_OR_PROBLEM", + "description": "information patches are specific scopes within the document space that the selector navigates to", + "source_ids": [ + 124 + ] + }, + { + "entity_name": "document space", + "entity_type": "TASK_OR_PROBLEM", + "description": "document space is the vast area of documents that is narrowed down by the selector", + "source_ids": [ + 124 + ] + }, + { + "entity_name": "relevant scopes", + "entity_type": "TASK_OR_PROBLEM", + "description": "relevant scopes are the focused areas within the document space identified by the selector", + "source_ids": [ + 124 + ] + }, + { + "entity_name": "sensemaking", + "entity_type": "TASK_OR_PROBLEM", + "description": "sensemaking is the process performed by the reasoner to analyze and refine information", + "source_ids": [ + 124 + ] + }, + { + 
"entity_name": "processed evidence", + "entity_type": "TASK_OR_PROBLEM", + "description": "processed evidence is the refined information used by the synthesizer to generate the answer", + "source_ids": [ + 124 + ] + }, + { + "entity_name": "answer", + "entity_type": "TASK_OR_PROBLEM", + "description": "answer is the final output generated by the synthesizer", + "source_ids": [ + 124 + ] + }, + { + "entity_name": "cost of attention", + "entity_type": "TASK_OR_PROBLEM", + "description": "cost of attention is a metric minimized by bookrag s design to focus computational resources", + "source_ids": [ + 124 + ] + }, + { + "entity_name": "computational resources", + "entity_type": "TASK_OR_PROBLEM", + "description": "computational resources are the assets focused by bookrag on high value data patches", + "source_ids": [ + 124 + ] + }, + { + "entity_name": "high value data patches", + "entity_type": "TASK_OR_PROBLEM", + "description": "high value data patches are the specific data areas where bookrag focuses its computational resources", + "source_ids": [ + 124 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "information foraging theory", + "relation_name": "", + "weight": 10.0, + "description": "bookrag embodies the cognitive principles of information foraging theory during its execution phase", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "selector", + "relation_name": "", + "weight": 9.0, + "description": "bookrag utilizes the selector operator to navigate to information patches", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 9.0, + "description": "bookrag utilizes the reasoner operator to perform sensemaking within information patches", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "synthesizer", + "relation_name": "", + "weight": 9.0, + "description": 
"bookrag utilizes the synthesizer operator to generate the final answer", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 8.0, + "description": "the selector operator narrows the document space which is subsequently analyzed by the reasoner operator", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "reasoner", + "tgt_entity_name": "synthesizer", + "relation_name": "", + "weight": 8.0, + "description": "the reasoner operator refines information that is then used by the synthesizer to generate the answer", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "workflow", + "relation_name": "", + "weight": 10.0, + "description": "bookrag executes the generated workflow p", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "p", + "relation_name": "", + "weight": 10.0, + "description": "p is the specific workflow executed by bookrag", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "abstract textual queries", + "relation_name": "", + "weight": 9.0, + "description": "bookrag translates abstract textual queries into concrete operations", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "concrete operations", + "relation_name": "", + "weight": 9.0, + "description": "bookrag produces concrete operations from abstract queries", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "cost of attention", + "relation_name": "", + "weight": 9.0, + "description": "bookrag s design minimizes the cost of attention", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "computational resources", + "relation_name": "", + "weight": 9.0, + "description": "bookrag ensures computational resources are focused on high value data", + "source_ids": [ + 124 + ] + }, + 
{ + "src_entity_name": "bookrag", + "tgt_entity_name": "high value data patches", + "relation_name": "", + "weight": 9.0, + "description": "bookrag focuses computational resources solely on high value data patches", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "information patches", + "relation_name": "", + "weight": 10.0, + "description": "the selector operator navigates to information patches", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "document space", + "relation_name": "", + "weight": 9.0, + "description": "the selector narrows the vast document space down to relevant scopes", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "relevant scopes", + "relation_name": "", + "weight": 9.0, + "description": "the selector narrows the document space down to relevant scopes", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "reasoner", + "tgt_entity_name": "information patches", + "relation_name": "", + "weight": 9.0, + "description": "the reasoner performs sensemaking within the information patches identified by the selector", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "reasoner", + "tgt_entity_name": "processed evidence", + "relation_name": "", + "weight": 9.0, + "description": "the reasoner analyzes and refines information to create processed evidence", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "processed evidence", + "relation_name": "", + "weight": 10.0, + "description": "the synthesizer generates the answer based on the processed evidence", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "answer", + "relation_name": "", + "weight": 10.0, + "description": "the synthesizer generates the answer", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "abstract textual queries", + "tgt_entity_name": "concrete 
operations", + "relation_name": "", + "weight": 8.0, + "description": "abstract textual queries are translated into concrete operations", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "information patches", + "tgt_entity_name": "sensemaking", + "relation_name": "", + "weight": 8.0, + "description": "sensemaking is performed within the information patches", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "processed evidence", + "tgt_entity_name": "answer", + "relation_name": "", + "weight": 8.0, + "description": "processed evidence is used to generate the answer", + "source_ids": [ + 124 + ] + }, + { + "src_entity_name": "computational resources", + "tgt_entity_name": "high value data patches", + "relation_name": "", + "weight": 8.0, + "description": "computational resources are focused on high value data patches", + "source_ids": [ + 124 + ] + } + ], + "node_idx": 124 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_125.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_125.json new file mode 100644 index 0000000..1c43a46 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_125.json @@ -0,0 +1,195 @@ +{ + "entities": [ + { + "entity_name": "scent filter based retrieval", + "entity_type": "TASK_OR_PROBLEM", + "description": "scent filter based retrieval is a process described as the execution that begins by narrowing the scope", + "source_ids": [ + 125 + ] + }, + { + "entity_name": "ift", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "ift is a method or technique with which the execution aligns", + "source_ids": [ + 125 + ] + }, + { + "entity_name": "selector operators", + "entity_type": "SOFTWARE", + "description": "selector operators are components that identify relevant patches by following information scents or applying explicit filter constraints", + "source_ids": [ + 125 + ] + }, + { + "entity_name": "information scents", + "entity_type": "CONCEPT", + 
"description": "information scents are described as cues such as key entities in a question followed by selector operators", + "source_ids": [ + 125 + ] + }, + { + "entity_name": "node set n", + "entity_type": "DATASET_OR_CORPUS", + "description": "node set n represents the full set of nodes that is reduced by the process", + "source_ids": [ + 125 + ] + }, + { + "entity_name": "focused node subset ns", + "entity_type": "DATASET_OR_CORPUS", + "description": "focused node subset ns is the result of the reduction process applied to the full node set n", + "source_ids": [ + 125 + ] + }, + { + "entity_name": "params sel", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "params sel are parameters used in the selector function to define the focused node subset", + "source_ids": [ + 125 + ] + }, + { + "entity_name": "patches", + "entity_type": "PRODUCT", + "description": "patches are relevant units identified by selector operators within the retrieval process", + "source_ids": [ + 125 + ] + }, + { + "entity_name": "question", + "entity_type": "TASK_OR_PROBLEM", + "description": "a question is mentioned as a source of key entities used to identify information scents", + "source_ids": [ + 125 + ] + }, + { + "entity_name": "explicit filter constraints", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "explicit filter constraints are rules applied by selector operators to identify relevant patches", + "source_ids": [ + 125 + ] + }, + { + "entity_name": "equation 13", + "entity_type": "EQUATION_OR_FORMULA", + "description": "equation 13 defines the mathematical relationship for the selector function reducing the node set", + "source_ids": [ + 125 + ] + } + ], + "relations": [ + { + "src_entity_name": "scent filter based retrieval", + "tgt_entity_name": "ift", + "relation_name": "", + "weight": 9.0, + "description": "the scent filter based retrieval process aligns with ift", + "source_ids": [ + 125 + ] + }, + { + "src_entity_name": "scent filter based 
retrieval", + "tgt_entity_name": "selector operators", + "relation_name": "", + "weight": 10.0, + "description": "selector operators are the mechanism used within the scent filter based retrieval process to identify relevant patches", + "source_ids": [ + 125 + ] + }, + { + "src_entity_name": "selector operators", + "tgt_entity_name": "information scents", + "relation_name": "", + "weight": 9.0, + "description": "selector operators identify relevant patches by following information scents", + "source_ids": [ + 125 + ] + }, + { + "src_entity_name": "scent filter based retrieval", + "tgt_entity_name": "node set n", + "relation_name": "", + "weight": 10.0, + "description": "the process reduces the full node set n to a focused subset", + "source_ids": [ + 125 + ] + }, + { + "src_entity_name": "scent filter based retrieval", + "tgt_entity_name": "focused node subset ns", + "relation_name": "", + "weight": 10.0, + "description": "the process results in the creation of the focused node subset ns", + "source_ids": [ + 125 + ] + }, + { + "src_entity_name": "selector operators", + "tgt_entity_name": "params sel", + "relation_name": "", + "weight": 8.0, + "description": "selector operators utilize params sel in their function to reduce the node set", + "source_ids": [ + 125 + ] + }, + { + "src_entity_name": "selector operators", + "tgt_entity_name": "patches", + "relation_name": "", + "weight": 9.0, + "description": "selector operators identify relevant patches", + "source_ids": [ + 125 + ] + }, + { + "src_entity_name": "information scents", + "tgt_entity_name": "question", + "relation_name": "", + "weight": 8.0, + "description": "information scents include key entities found in a question", + "source_ids": [ + 125 + ] + }, + { + "src_entity_name": "selector operators", + "tgt_entity_name": "explicit filter constraints", + "relation_name": "", + "weight": 9.0, + "description": "selector operators apply explicit filter constraints to identify patches", + "source_ids": [ + 125 + 
] + }, + { + "src_entity_name": "scent filter based retrieval", + "tgt_entity_name": "equation 13", + "relation_name": "", + "weight": 10.0, + "description": "equation 13 describes the execution of the scent filter based retrieval process", + "source_ids": [ + 125 + ] + } + ], + "node_idx": 125 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_126.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_126.json new file mode 100644 index 0000000..351e7e3 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_126.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (13)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the variable Ns as a selector function applied to N and parameters. LaTeX: 𝑁 𝑠 = Selector ( 𝑁, params sel ) (13)", + "source_ids": [ + 126 + ] + } + ], + "relations": [], + "node_idx": 126 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_127.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_127.json new file mode 100644 index 0000000..df655fe --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_127.json @@ -0,0 +1,293 @@ +{ + "entities": [ + { + "entity_name": "reasoner operators", + "entity_type": "TASK_OR_PROBLEM", + "description": "reasoner operators are components that evaluate nodes using multiple dimensions such as graph topology and semantic relevance", + "source_ids": [ + 127 + ] + }, + { + "entity_name": "skyline ranker", + "entity_type": "TASK_OR_PROBLEM", + "description": "skyline ranker is a method employed to obtain the final retrieval set by retaining the pareto frontier of nodes", + "source_ids": [ + 127 + ] + }, + { + "entity_name": "skyline operator", + "entity_type": "TASK_OR_PROBLEM", + "description": "the skyline operator is a mechanism that retains valuable nodes in at least one dimension while discarding dominated ones", + "source_ids": [ + 127 + ] + }, + 
{ + "entity_name": "n r", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "n r represents the final retrieval set derived from the skyline ranker process", + "source_ids": [ + 127 + ] + }, + { + "entity_name": "s g n s", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "s g n s is a function or metric used within the skyline ranker equation to evaluate nodes", + "source_ids": [ + 127 + ] + }, + { + "entity_name": "t n", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "t n is a function or metric used within the skyline ranker equation to evaluate nodes", + "source_ids": [ + 127 + ] + }, + { + "entity_name": "n s", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "n s represents the set of nodes from which the final retrieval set is derived", + "source_ids": [ + 127 + ] + }, + { + "entity_name": "equation 14", + "entity_type": "EQUATION_OR_FORMULA", + "description": "equation 14 defines the mathematical relationship for calculating the final retrieval set n r using the skyline ranker", + "source_ids": [ + 127 + ] + }, + { + "entity_name": "graph topology", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "graph topology is a dimension used by reasoner operators to evaluate nodes", + "source_ids": [ + 127 + ] + }, + { + "entity_name": "semantic relevance", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "semantic relevance is a dimension used by reasoner operators to evaluate nodes", + "source_ids": [ + 127 + ] + }, + { + "entity_name": "pareto frontier", + "entity_type": "CONCEPT", + "description": "the pareto frontier is the set of nodes retained by the skyline operator that are valuable in at least one dimension", + "source_ids": [ + 127 + ] + }, + { + "entity_name": "fixed top retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "fixed top retrieval is a method contrasted with the skyline operator for its inability to retain the pareto frontier", + "source_ids": [ + 127 + ] + }, + { + 
"entity_name": "noise", + "entity_type": "CONCEPT", + "description": "noise is a factor minimized by the pre selection process to optimize foraging cost", + "source_ids": [ + 127 + ] + }, + { + "entity_name": "foraging cost", + "entity_type": "MEASUREMENT", + "description": "foraging cost is the metric optimized by minimizing noise and focusing on relevant contexts", + "source_ids": [ + 127 + ] + }, + { + "entity_name": "pre selection", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "pre selection is a process that minimizes noise and ensures reasoning is applied only to highly relevant contexts", + "source_ids": [ + 127 + ] + }, + { + "entity_name": "nodes", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 127 + ] + }, + { + "entity_name": "final retrieval set", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 127 + ] + } + ], + "relations": [ + { + "src_entity_name": "reasoner operators", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 9.0, + "description": "reasoner operators evaluate nodes using multiple dimensions like graph topology and semantic relevance", + "source_ids": [ + 127 + ] + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "final retrieval set", + "relation_name": "", + "weight": 10.0, + "description": "the skyline ranker is employed to generate the final retrieval set", + "source_ids": [ + 127 + ] + }, + { + "src_entity_name": "skyline operator", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 9.0, + "description": "the skyline operator retains nodes that are valuable in at least one dimension and discards dominated ones", + "source_ids": [ + 127 + ] + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "equation 14", + "relation_name": "", + "weight": 10.0, + "description": "equation 14 mathematically defines the operation of the skyline ranker", + "source_ids": [ + 127 + ] + }, + { + "src_entity_name": "n r", + "tgt_entity_name": 
"skyline ranker", + "relation_name": "", + "weight": 10.0, + "description": "n r is the output variable resulting from the skyline ranker operation", + "source_ids": [ + 127 + ] + }, + { + "src_entity_name": "s g n s", + "tgt_entity_name": "skyline ranker", + "relation_name": "", + "weight": 8.0, + "description": "s g n s is an input component used within the skyline ranker equation", + "source_ids": [ + 127 + ] + }, + { + "src_entity_name": "t n", + "tgt_entity_name": "skyline ranker", + "relation_name": "", + "weight": 8.0, + "description": "t n is an input component used within the skyline ranker equation", + "source_ids": [ + 127 + ] + }, + { + "src_entity_name": "n s", + "tgt_entity_name": "skyline ranker", + "relation_name": "", + "weight": 8.0, + "description": "n s is the set of nodes provided as input to the skyline ranker equation", + "source_ids": [ + 127 + ] + }, + { + "src_entity_name": "reasoner operators", + "tgt_entity_name": "graph topology", + "relation_name": "", + "weight": 9.0, + "description": "reasoner operators use graph topology as a dimension for evaluation", + "source_ids": [ + 127 + ] + }, + { + "src_entity_name": "reasoner operators", + "tgt_entity_name": "semantic relevance", + "relation_name": "", + "weight": 9.0, + "description": "reasoner operators use semantic relevance as a dimension for evaluation", + "source_ids": [ + 127 + ] + }, + { + "src_entity_name": "skyline operator", + "tgt_entity_name": "pareto frontier", + "relation_name": "", + "weight": 10.0, + "description": "the skyline operator retains the pareto frontier of nodes", + "source_ids": [ + 127 + ] + }, + { + "src_entity_name": "skyline operator", + "tgt_entity_name": "fixed top retrieval", + "relation_name": "", + "weight": 7.0, + "description": "the skyline operator is contrasted with fixed top retrieval in the text", + "source_ids": [ + 127 + ] + }, + { + "src_entity_name": "pre selection", + "tgt_entity_name": "noise", + "relation_name": "", + "weight": 9.0, + 
"description": "pre selection minimizes noise", + "source_ids": [ + 127 + ] + }, + { + "src_entity_name": "pre selection", + "tgt_entity_name": "foraging cost", + "relation_name": "", + "weight": 8.0, + "description": "pre selection optimizes the foraging cost", + "source_ids": [ + 127 + ] + }, + { + "src_entity_name": "n r", + "tgt_entity_name": "nodes", + "relation_name": "", + "weight": 9.0, + "description": "n r represents the set of retained nodes", + "source_ids": [ + 127 + ] + } + ], + "node_idx": 127 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_128.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_128.json new file mode 100644 index 0000000..0ecbb36 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_128.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (14)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the variable NR as a Skyline Ranker applied to a set of SG and T values. 
LaTeX: 𝑁 𝑅 = Skyline_Ranker ({ 𝑆 𝐺 ( 𝑛 , 𝑆 ) 𝑇 ( 𝑛 ) | 𝑛 ∈ 𝑁 𝑠 }) (14)", + "source_ids": [ + 128 + ] + } + ], + "relations": [], + "node_idx": 128 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_129.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_129.json new file mode 100644 index 0000000..43a820e --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_129.json @@ -0,0 +1,125 @@ +{ + "entities": [ + { + "entity_name": "synthesizer", + "entity_type": "SOFTWARE", + "description": "the synthesizer is an operator that generates a coherent answer by aggregating refined evidence", + "source_ids": [ + 129 + ] + }, + { + "entity_name": "analysis merging generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "analysis merging generation is described as the final stage of a process involving the synthesizer operator", + "source_ids": [ + 129 + ] + }, + { + "entity_name": "q", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "q is a variable representing a query or input used by the synthesizer operator", + "source_ids": [ + 129 + ] + }, + { + "entity_name": "n", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "n is a variable representing refined evidence used by the synthesizer operator", + "source_ids": [ + 129 + ] + }, + { + "entity_name": "a", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "a is the variable representing the coherent answer generated by the synthesizer operator", + "source_ids": [ + 129 + ] + }, + { + "entity_name": "15", + "entity_type": "EQUATION_OR_FORMULA", + "description": "15 is the label or identifier for the equation describing the synthesizer operator s function", + "source_ids": [ + 129 + ] + } + ], + "relations": [ + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "analysis merging generation", + "relation_name": "", + "weight": 9.0, + "description": "the synthesizer operator is the key component used in the final stage 
of analysis merging generation", + "source_ids": [ + 129 + ] + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "q", + "relation_name": "", + "weight": 9.0, + "description": "the synthesizer operator takes q as an input parameter to generate the answer", + "source_ids": [ + 129 + ] + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "n", + "relation_name": "", + "weight": 9.0, + "description": "the synthesizer operator takes n as an input parameter to generate the answer", + "source_ids": [ + 129 + ] + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "a", + "relation_name": "", + "weight": 9.0, + "description": "the synthesizer operator produces a as its output", + "source_ids": [ + 129 + ] + }, + { + "src_entity_name": "a", + "tgt_entity_name": "15", + "relation_name": "", + "weight": 8.0, + "description": "a is the subject of the equation labeled 15", + "source_ids": [ + 129 + ] + }, + { + "src_entity_name": "q", + "tgt_entity_name": "15", + "relation_name": "", + "weight": 8.0, + "description": "q is a component of the equation labeled 15", + "source_ids": [ + 129 + ] + }, + { + "src_entity_name": "n", + "tgt_entity_name": "15", + "relation_name": "", + "weight": 8.0, + "description": "n is a component of the equation labeled 15", + "source_ids": [ + 129 + ] + } + ], + "node_idx": 129 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_13.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_13.json new file mode 100644 index 0000000..942d85a --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_13.json @@ -0,0 +1,375 @@ +{ + "entities": [ + { + "entity_name": "cref='#/texts/14'", + "entity_type": "IMAGE", + "description": "A diagram comparing three RAG (Retrieval-Augmented Generation) architectures: Text-Only RAG, Layout Segmented RAG, and BookRAG.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "complex query", + "entity_type": "TASK_OR_PROBLEM", + 
"description": "The input task represented by a user icon with a question mark, initiating the process.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "complex multi-page document", + "entity_type": "PRODUCT", + "description": "The source document containing multiple pages that serves as the input data.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "text-only rag", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Section (a) of the diagram illustrating a Retrieval-Augmented Generation approach using plain text extraction.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "plain text extraction (ocr)", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The first step in Text-Only RAG where text is extracted from the document images.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "unstructured chunks", + "entity_type": "DATASET_OR_CORPUS", + "description": "The output of OCR processing, representing fragmented text segments without structural context.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "text index (vector/graph/tree)", + "entity_type": "SYSTEM_COMPONENT", + "description": "The indexing structure created to store and organize the unstructured chunks for retrieval.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "fixed/ graph retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The retrieval mechanism used to find relevant information from the index.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "llm", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "Large Language Model depicted as a robot head, which generates the final answer based on retrieved information.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "fails on structural dependencies", + "entity_type": "TASK_OR_PROBLEM", + "description": "A limitation identified in the Text-Only RAG approach regarding its inability to handle complex structures.", + "source_ids": [ + 13 + ] + }, + { + 
"entity_name": "layout segmented rag", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Section (b) of the diagram illustrating a RAG approach that segments content based on layout analysis.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "layout analysis & parsing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The initial step in this section where the document's visual layout is analyzed and parsed.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "flattened chunks", + "entity_type": "DATASET_OR_CORPUS", + "description": "Chunks derived from layout analysis but flattened, losing some hierarchical relationships.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "flattened vector index", + "entity_type": "SYSTEM_COMPONENT", + "description": "An index built upon the flattened chunks.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "fixed retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The retrieval method used in the Layout Segmented RAG pipeline.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "loses complex relationships", + "entity_type": "TASK_OR_PROBLEM", + "description": "A drawback noted for the Layout Segmented RAG approach due to flattening the data.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "bookrag (natively structure-aware)", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Section (c) of the diagram presenting the proposed solution, a structure-aware RAG architecture.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "hierarchical chunks", + "entity_type": "DATASET_OR_CORPUS", + "description": "Chunks that preserve the hierarchical structure of the document.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "bookindex", + "entity_type": "SYSTEM_COMPONENT", + "description": "A graph-based index representing the hierarchical relationships between chunks.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "agent-based retrieval", + "entity_type": 
"METHOD_OR_TECHNIQUE", + "description": "A retrieval strategy utilizing an agent to navigate the BookIndex graph effectively.", + "source_ids": [ + 13 + ] + }, + { + "entity_name": "accurate, structured-grounded", + "entity_type": "EVALUATION_METRIC", + "description": "The positive outcome achieved by the BookRAG system, indicating high accuracy and structural awareness.", + "source_ids": [ + 13 + ] + } + ], + "relations": [ + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "complex query", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Complex Query", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "complex multi-page document", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Complex Multi-page Document", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "text-only rag", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Text-Only RAG", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "plain text extraction (ocr)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Plain Text Extraction (OCR)", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "unstructured chunks", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Unstructured Chunks", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "text index (vector/graph/tree)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Text Index (Vector/Graph/Tree)", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "fixed/ 
graph retrieval", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Fixed/ Graph Retrieval", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to LLM", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "fails on structural dependencies", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Fails on Structural dependencies", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "layout segmented rag", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Layout Segmented RAG", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "layout analysis & parsing", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Layout Analysis & Parsing", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "flattened chunks", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Flattened Chunks", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "flattened vector index", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Flattened Vector Index", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "fixed retrieval", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Fixed Retrieval", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "loses complex relationships", + "relation_name": 
"", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Loses complex relationships", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "bookrag (natively structure-aware)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to BookRAG (Natively Structure-aware)", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "hierarchical chunks", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Hierarchical Chunks", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to BookIndex", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "agent-based retrieval", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Agent-based Retrieval", + "source_ids": [ + 13 + ] + }, + { + "src_entity_name": "cref='#/texts/14'", + "tgt_entity_name": "accurate, structured-grounded", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/14' related to Accurate, structured-grounded", + "source_ids": [ + 13 + ] + } + ], + "node_idx": 13 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_130.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_130.json new file mode 100644 index 0000000..5236316 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_130.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (15)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the variable A as the output of a Synthesizer function. 
LaTeX: 𝐴 = Synthesizer ( 𝑞, 𝑁 𝑅 ) (15)", + "source_ids": [ + 130 + ] + } + ], + "relations": [], + "node_idx": 130 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_131.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_131.json new file mode 100644 index 0000000..213290b --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_131.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "table 3", + "entity_type": "TABLE", + "description": "table 3 is a table that categorizes operators utilized in bookrag by their function", + "source_ids": [ + 131 + ] + }, + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a product or system that utilizes various operators categorized by function", + "source_ids": [ + 131 + ] + } + ], + "relations": [ + { + "src_entity_name": "table 3", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "table 3 details the operators used within the bookrag system", + "source_ids": [ + 131 + ] + } + ], + "node_idx": 131 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_132.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_132.json new file mode 100644 index 0000000..dfd9923 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_132.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "table: cref='#/texts/136'...", + "entity_type": "TABLE", + "description": "A data table described as: cref='#/texts/136'", + "source_ids": [ + 132 + ] + }, + { + "entity_name": "cref", + "entity_type": "EQUATION_OR_FORMULA", + "description": "A cross-reference identifier or formula string found in the description, pointing to a specific text location ('#/texts/136').", + "source_ids": [ + 132 + ] + } + ], + "relations": [ + { + "src_entity_name": "table: cref='#/texts/136'...", + "tgt_entity_name": "cref", + "relation_name": "", + "weight": 9.0, + 
"description": "Table 'Table: cref='#/texts/136'...' contains data about 'cref'.", + "source_ids": [ + 132 + ] + } + ], + "node_idx": 132 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_133.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_133.json new file mode 100644 index 0000000..e617d83 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_133.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "8", + "entity_type": "MEASUREMENT", + "description": "8 is a numerical value mentioned in the text likely representing a count or identifier", + "source_ids": [ + 133 + ] + } + ], + "relations": [], + "node_idx": 133 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_134.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_134.json new file mode 100644 index 0000000..33dda76 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_134.json @@ -0,0 +1,231 @@ +{ + "entities": [ + { + "entity_name": "map operator", + "entity_type": "TASK_OR_PROBLEM", + "description": "the map operator is a component that performs fine grained analysis on individual evidence blocks or sub problems", + "source_ids": [ + 134 + ] + }, + { + "entity_name": "decompose", + "entity_type": "TASK_OR_PROBLEM", + "description": "decompose is a process that generates sub problems which are analyzed by the map operator", + "source_ids": [ + 134 + ] + }, + { + "entity_name": "reduce operator", + "entity_type": "TASK_OR_PROBLEM", + "description": "the reduce operator is a component that aggregates partial results to construct the final response", + "source_ids": [ + 134 + ] + }, + { + "entity_name": "global filter", + "entity_type": "TASK_OR_PROBLEM", + "description": "the global filter is a mechanism used to generate statistical counts as partial results", + "source_ids": [ + 134 + ] + }, + { + "entity_name": "evidence blocks", + "entity_type": "TASK_OR_PROBLEM", + 
"description": "evidence blocks are the individual units of content that the map operator analyzes", + "source_ids": [ + 134 + ] + }, + { + "entity_name": "sub problems", + "entity_type": "TASK_OR_PROBLEM", + "description": "sub problems are specific issues derived from decompose that are analyzed by the map operator", + "source_ids": [ + 134 + ] + }, + { + "entity_name": "intermediate insights", + "entity_type": "TASK_OR_PROBLEM", + "description": "intermediate insights are the outputs generated by the map operator during its analysis", + "source_ids": [ + 134 + ] + }, + { + "entity_name": "partial results", + "entity_type": "TASK_OR_PROBLEM", + "description": "partial results are the outputs from the map operator that are aggregated by the reduce operator", + "source_ids": [ + 134 + ] + }, + { + "entity_name": "answers to decomposed sub queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "answers to decomposed sub queries are a type of partial result aggregated by the reduce operator", + "source_ids": [ + 134 + ] + }, + { + "entity_name": "statistical counts", + "entity_type": "TASK_OR_PROBLEM", + "description": "statistical counts are a type of partial result derived from a global filter and aggregated by the reduce operator", + "source_ids": [ + 134 + ] + }, + { + "entity_name": "final response", + "entity_type": "TASK_OR_PROBLEM", + "description": "the final response is the constructed output created by the reduce operator", + "source_ids": [ + 134 + ] + }, + { + "entity_name": "detailed content extraction", + "entity_type": "TASK_OR_PROBLEM", + "description": "detailed content extraction is a capability handled by the system s separation of map and reduce operators", + "source_ids": [ + 134 + ] + }, + { + "entity_name": "high level reasoning synthesis", + "entity_type": "TASK_OR_PROBLEM", + "description": "high level reasoning synthesis is a capability handled by the system s separation of map and reduce operators", + "source_ids": [ + 134 + ] + } 
+ ], + "relations": [ + { + "src_entity_name": "map operator", + "tgt_entity_name": "decompose", + "relation_name": "", + "weight": 9.0, + "description": "the map operator analyzes sub problems generated from the decompose process", + "source_ids": [ + 134 + ] + }, + { + "src_entity_name": "reduce operator", + "tgt_entity_name": "map operator", + "relation_name": "", + "weight": 9.0, + "description": "the reduce operator aggregates the partial results generated by the map operator", + "source_ids": [ + 134 + ] + }, + { + "src_entity_name": "reduce operator", + "tgt_entity_name": "global filter", + "relation_name": "", + "weight": 8.0, + "description": "the reduce operator aggregates statistical counts derived from the global filter", + "source_ids": [ + 134 + ] + }, + { + "src_entity_name": "map operator", + "tgt_entity_name": "evidence blocks", + "relation_name": "", + "weight": 9.0, + "description": "the map operator performs analysis on individual evidence blocks", + "source_ids": [ + 134 + ] + }, + { + "src_entity_name": "map operator", + "tgt_entity_name": "sub problems", + "relation_name": "", + "weight": 9.0, + "description": "the map operator performs analysis on sub problems", + "source_ids": [ + 134 + ] + }, + { + "src_entity_name": "map operator", + "tgt_entity_name": "intermediate insights", + "relation_name": "", + "weight": 9.0, + "description": "the map operator generates intermediate insights as its output", + "source_ids": [ + 134 + ] + }, + { + "src_entity_name": "reduce operator", + "tgt_entity_name": "partial results", + "relation_name": "", + "weight": 9.0, + "description": "the reduce operator aggregates partial results", + "source_ids": [ + 134 + ] + }, + { + "src_entity_name": "reduce operator", + "tgt_entity_name": "answers to decomposed sub queries", + "relation_name": "", + "weight": 8.0, + "description": "the reduce operator aggregates answers to decomposed sub queries", + "source_ids": [ + 134 + ] + }, + { + "src_entity_name": "reduce 
operator", + "tgt_entity_name": "statistical counts", + "relation_name": "", + "weight": 8.0, + "description": "the reduce operator aggregates statistical counts", + "source_ids": [ + 134 + ] + }, + { + "src_entity_name": "reduce operator", + "tgt_entity_name": "final response", + "relation_name": "", + "weight": 9.0, + "description": "the reduce operator constructs the final response", + "source_ids": [ + 134 + ] + }, + { + "src_entity_name": "map operator", + "tgt_entity_name": "detailed content extraction", + "relation_name": "", + "weight": 8.0, + "description": "the map operator is responsible for detailed content extraction", + "source_ids": [ + 134 + ] + }, + { + "src_entity_name": "reduce operator", + "tgt_entity_name": "high level reasoning synthesis", + "relation_name": "", + "weight": 8.0, + "description": "the reduce operator is responsible for high level reasoning synthesis", + "source_ids": [ + 134 + ] + } + ], + "node_idx": 134 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_135.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_135.json new file mode 100644 index 0000000..5967cd2 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_135.json @@ -0,0 +1,283 @@ +{ + "entities": [ + { + "entity_name": "figure 4 b", + "entity_type": "IMAGE", + "description": "figure 4 b is an image presenting an execution trace for a single hop query", + "source_ids": [ + 135 + ] + }, + { + "entity_name": "single hop", + "entity_type": "TASK_OR_PROBLEM", + "description": "single hop refers to a specific type of query being illustrated in the text", + "source_ids": [ + 135 + ] + }, + { + "entity_name": "car", + "entity_type": "PRODUCT", + "description": "car is a key entity identified in the query what is the type of car in the ranking prompt example", + "source_ids": [ + 135 + ] + }, + { + "entity_name": "ranking prompt example", + "entity_type": "TASK_OR_PROBLEM", + "description": "ranking prompt 
example is a specific example context mentioned in the query regarding the type of car", + "source_ids": [ + 135 + ] + }, + { + "entity_name": "extract", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "extract is a method used to identify key entities like car", + "source_ids": [ + 135 + ] + }, + { + "entity_name": "select by entity", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "select by entity is a method used to retrieve relevant nodes after entity identification", + "source_ids": [ + 135 + ] + }, + { + "entity_name": "skyline filtering", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "skyline filtering is a technique used to refine nodes during the process", + "source_ids": [ + 135 + ] + }, + { + "entity_name": "reduce", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "reduce is a method used to synthesize the final answer", + "source_ids": [ + 135 + ] + }, + { + "entity_name": "agent", + "entity_type": "PERSON", + "description": "the agent is an entity that classifies queries and generates workflows in the described process", + "source_ids": [ + 135 + ] + }, + { + "entity_name": "planning phase", + "entity_type": "TASK_OR_PROBLEM", + "description": "the planning phase is the initial stage where the agent classifies the query and generates a workflow", + "source_ids": [ + 135 + ] + }, + { + "entity_name": "reasoning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "reasoning is a step used to refine nodes in the process", + "source_ids": [ + 135 + ] + }, + { + "entity_name": "answer", + "entity_type": "TASK_OR_PROBLEM", + "description": "the answer is the final output synthesized by the agent using the reduce method", + "source_ids": [ + 135 + ] + } + ], + "relations": [ + { + "src_entity_name": "figure 4 b", + "tgt_entity_name": "single hop", + "relation_name": "", + "weight": 10.0, + "description": "figure 4 b presents the execution trace for the single hop query", + "source_ids": [ + 135 + ] + }, + { + 
"src_entity_name": "single hop", + "tgt_entity_name": "car", + "relation_name": "", + "weight": 9.0, + "description": "the single hop query asks about the type of car in the example", + "source_ids": [ + 135 + ] + }, + { + "src_entity_name": "car", + "tgt_entity_name": "ranking prompt example", + "relation_name": "", + "weight": 8.0, + "description": "the car is the subject of the query within the ranking prompt example context", + "source_ids": [ + 135 + ] + }, + { + "src_entity_name": "extract", + "tgt_entity_name": "car", + "relation_name": "", + "weight": 9.0, + "description": "the extract method is used to identify the entity car", + "source_ids": [ + 135 + ] + }, + { + "src_entity_name": "select by entity", + "tgt_entity_name": "car", + "relation_name": "", + "weight": 8.0, + "description": "the select by entity method retrieves nodes related to the identified entity car", + "source_ids": [ + 135 + ] + }, + { + "src_entity_name": "skyline filtering", + "tgt_entity_name": "car", + "relation_name": "", + "weight": 7.0, + "description": "the skyline filtering technique refines the nodes related to car", + "source_ids": [ + 135 + ] + }, + { + "src_entity_name": "reduce", + "tgt_entity_name": "car", + "relation_name": "", + "weight": 7.0, + "description": "the reduce method synthesizes the answer regarding the car", + "source_ids": [ + 135 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "planning phase", + "relation_name": "", + "weight": 9.0, + "description": "the agent operates during the planning phase to classify queries and generate workflows", + "source_ids": [ + 135 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "extract", + "relation_name": "", + "weight": 9.0, + "description": "the agent uses the extract method to identify key entities", + "source_ids": [ + 135 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "select by entity", + "relation_name": "", + "weight": 9.0, + "description": "the agent uses the 
select by entity method to retrieve relevant nodes", + "source_ids": [ + 135 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "reasoning", + "relation_name": "", + "weight": 8.0, + "description": "the agent applies reasoning to refine nodes", + "source_ids": [ + 135 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "skyline filtering", + "relation_name": "", + "weight": 8.0, + "description": "the agent applies skyline filtering to refine nodes", + "source_ids": [ + 135 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "reduce", + "relation_name": "", + "weight": 9.0, + "description": "the agent uses the reduce method to synthesize the answer", + "source_ids": [ + 135 + ] + }, + { + "src_entity_name": "planning phase", + "tgt_entity_name": "agent", + "relation_name": "", + "weight": 9.0, + "description": "the planning phase is conducted by the agent", + "source_ids": [ + 135 + ] + }, + { + "src_entity_name": "extract", + "tgt_entity_name": "select by entity", + "relation_name": "", + "weight": 7.0, + "description": "the extract method precedes the select by entity method in the workflow", + "source_ids": [ + 135 + ] + }, + { + "src_entity_name": "select by entity", + "tgt_entity_name": "reasoning", + "relation_name": "", + "weight": 7.0, + "description": "the select by entity method is followed by reasoning in the workflow", + "source_ids": [ + 135 + ] + }, + { + "src_entity_name": "reasoning", + "tgt_entity_name": "skyline filtering", + "relation_name": "", + "weight": 7.0, + "description": "reasoning is followed by skyline filtering in the workflow", + "source_ids": [ + 135 + ] + }, + { + "src_entity_name": "skyline filtering", + "tgt_entity_name": "reduce", + "relation_name": "", + "weight": 7.0, + "description": "skyline filtering is followed by the reduce method in the workflow", + "source_ids": [ + 135 + ] + } + ], + "node_idx": 135 +} \ No newline at end of file diff --git 
a/e2e_test_output/kg_extractor_res/kg_extractor_res_136.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_136.json new file mode 100644 index 0000000..0fcb7e0 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_136.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "6 experiments", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of the paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section details the empirical validation of the proposed BookRAG method, including experimental setup, benchmarks used, and performance results compared to baselines.", + "source_ids": [ + 136 + ] + }, + { + "entity_name": "experiments", + "entity_type": "TASK_OR_PROBLEM", + "description": "Refers to the systematic computational procedures and evaluations conducted to validate the effectiveness of the BookRAG approach, as described in section 6.", + "source_ids": [ + 136 + ] + } + ], + "relations": [ + { + "src_entity_name": "experiments", + "tgt_entity_name": "6 experiments", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Experiments' is the primary topic covered in section 6.", + "source_ids": [ + 136 + ] + } + ], + "node_idx": 136 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_137.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_137.json new file mode 100644 index 0000000..5d545ef --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_137.json @@ -0,0 +1,127 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a method or system being evaluated in the experiments against baseline methods", + "source_ids": [ + 137 + ] + }, + { + "entity_name": "document qa tasks", + "entity_type": "TASK_OR_PROBLEM", + "description": "document qa tasks are the specific problems on which the efficiency and 
accuracy of bookrag and baseline methods are compared", + "source_ids": [ + 137 + ] + }, + { + "entity_name": "baseline methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "baseline methods are the strong existing approaches used for comparison against bookrag in the experiments", + "source_ids": [ + 137 + ] + }, + { + "entity_name": "efficiency", + "entity_type": "EVALUATION_METRIC", + "description": "efficiency is a metric used to evaluate the performance of bookrag and baseline methods", + "source_ids": [ + 137 + ] + }, + { + "entity_name": "accuracy", + "entity_type": "EVALUATION_METRIC", + "description": "accuracy is a metric used to evaluate the performance of bookrag and baseline methods", + "source_ids": [ + 137 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "document qa tasks", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is evaluated for its efficiency and accuracy specifically on document qa tasks", + "source_ids": [ + 137 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "baseline methods", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is compared against several strong baseline methods in the experiments", + "source_ids": [ + 137 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "efficiency", + "relation_name": "", + "weight": 8.0, + "description": "the efficiency of bookrag is evaluated and compared in the experiments", + "source_ids": [ + 137 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "accuracy", + "relation_name": "", + "weight": 8.0, + "description": "the accuracy of bookrag is evaluated and compared in the experiments", + "source_ids": [ + 137 + ] + }, + { + "src_entity_name": "baseline methods", + "tgt_entity_name": "efficiency", + "relation_name": "", + "weight": 8.0, + "description": "the efficiency of baseline methods is evaluated and compared in the experiments", + "source_ids": [ + 137 + ] + 
}, + { + "src_entity_name": "baseline methods", + "tgt_entity_name": "accuracy", + "relation_name": "", + "weight": 8.0, + "description": "the accuracy of baseline methods is evaluated and compared in the experiments", + "source_ids": [ + 137 + ] + }, + { + "src_entity_name": "document qa tasks", + "tgt_entity_name": "efficiency", + "relation_name": "", + "weight": 7.0, + "description": "efficiency is measured specifically on document qa tasks", + "source_ids": [ + 137 + ] + }, + { + "src_entity_name": "document qa tasks", + "tgt_entity_name": "accuracy", + "relation_name": "", + "weight": 7.0, + "description": "accuracy is measured specifically on document qa tasks", + "source_ids": [ + 137 + ] + } + ], + "node_idx": 137 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_138.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_138.json new file mode 100644 index 0000000..c6815ee --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_138.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "6.1 setup", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Experiments' within the BookRAG paper, this section details the experimental configuration, including baseline methods, evaluation metrics (efficiency and accuracy), and the document QA tasks used to assess the proposed approach.", + "source_ids": [ + 138 + ] + } + ], + "relations": [], + "node_idx": 138 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_139.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_139.json new file mode 100644 index 0000000..1ac8190 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_139.json @@ -0,0 +1,121 @@ +{ + "entities": [ + { + "entity_name": "table 4", + "entity_type": "TABLE", + "description": "table 4 is a table listing datasets used in experiments", + "source_ids": [ + 139 + ] + }, + { + "entity_name": "em", + 
"entity_type": "EVALUATION_METRIC", + "description": "em denotes exact match an evaluation metric used in the experiments", + "source_ids": [ + 139 + ] + }, + { + "entity_name": "f1", + "entity_type": "EVALUATION_METRIC", + "description": "f1 denotes f1 score an evaluation metric used in the experiments", + "source_ids": [ + 139 + ] + }, + { + "entity_name": "exact match", + "entity_type": "EVALUATION_METRIC", + "description": "exact match is the full name for the metric abbreviated as em", + "source_ids": [ + 139 + ] + }, + { + "entity_name": "f1 score", + "entity_type": "EVALUATION_METRIC", + "description": "f1 score is the full name for the metric abbreviated as f1", + "source_ids": [ + 139 + ] + }, + { + "entity_name": "datasets", + "entity_type": "DATASET_OR_CORPUS", + "description": "datasets refer to the collection of data used in the experiments mentioned in the text", + "source_ids": [ + 139 + ] + }, + { + "entity_name": "our", + "entity_type": "ORGANIZATION", + "description": "our refers to the research group or team conducting the experiments mentioned in the text", + "source_ids": [ + 139 + ] + }, + { + "entity_name": "experiments", + "entity_type": "TASK_OR_PROBLEM", + "description": "experiments are the activities for which the datasets in table 4 were used", + "source_ids": [ + 139 + ] + } + ], + "relations": [ + { + "src_entity_name": "table 4", + "tgt_entity_name": "datasets", + "relation_name": "", + "weight": 10.0, + "description": "table 4 lists the datasets used in the experiments", + "source_ids": [ + 139 + ] + }, + { + "src_entity_name": "em", + "tgt_entity_name": "exact match", + "relation_name": "", + "weight": 10.0, + "description": "em is the abbreviation for exact match", + "source_ids": [ + 139 + ] + }, + { + "src_entity_name": "f1", + "tgt_entity_name": "f1 score", + "relation_name": "", + "weight": 10.0, + "description": "f1 is the abbreviation for f1 score", + "source_ids": [ + 139 + ] + }, + { + "src_entity_name": "our", + 
"tgt_entity_name": "experiments", + "relation_name": "", + "weight": 8.0, + "description": "our group conducted the experiments referenced in the text", + "source_ids": [ + 139 + ] + }, + { + "src_entity_name": "datasets", + "tgt_entity_name": "experiments", + "relation_name": "", + "weight": 9.0, + "description": "the datasets listed in table 4 were utilized in the experiments", + "source_ids": [ + 139 + ] + } + ], + "node_idx": 139 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_14.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_14.json new file mode 100644 index 0000000..7fca7f4 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_14.json @@ -0,0 +1,461 @@ +{ + "entities": [ + { + "entity_name": "financial auditing", + "entity_type": "TASK_OR_PROBLEM", + "description": "financial auditing is a task where llms are applied but may face challenges with domain knowledge", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "legal compliance", + "entity_type": "TASK_OR_PROBLEM", + "description": "legal compliance is a task where llms are applied but may face challenges with domain knowledge", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "scientific discovery", + "entity_type": "TASK_OR_PROBLEM", + "description": "scientific discovery is a task where llms are applied but may face challenges with domain knowledge", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "llms", + "entity_type": "TECHNOLOGY", + "description": "llms are large language models that may lead to missing domain knowledge and generating outdated information", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "retrieval augmented generation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "retrieval augmented generation rag is a method adopted to address llm limitations by retrieving relevant domain knowledge", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "rag", + "entity_type": 
"METHOD_OR_TECHNIQUE", + "description": "rag is an abbreviation for retrieval augmented generation used to guide llms during response generation", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "enterprise scenarios", + "entity_type": "LOCATION", + "description": "enterprise scenarios are real world contexts where domain knowledge is stored in long form documents", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "technical handbooks", + "entity_type": "PRODUCT", + "description": "technical handbooks are long form documents where domain knowledge is often stored", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "api reference manuals", + "entity_type": "PRODUCT", + "description": "api reference manuals are long form documents where domain knowledge is often stored", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "operational guidebooks", + "entity_type": "PRODUCT", + "description": "operational guidebooks are long form documents where domain knowledge is often stored", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "books", + "entity_type": "PRODUCT", + "description": "books are a structure followed by long form documents characterized by intricate layouts and logical hierarchies", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "tables of contents", + "entity_type": "PRODUCT", + "description": "tables of contents are explicit structural elements found in long form documents", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "nested chapters", + "entity_type": "PRODUCT", + "description": "nested chapters are structural elements found in long form documents", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "multi level sections", + "entity_type": "PRODUCT", + "description": "multi level sections are structural elements found in long form documents", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "rag system", + "entity_type": "SOFTWARE", + "description": "a rag system is designed in this paper for qa over long and 
highly structured documents", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "qa", + "entity_type": "TASK_OR_PROBLEM", + "description": "qa refers to question answering the specific task the rag system is designed for", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "external sources", + "entity_type": "LOCATION", + "description": "external sources are referenced as the origin of relevant domain knowledge retrieved by rag", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "response generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "response generation is the process guided by rag to produce answers", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "domain knowledge", + "entity_type": "CONCEPT", + "description": "domain knowledge is the specific information retrieved from external sources to guide llms", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "long form documents", + "entity_type": "PRODUCT", + "description": "long form documents are the type of storage for domain knowledge in enterprise scenarios", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "intricate layouts", + "entity_type": "SHAPE", + "description": "intricate layouts are a feature of the structure of long form documents", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "logical hierarchies", + "entity_type": "CONCEPT", + "description": "logical hierarchies are a feature of the structure of long form documents", + "source_ids": [ + 14 + ] + }, + { + "entity_name": "this paper", + "entity_type": "BOOK", + "description": "this paper is the document where the authors aim to design an effective rag system", + "source_ids": [ + 14 + ] + } + ], + "relations": [ + { + "src_entity_name": "llms", + "tgt_entity_name": "financial auditing", + "relation_name": "", + "weight": 8.0, + "description": "llms are applied in financial auditing but may miss domain knowledge", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "llms", + "tgt_entity_name": 
"legal compliance", + "relation_name": "", + "weight": 8.0, + "description": "llms are applied in legal compliance but may miss domain knowledge", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "llms", + "tgt_entity_name": "scientific discovery", + "relation_name": "", + "weight": 8.0, + "description": "llms are applied in scientific discovery but may miss domain knowledge", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "llms", + "relation_name": "", + "weight": 9.0, + "description": "rag is used to guide llms during response generation to address their limitations", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "rag", + "tgt_entity_name": "llms", + "relation_name": "", + "weight": 9.0, + "description": "rag is used to guide llms during response generation to address their limitations", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "enterprise scenarios", + "relation_name": "", + "weight": 8.0, + "description": "rag is widely adopted in real world enterprise scenarios", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "enterprise scenarios", + "tgt_entity_name": "technical handbooks", + "relation_name": "", + "weight": 9.0, + "description": "domain knowledge in enterprise scenarios is often stored in technical handbooks", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "enterprise scenarios", + "tgt_entity_name": "api reference manuals", + "relation_name": "", + "weight": 9.0, + "description": "domain knowledge in enterprise scenarios is often stored in api reference manuals", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "enterprise scenarios", + "tgt_entity_name": "operational guidebooks", + "relation_name": "", + "weight": 9.0, + "description": "domain knowledge in enterprise scenarios is often stored in operational guidebooks", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": 
"technical handbooks", + "tgt_entity_name": "books", + "relation_name": "", + "weight": 7.0, + "description": "technical handbooks follow the structure of books", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "api reference manuals", + "tgt_entity_name": "books", + "relation_name": "", + "weight": 7.0, + "description": "api reference manuals follow the structure of books", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "operational guidebooks", + "tgt_entity_name": "books", + "relation_name": "", + "weight": 7.0, + "description": "operational guidebooks follow the structure of books", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "books", + "tgt_entity_name": "tables of contents", + "relation_name": "", + "weight": 8.0, + "description": "books are characterized by explicit tables of contents", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "books", + "tgt_entity_name": "nested chapters", + "relation_name": "", + "weight": 8.0, + "description": "books are characterized by nested chapters", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "books", + "tgt_entity_name": "multi level sections", + "relation_name": "", + "weight": 8.0, + "description": "books are characterized by multi level sections", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "rag system", + "tgt_entity_name": "qa", + "relation_name": "", + "weight": 10.0, + "description": "the rag system is designed for qa over long and highly structured documents", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "rag system", + "tgt_entity_name": "technical handbooks", + "relation_name": "", + "weight": 8.0, + "description": "the rag system is designed to handle qa over documents like technical handbooks", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "rag system", + "tgt_entity_name": "api reference manuals", + "relation_name": "", + "weight": 8.0, + "description": "the rag system is designed to handle qa over documents like api 
reference manuals", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "rag system", + "tgt_entity_name": "operational guidebooks", + "relation_name": "", + "weight": 8.0, + "description": "the rag system is designed to handle qa over documents like operational guidebooks", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "external sources", + "relation_name": "", + "weight": 9.0, + "description": "rag retrieves relevant domain knowledge from external sources", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "response generation", + "relation_name": "", + "weight": 9.0, + "description": "rag is used to guide the llm during response generation", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "domain knowledge", + "relation_name": "", + "weight": 9.0, + "description": "rag retrieves domain knowledge to address llm limitations", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "enterprise scenarios", + "tgt_entity_name": "long form documents", + "relation_name": "", + "weight": 9.0, + "description": "domain knowledge in enterprise scenarios is stored in long form documents", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "long form documents", + "tgt_entity_name": "intricate layouts", + "relation_name": "", + "weight": 8.0, + "description": "long form documents are characterized by intricate layouts", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "long form documents", + "tgt_entity_name": "logical hierarchies", + "relation_name": "", + "weight": 8.0, + "description": "long form documents are characterized by rigorous logical hierarchies", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "this paper", + "tgt_entity_name": "rag system", + "relation_name": "", + "weight": 10.0, + "description": "this paper aims to design an effective rag 
system", + "source_ids": [ + 14 + ] + }, + { + "src_entity_name": "rag system", + "tgt_entity_name": "long form documents", + "relation_name": "", + "weight": 9.0, + "description": "the rag system is designed for qa over long and highly structured documents", + "source_ids": [ + 14 + ] + } + ], + "node_idx": 14 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_140.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_140.json new file mode 100644 index 0000000..d095fcd --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_140.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "table: cref='#/texts/143'...", + "entity_type": "TABLE", + "description": "A data table described as: cref='#/texts/143'", + "source_ids": [ + 140 + ] + }, + { + "entity_name": "texts/143", + "entity_type": "SECTION_TITLE", + "description": "A reference identifier extracted from the description string 'cref='#/texts/143'', likely pointing to a specific section or text element within a document structure.", + "source_ids": [ + 140 + ] + } + ], + "relations": [ + { + "src_entity_name": "table: cref='#/texts/143'...", + "tgt_entity_name": "texts/143", + "relation_name": "", + "weight": 9.0, + "description": "Table 'Table: cref='#/texts/143'...' 
contains data about 'texts/143'.", + "source_ids": [ + 140 + ] + } + ], + "node_idx": 140 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_141.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_141.json new file mode 100644 index 0000000..074575f --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_141.json @@ -0,0 +1,389 @@ +{ + "entities": [ + { + "entity_name": "mmlongbench", + "entity_type": "DATASET_OR_CORPUS", + "description": "mmlongbench is a comprehensive benchmark designed to evaluate qa capabilities on long form documents", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "m3docvqa", + "entity_type": "DATASET_OR_CORPUS", + "description": "m3docvqa is an open domain benchmark designed to test rag systems on html type documents from wikipedia", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "qasper", + "entity_type": "DATASET_OR_CORPUS", + "description": "qasper is a qa dataset focused on scientific papers requiring evidence retrieval from the entire document", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "llm", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "llm is used to generate global questions from selected document elements", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "human annotators", + "entity_type": "PERSON", + "description": "human annotators are individuals who answer and refine the synthesized qa pairs", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "table 4", + "entity_type": "TABLE", + "description": "table 4 presents the statistics of the datasets mentioned", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "20", + "entity_type": "PERCENTAGE", + "description": "20 represents the proportion of the final qa pairs that are synthesized additional pairs", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "html type documents", + "entity_type": "PRODUCT", + "description": "html type documents are the source material 
for the m3docvqa benchmark", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "wikipedia pages", + "entity_type": "LOCATION", + "description": "wikipedia pages are the specific source of the html type documents used in m3docvqa", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "guidebooks", + "entity_type": "PRODUCT", + "description": "guidebooks are one of the diverse categories of long form documents covered by mmlongbench", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "financial reports", + "entity_type": "PRODUCT", + "description": "financial reports are one of the diverse categories of long form documents covered by mmlongbench", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "industry files", + "entity_type": "PRODUCT", + "description": "industry files are one of the diverse categories of long form documents covered by mmlongbench", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "scientific papers", + "entity_type": "PRODUCT", + "description": "scientific papers are the focus of the qasper dataset", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "tables", + "entity_type": "TABLE", + "description": "tables are document elements from which the llm generates global questions", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "figures", + "entity_type": "IMAGE", + "description": "figures are document elements from which the llm generates global questions", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "global level questions", + "entity_type": "TASK_OR_PROBLEM", + "description": "global level questions are the specific type of questions synthesized to address scarcity in original benchmarks", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "qa pairs", + "entity_type": "TASK_OR_PROBLEM", + "description": "qa pairs are the output units generated by the llm and refined by human annotators", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "rag systems", + "entity_type": "SOFTWARE", + "description": 
"rag systems are the target systems tested by the m3docvqa benchmark", + "source_ids": [ + 141 + ] + }, + { + "entity_name": "complex document qa tasks", + "entity_type": "TASK_OR_PROBLEM", + "description": "complex document qa tasks are the general category of problems addressed by the three benchmarks", + "source_ids": [ + 141 + ] + } + ], + "relations": [ + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "m3docvqa", + "relation_name": "", + "weight": 9.0, + "description": "both are widely adopted benchmarking datasets used for complex document qa tasks", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "both are widely adopted benchmarking datasets used for complex document qa tasks", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "m3docvqa", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "both are widely adopted benchmarking datasets used for complex document qa tasks", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "human annotators", + "relation_name": "", + "weight": 8.0, + "description": "the llm generates questions which are then answered and refined by human annotators", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "table 4", + "relation_name": "", + "weight": 6.0, + "description": "the llm s generated questions contribute to the statistics presented in table 4", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "human annotators", + "tgt_entity_name": "table 4", + "relation_name": "", + "weight": 6.0, + "description": "the work of human annotators contributes to the statistics presented in table 4", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "table 4", + "relation_name": "", + "weight": 7.0, + "description": "statistics for mmlongbench are 
presented in table 4", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "m3docvqa", + "tgt_entity_name": "table 4", + "relation_name": "", + "weight": 7.0, + "description": "statistics for m3docvqa are presented in table 4", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "qasper", + "tgt_entity_name": "table 4", + "relation_name": "", + "weight": 7.0, + "description": "statistics for qasper are presented in table 4", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "m3docvqa", + "tgt_entity_name": "html type documents", + "relation_name": "", + "weight": 9.0, + "description": "m3docvqa tests rag systems on a collection of html type documents", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "html type documents", + "tgt_entity_name": "wikipedia pages", + "relation_name": "", + "weight": 10.0, + "description": "the html type documents are sourced from wikipedia pages", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "guidebooks", + "relation_name": "", + "weight": 8.0, + "description": "mmlongbench covers guidebooks as a category of documents", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "financial reports", + "relation_name": "", + "weight": 8.0, + "description": "mmlongbench covers financial reports as a category of documents", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "industry files", + "relation_name": "", + "weight": 8.0, + "description": "mmlongbench covers industry files as a category of documents", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "qasper", + "tgt_entity_name": "scientific papers", + "relation_name": "", + "weight": 10.0, + "description": "qasper is focused on scientific papers", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "tables", + "relation_name": "", + "weight": 9.0, + "description": "the llm generates 
questions from tables", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "figures", + "relation_name": "", + "weight": 9.0, + "description": "the llm generates questions from figures", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "global level questions", + "relation_name": "", + "weight": 10.0, + "description": "the llm generates global level questions", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "human annotators", + "tgt_entity_name": "qa pairs", + "relation_name": "", + "weight": 9.0, + "description": "human annotators answer and refine qa pairs", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "m3docvqa", + "tgt_entity_name": "rag systems", + "relation_name": "", + "weight": 10.0, + "description": "m3docvqa is designed to test rag systems", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "complex document qa tasks", + "relation_name": "", + "weight": 9.0, + "description": "mmlongbench is used for complex document qa tasks", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "m3docvqa", + "tgt_entity_name": "complex document qa tasks", + "relation_name": "", + "weight": 9.0, + "description": "m3docvqa is used for complex document qa tasks", + "source_ids": [ + 141 + ] + }, + { + "src_entity_name": "qasper", + "tgt_entity_name": "complex document qa tasks", + "relation_name": "", + "weight": 9.0, + "description": "qasper is used for complex document qa tasks", + "source_ids": [ + 141 + ] + } + ], + "node_idx": 141 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_142.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_142.json new file mode 100644 index 0000000..dfe940a --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_142.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "wikipedia", + "entity_type": "ORGANIZATION", + 
"description": "wikipedia is an organization associated with the url provided in the text", + "source_ids": [ + 142 + ] + }, + { + "entity_name": "https www wikipedia org", + "entity_type": "LOCATION", + "description": "https www wikipedia org is a web address mentioned in the text", + "source_ids": [ + 142 + ] + } + ], + "relations": [ + { + "src_entity_name": "wikipedia", + "tgt_entity_name": "https www wikipedia org", + "relation_name": "", + "weight": 10.0, + "description": "wikipedia is the organization represented by the url https www wikipedia org", + "source_ids": [ + 142 + ] + } + ], + "node_idx": 142 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_143.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_143.json new file mode 100644 index 0000000..b872897 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_143.json @@ -0,0 +1,5 @@ +{ + "entities": [], + "relations": [], + "node_idx": 143 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_144.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_144.json new file mode 100644 index 0000000..7adf379 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_144.json @@ -0,0 +1,429 @@ +{ + "entities": [ + { + "entity_name": "exact match", + "entity_type": "EVALUATION_METRIC", + "description": "exact match is a primary evaluation metric used to assess performance in the text", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "accuracy", + "entity_type": "EVALUATION_METRIC", + "description": "accuracy is a primary evaluation metric used to assess performance in the text", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "token based f1 score", + "entity_type": "EVALUATION_METRIC", + "description": "token based f1 score is a primary evaluation metric used to assess performance in the text", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "time cost", + "entity_type": 
"EVALUATION_METRIC", + "description": "time cost is a metric used to assess efficiency during the response phase", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "token usage", + "entity_type": "EVALUATION_METRIC", + "description": "token usage is a metric used to assess efficiency during the response phase", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "retrieval recall", + "entity_type": "EVALUATION_METRIC", + "description": "retrieval recall is a metric used to evaluate methods including pdf parsing", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "pdf parsing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "pdf parsing is a method mentioned in the text that is evaluated using retrieval recall", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "mmlongbench", + "entity_type": "DATASET_OR_CORPUS", + "description": "mmlongbench is a dataset that provides page numbers for filtering candidate blocks", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "qasper", + "entity_type": "DATASET_OR_CORPUS", + "description": "qasper is a dataset that provides evidence statements for filtering candidate blocks", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "texts", + "entity_type": "TABLE", + "description": "texts are specific pdf blocks labeled to establish ground truth", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "titles", + "entity_type": "TABLE", + "description": "titles are specific pdf blocks labeled to establish ground truth", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "tables", + "entity_type": "TABLE", + "description": "tables are specific pdf blocks labeled to establish ground truth", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "images", + "entity_type": "TABLE", + "description": "images are specific pdf blocks labeled to establish ground truth", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "formulas", + "entity_type": "TABLE", + "description": "formulas are specific pdf blocks 
labeled to establish ground truth", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "qa", + "entity_type": "TASK_OR_PROBLEM", + "description": "qa refers to the question answering task for which official metrics are specified by each dataset", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "ground truth", + "entity_type": "CONCEPT", + "description": "ground truth is the established standard used to evaluate retrieval recall and guide manual labeling", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "metadata", + "entity_type": "CONCEPT", + "description": "metadata refers to the ground truth evidence information provided in each dataset that guides the labeling process", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "modality", + "entity_type": "CONCEPT", + "description": "modality is a given attribute used to filter candidate blocks across all datasets", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "query", + "entity_type": "TASK_OR_PROBLEM", + "description": "a query is a specific question for which retrieval recall is recorded particularly when pdf parsing errors occur", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "pdf blocks", + "entity_type": "TABLE", + "description": "pdf blocks are the specific units of content texts titles tables images formulas that are manually labeled", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "candidate blocks", + "entity_type": "TABLE", + "description": "candidate blocks are the set of blocks filtered using modality page numbers and evidence statements before manual annotation", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "response phase", + "entity_type": "TIME", + "description": "the response phase is the specific time period during which time cost and token usage are measured", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "page numbers", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 144 + ] + }, + { + "entity_name": "evidence 
statements", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 144 + ] + } + ], + "relations": [ + { + "src_entity_name": "exact match", + "tgt_entity_name": "accuracy", + "relation_name": "", + "weight": 9.0, + "description": "both are primary evaluation metrics used together in the assessment process", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "exact match", + "tgt_entity_name": "token based f1 score", + "relation_name": "", + "weight": 9.0, + "description": "both are primary evaluation metrics used together in the assessment process", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "accuracy", + "tgt_entity_name": "token based f1 score", + "relation_name": "", + "weight": 9.0, + "description": "both are primary evaluation metrics used together in the assessment process", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "time cost", + "tgt_entity_name": "token usage", + "relation_name": "", + "weight": 8.0, + "description": "both are metrics used to assess efficiency during the response phase", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "pdf parsing", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 10.0, + "description": "retrieval recall is the specific metric used to evaluate the pdf parsing method", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "page numbers", + "relation_name": "", + "weight": 7.0, + "description": "mmlongbench provides page numbers used to filter candidate blocks", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "qasper", + "tgt_entity_name": "evidence statements", + "relation_name": "", + "weight": 7.0, + "description": "qasper provides evidence statements used to filter candidate blocks", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "texts", + "tgt_entity_name": "formulas", + "relation_name": "", + "weight": 6.0, + "description": "both are types of pdf blocks manually 
labeled to establish ground truth", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "tables", + "tgt_entity_name": "images", + "relation_name": "", + "weight": 6.0, + "description": "both are types of pdf blocks manually labeled to establish ground truth", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "qa", + "tgt_entity_name": "exact match", + "relation_name": "", + "weight": 9.0, + "description": "exact match is a metric used to evaluate the qa task", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "qa", + "tgt_entity_name": "accuracy", + "relation_name": "", + "weight": 9.0, + "description": "accuracy is a metric used to evaluate the qa task", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "qa", + "tgt_entity_name": "token based f1 score", + "relation_name": "", + "weight": 9.0, + "description": "token based f1 score is a metric used to evaluate the qa task", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "ground truth", + "tgt_entity_name": "pdf blocks", + "relation_name": "", + "weight": 10.0, + "description": "pdf blocks are manually labeled to establish the ground truth", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "ground truth", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 9.0, + "description": "retrieval recall is measured against the ground truth", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "metadata", + "tgt_entity_name": "ground truth", + "relation_name": "", + "weight": 8.0, + "description": "metadata provides the ground truth evidence used to guide the labeling process", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "candidate blocks", + "tgt_entity_name": "pdf blocks", + "relation_name": "", + "weight": 7.0, + "description": "candidate blocks are filtered from the set of pdf blocks", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "candidate blocks", + "tgt_entity_name": "modality", + "relation_name": "", + 
"weight": 8.0, + "description": "candidate blocks are filtered using the given modality", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "candidate blocks", + "tgt_entity_name": "page numbers", + "relation_name": "", + "weight": 8.0, + "description": "candidate blocks are filtered using page numbers from mmlongbench", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "candidate blocks", + "tgt_entity_name": "evidence statements", + "relation_name": "", + "weight": 8.0, + "description": "candidate blocks are filtered using evidence statements from qasper", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "pdf parsing", + "tgt_entity_name": "pdf blocks", + "relation_name": "", + "weight": 7.0, + "description": "pdf parsing errors affect the availability of items within pdf blocks", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "query", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 9.0, + "description": "retrieval recall is recorded for a specific query when a pdf parsing error occurs", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "time cost", + "tgt_entity_name": "response phase", + "relation_name": "", + "weight": 8.0, + "description": "time cost is measured during the response phase", + "source_ids": [ + 144 + ] + }, + { + "src_entity_name": "token usage", + "tgt_entity_name": "response phase", + "relation_name": "", + "weight": 8.0, + "description": "token usage is measured during the response phase", + "source_ids": [ + 144 + ] + } + ], + "node_idx": 144 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_145.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_145.json new file mode 100644 index 0000000..b04f160 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_145.json @@ -0,0 +1,61 @@ +{ + "entities": [ + { + "entity_name": "baselines", + "entity_type": "TASK_OR_PROBLEM", + "description": "baselines refer to the 
standard configurations used for comparison in the experiments", + "source_ids": [ + 145 + ] + }, + { + "entity_name": "three model configurations", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "three model configurations are the specific experimental setups considered in the study", + "source_ids": [ + 145 + ] + }, + { + "entity_name": "our experiments", + "entity_type": "EVENT", + "description": "our experiments refer to the specific study or set of trials being conducted to evaluate the model configurations", + "source_ids": [ + 145 + ] + } + ], + "relations": [ + { + "src_entity_name": "baselines", + "tgt_entity_name": "three model configurations", + "relation_name": "", + "weight": 9.0, + "description": "the baselines consist of or are defined by the three model configurations used in the experiments", + "source_ids": [ + 145 + ] + }, + { + "src_entity_name": "our experiments", + "tgt_entity_name": "baselines", + "relation_name": "", + "weight": 9.0, + "description": "the experiments consider the baselines as part of their evaluation process", + "source_ids": [ + 145 + ] + }, + { + "src_entity_name": "our experiments", + "tgt_entity_name": "three model configurations", + "relation_name": "", + "weight": 10.0, + "description": "the experiments explicitly consider three model configurations as their primary focus", + "source_ids": [ + 145 + ] + } + ], + "node_idx": 145 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_146.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_146.json new file mode 100644 index 0000000..2ba35cc --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_146.json @@ -0,0 +1,119 @@ +{ + "entities": [ + { + "entity_name": "conventional rag", + "entity_type": "TASK_OR_PROBLEM", + "description": "conventional rag is described as the most common pipeline for document analysis involving text extraction and chunking", + "source_ids": [ + 146 + ] + }, + { + 
"entity_name": "bm25", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "bm25 is identified as a strong and widely used retrieval model selected for implementation", + "source_ids": [ + 146 + ] + }, + { + "entity_name": "vanilla rag", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "vanilla rag is identified as a strong and widely used retrieval model selected for implementation", + "source_ids": [ + 146 + ] + }, + { + "entity_name": "layout vanilla", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "layout vanilla is a variant of vanilla rag that utilizes document layout analysis for semantic chunking", + "source_ids": [ + 146 + ] + }, + { + "entity_name": "document analysis", + "entity_type": "TASK_OR_PROBLEM", + "description": "document analysis is the general task where raw text is extracted and processed in the described pipeline", + "source_ids": [ + 146 + ] + }, + { + "entity_name": "raw text", + "entity_type": "MATERIAL", + "description": "raw text is the input material that is first extracted in the pipeline", + "source_ids": [ + 146 + ] + }, + { + "entity_name": "segments", + "entity_type": "MEASUREMENT", + "description": "segments are the chunks of specified size that the raw text is divided into", + "source_ids": [ + 146 + ] + }, + { + "entity_name": "document layout analysis", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "document layout analysis is the technique used by layout vanilla for semantic chunking", + "source_ids": [ + 146 + ] + }, + { + "entity_name": "semantic chunking", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "semantic chunking is the process of dividing text into segments based on meaning utilized by layout vanilla", + "source_ids": [ + 146 + ] + } + ], + "relations": [ + { + "src_entity_name": "conventional rag", + "tgt_entity_name": "bm25", + "relation_name": "", + "weight": 9.0, + "description": "conventional rag is the pipeline where bm25 is selected as a retrieval model", + 
"source_ids": [ + 146 + ] + }, + { + "src_entity_name": "conventional rag", + "tgt_entity_name": "vanilla rag", + "relation_name": "", + "weight": 9.0, + "description": "conventional rag is the pipeline where vanilla rag is selected as a retrieval model", + "source_ids": [ + 146 + ] + }, + { + "src_entity_name": "layout vanilla", + "tgt_entity_name": "vanilla rag", + "relation_name": "", + "weight": 10.0, + "description": "layout vanilla is a variant that builds upon vanilla rag by adding document layout analysis", + "source_ids": [ + 146 + ] + }, + { + "src_entity_name": "layout vanilla", + "tgt_entity_name": "conventional rag", + "relation_name": "", + "weight": 8.0, + "description": "layout vanilla is implemented as part of the conventional rag pipeline described in the text", + "source_ids": [ + 146 + ] + } + ], + "node_idx": 146 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_147.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_147.json new file mode 100644 index 0000000..0833017 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_147.json @@ -0,0 +1,177 @@ +{ + "entities": [ + { + "entity_name": "graph based rag", + "entity_type": "TECHNOLOGY", + "description": "graph based rag is a method that extracts textual content from documents and leverages graph data during retrieval", + "source_ids": [ + 147 + ] + }, + { + "entity_name": "raptor", + "entity_type": "TECHNOLOGY", + "description": "raptor is a specific technology selected as an example of graph based rag methods", + "source_ids": [ + 147 + ] + }, + { + "entity_name": "graphrag", + "entity_type": "TECHNOLOGY", + "description": "graphrag is a specific technology selected as an example of graph based rag methods", + "source_ids": [ + 147 + ] + }, + { + "entity_name": "graphrag global", + "entity_type": "TECHNOLOGY", + "description": "graphrag global is a version of graphrag that employs global search methods", + "source_ids": [ + 147 + 
] + }, + { + "entity_name": "graphrag local", + "entity_type": "TECHNOLOGY", + "description": "graphrag local is a version of graphrag that employs local search methods", + "source_ids": [ + 147 + ] + }, + { + "entity_name": "documents", + "entity_type": "PRODUCT", + "description": "documents are the textual content from which graph based rag methods extract information", + "source_ids": [ + 147 + ] + }, + { + "entity_name": "graph data", + "entity_type": "TECHNOLOGY", + "description": "graph data is the type of data leveraged during the retrieval process in graph based rag methods", + "source_ids": [ + 147 + ] + }, + { + "entity_name": "retrieval", + "entity_type": "TASK_OR_PROBLEM", + "description": "retrieval is the process performed by graph based rag methods after extracting textual content", + "source_ids": [ + 147 + ] + }, + { + "entity_name": "global search methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "global search methods are employed by the graphrag global version", + "source_ids": [ + 147 + ] + }, + { + "entity_name": "local search methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "local search methods are employed by the graphrag local version", + "source_ids": [ + 147 + ] + } + ], + "relations": [ + { + "src_entity_name": "graph based rag", + "tgt_entity_name": "raptor", + "relation_name": "", + "weight": 9.0, + "description": "raptor is selected as a specific instance of graph based rag methods", + "source_ids": [ + 147 + ] + }, + { + "src_entity_name": "graph based rag", + "tgt_entity_name": "graphrag", + "relation_name": "", + "weight": 9.0, + "description": "graphrag is selected as a specific instance of graph based rag methods", + "source_ids": [ + 147 + ] + }, + { + "src_entity_name": "graphrag", + "tgt_entity_name": "graphrag global", + "relation_name": "", + "weight": 10.0, + "description": "graphrag global is a version of graphrag that uses global search methods", + "source_ids": [ + 147 + ] + }, + { + 
"src_entity_name": "graphrag", + "tgt_entity_name": "graphrag local", + "relation_name": "", + "weight": 10.0, + "description": "graphrag local is a version of graphrag that uses local search methods", + "source_ids": [ + 147 + ] + }, + { + "src_entity_name": "graph based rag", + "tgt_entity_name": "documents", + "relation_name": "", + "weight": 9.0, + "description": "graph based rag extracts textual content from documents", + "source_ids": [ + 147 + ] + }, + { + "src_entity_name": "graph based rag", + "tgt_entity_name": "graph data", + "relation_name": "", + "weight": 9.0, + "description": "graph based rag leverages graph data during retrieval", + "source_ids": [ + 147 + ] + }, + { + "src_entity_name": "graph based rag", + "tgt_entity_name": "retrieval", + "relation_name": "", + "weight": 8.0, + "description": "graph based rag performs retrieval as part of its process", + "source_ids": [ + 147 + ] + }, + { + "src_entity_name": "graphrag global", + "tgt_entity_name": "global search methods", + "relation_name": "", + "weight": 10.0, + "description": "graphrag global employs global search methods", + "source_ids": [ + 147 + ] + }, + { + "src_entity_name": "graphrag local", + "tgt_entity_name": "local search methods", + "relation_name": "", + "weight": 10.0, + "description": "graphrag local employs local search methods", + "source_ids": [ + 147 + ] + } + ], + "node_idx": 147 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_148.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_148.json new file mode 100644 index 0000000..722cbb9 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_148.json @@ -0,0 +1,231 @@ +{ + "entities": [ + { + "entity_name": "layoutsegmentedrag", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "layoutsegmentedrag is a category of methods that utilize layout analysis to segment document content into discrete structural units", + "source_ids": [ + 148 + ] + }, + { + 
"entity_name": "mm vanilla", + "entity_type": "PRODUCT", + "description": "mm vanilla is a method that utilizes multi modal embeddings for visual and textual content", + "source_ids": [ + 148 + ] + }, + { + "entity_name": "pageindex", + "entity_type": "PRODUCT", + "description": "pageindex is a method or system referenced as an inspiration for a tree based method", + "source_ids": [ + 148 + ] + }, + { + "entity_name": "treetraverse", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "treetraverse is a tree based method inspired by pageindex where an llm navigates the document s tree structure", + "source_ids": [ + 148 + ] + }, + { + "entity_name": "docetl", + "entity_type": "SOFTWARE", + "description": "docetl is a declarative system for complex document processing", + "source_ids": [ + 148 + ] + }, + { + "entity_name": "graphranker", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "graphranker is a graph based method extended from hipporag that applies personalized pagerank to rank relevant nodes", + "source_ids": [ + 148 + ] + }, + { + "entity_name": "hipporag", + "entity_type": "METHOD_OR_ARCHITECTURE", + "description": "hipporag is a method or architecture from which graphranker is extended", + "source_ids": [ + 148 + ] + }, + { + "entity_name": "personalized pagerank", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "personalized pagerank is a technique applied by graphranker to rank relevant nodes", + "source_ids": [ + 148 + ] + }, + { + "entity_name": "llm", + "entity_type": "TECHNOLOGY", + "description": "llm is a technology used by treetraverse to navigate the document s tree structure", + "source_ids": [ + 148 + ] + }, + { + "entity_name": "page 39", + "entity_type": "PUBLICATION_VENUE", + "description": "page 39 is a citation reference associated with the pageindex method", + "source_ids": [ + 148 + ] + }, + { + "entity_name": "page 47", + "entity_type": "PUBLICATION_VENUE", + "description": "page 47 is a citation reference 
associated with the docetl system", + "source_ids": [ + 148 + ] + }, + { + "entity_name": "page 19", + "entity_type": "PUBLICATION_VENUE", + "description": "page 19 is a citation reference associated with the hipporag method", + "source_ids": [ + 148 + ] + }, + { + "entity_name": "page 20", + "entity_type": "PUBLICATION_VENUE", + "description": "page 20 is a citation reference associated with the personalized pagerank technique", + "source_ids": [ + 148 + ] + } + ], + "relations": [ + { + "src_entity_name": "layoutsegmentedrag", + "tgt_entity_name": "mm vanilla", + "relation_name": "", + "weight": 9.0, + "description": "mm vanilla is included as a method within the layoutsegmentedrag category", + "source_ids": [ + 148 + ] + }, + { + "src_entity_name": "layoutsegmentedrag", + "tgt_entity_name": "treetraverse", + "relation_name": "", + "weight": 9.0, + "description": "treetraverse is included as a method within the layoutsegmentedrag category", + "source_ids": [ + 148 + ] + }, + { + "src_entity_name": "layoutsegmentedrag", + "tgt_entity_name": "docetl", + "relation_name": "", + "weight": 9.0, + "description": "docetl is included as a method within the layoutsegmentedrag category", + "source_ids": [ + 148 + ] + }, + { + "src_entity_name": "layoutsegmentedrag", + "tgt_entity_name": "graphranker", + "relation_name": "", + "weight": 9.0, + "description": "graphranker is included as a method within the layoutsegmentedrag category", + "source_ids": [ + 148 + ] + }, + { + "src_entity_name": "treetraverse", + "tgt_entity_name": "pageindex", + "relation_name": "", + "weight": 8.0, + "description": "treetraverse is inspired by pageindex", + "source_ids": [ + 148 + ] + }, + { + "src_entity_name": "treetraverse", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 7.0, + "description": "treetraverse uses an llm to navigate the document s tree structure", + "source_ids": [ + 148 + ] + }, + { + "src_entity_name": "graphranker", + "tgt_entity_name": "hipporag", + 
"relation_name": "", + "weight": 9.0, + "description": "graphranker is extended from hipporag", + "source_ids": [ + 148 + ] + }, + { + "src_entity_name": "graphranker", + "tgt_entity_name": "personalized pagerank", + "relation_name": "", + "weight": 9.0, + "description": "graphranker applies personalized pagerank to rank relevant nodes", + "source_ids": [ + 148 + ] + }, + { + "src_entity_name": "pageindex", + "tgt_entity_name": "page 39", + "relation_name": "", + "weight": 5.0, + "description": "pageindex is referenced in citation page 39", + "source_ids": [ + 148 + ] + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "page 47", + "relation_name": "", + "weight": 5.0, + "description": "docetl is referenced in citation page 47", + "source_ids": [ + 148 + ] + }, + { + "src_entity_name": "hipporag", + "tgt_entity_name": "page 19", + "relation_name": "", + "weight": 5.0, + "description": "hipporag is referenced in citation page 19", + "source_ids": [ + 148 + ] + }, + { + "src_entity_name": "personalized pagerank", + "tgt_entity_name": "page 20", + "relation_name": "", + "weight": 5.0, + "description": "personalized pagerank is referenced in citation page 20", + "source_ids": [ + 148 + ] + } + ], + "node_idx": 148 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_149.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_149.json new file mode 100644 index 0000000..4d6d3ea --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_149.json @@ -0,0 +1,237 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a system or method being compared against baseline methods in the text", + "source_ids": [ + 149 + ] + }, + { + "entity_name": "qwen family", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "the qwen family refers to a set of state of the art backbone models used to power bookrag and baseline methods", + "source_ids": [ + 149 + ] + }, 
+ { + "entity_name": "mineru", + "entity_type": "SOFTWARE", + "description": "mineru is a tool employed for robust document layout parsing", + "source_ids": [ + 149 + ] + }, + { + "entity_name": "github com sam234990 bookrag", + "entity_type": "LOCATION", + "description": "github com sam234990 bookrag is the url where source code prompts and configurations for bookrag are available", + "source_ids": [ + 149 + ] + }, + { + "entity_name": "0 6", + "entity_type": "MEASUREMENT", + "description": "0 6 is the threshold value set for the gradient g in the implementation details", + "source_ids": [ + 149 + ] + }, + { + "entity_name": "technical report", + "entity_type": "PUBLICATION_VENUE", + "description": "the technical report is a document containing more details about the implementation referenced as 57", + "source_ids": [ + 149 + ] + }, + { + "entity_name": "gradient g", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "gradient g is a parameter with a threshold set to 0 6 in the implementation details", + "source_ids": [ + 149 + ] + }, + { + "entity_name": "appendix", + "entity_type": "SECTION_TITLE", + "description": "the appendix is a section of the technical report where more details are provided", + "source_ids": [ + 149 + ] + }, + { + "entity_name": "prompts", + "entity_type": "TASK_OR_PROBLEM", + "description": "prompts are specific instructions or inputs used in the bookrag system available on github", + "source_ids": [ + 149 + ] + }, + { + "entity_name": "detailed configurations", + "entity_type": "TASK_OR_PROBLEM", + "description": "detailed configurations are specific settings for the bookrag system available on github", + "source_ids": [ + 149 + ] + }, + { + "entity_name": "state of theart", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "state of theart describes the quality of the backbone models used in the comparison", + "source_ids": [ + 149 + ] + }, + { + "entity_name": "robust document layout parsing", + "entity_type": 
"TASK_OR_PROBLEM", + "description": "robust document layout parsing is the specific task performed by mineru", + "source_ids": [ + 149 + ] + }, + { + "entity_name": "fair comparison", + "entity_type": "TASK_OR_PROBLEM", + "description": "fair comparison is the goal of the experimental setup described in the text", + "source_ids": [ + 149 + ] + }, + { + "entity_name": "baseline methods", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 149 + ] + }, + { + "entity_name": "implementation details", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 149 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "qwen family", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is powered by backbone models from the qwen family", + "source_ids": [ + 149 + ] + }, + { + "src_entity_name": "baseline methods", + "tgt_entity_name": "qwen family", + "relation_name": "", + "weight": 9.0, + "description": "baseline methods are also powered by backbone models from the qwen family", + "source_ids": [ + 149 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "mineru", + "relation_name": "", + "weight": 8.0, + "description": "bookrag employs mineru for document layout parsing", + "source_ids": [ + 149 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "github com sam234990 bookrag", + "relation_name": "", + "weight": 10.0, + "description": "the source code and configurations for bookrag are available at the specified github location", + "source_ids": [ + 149 + ] + }, + { + "src_entity_name": "implementation details", + "tgt_entity_name": "technical report", + "relation_name": "", + "weight": 7.0, + "description": "more details about the implementation are provided in the technical report", + "source_ids": [ + 149 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "gradient g", + "relation_name": "", + "weight": 8.0, + "description": "bookrag s implementation 
sets the threshold of gradient g as 0 6", + "source_ids": [ + 149 + ] + }, + { + "src_entity_name": "technical report", + "tgt_entity_name": "appendix", + "relation_name": "", + "weight": 9.0, + "description": "the appendix is a section within the technical report containing more details", + "source_ids": [ + 149 + ] + }, + { + "src_entity_name": "github com sam234990 bookrag", + "tgt_entity_name": "prompts", + "relation_name": "", + "weight": 10.0, + "description": "prompts are available at the specified github location", + "source_ids": [ + 149 + ] + }, + { + "src_entity_name": "github com sam234990 bookrag", + "tgt_entity_name": "detailed configurations", + "relation_name": "", + "weight": 10.0, + "description": "detailed configurations are available at the specified github location", + "source_ids": [ + 149 + ] + }, + { + "src_entity_name": "mineru", + "tgt_entity_name": "robust document layout parsing", + "relation_name": "", + "weight": 10.0, + "description": "mineru is employed to perform robust document layout parsing", + "source_ids": [ + 149 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "baseline methods", + "relation_name": "", + "weight": 9.0, + "description": "bookrag and baseline methods are compared fairly using the same backbone models", + "source_ids": [ + 149 + ] + } + ], + "node_idx": 149 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_15.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_15.json new file mode 100644 index 0000000..e159da8 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_15.json @@ -0,0 +1,367 @@ +{ + "entities": [ + { + "entity_name": "rag", + "entity_type": "TECHNOLOGY", + "description": "rag refers to retrieval augmented generation approaches for document level qa mentioned in the text", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "ocr", + "entity_type": "TECHNOLOGY", + "description": "ocr stands for optical character 
recognition a technology used to convert documents into plain text", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "graph based rag", + "entity_type": "TECHNOLOGY", + "description": "graph based rag is a text based rag method that uses graph data as an external knowledge source", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "graphrag", + "entity_type": "PRODUCT", + "description": "graphrag is a representative method that constructs a knowledge graph from a textual corpus", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "raptor", + "entity_type": "PRODUCT", + "description": "raptor is a representative method that builds a recursive tree structure by clustering document chunks", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "leiden community detection algorithm", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the leiden community detection algorithm is used by graphrag to obtain hierarchical clusters", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "figure 1", + "entity_type": "IMAGE", + "description": "figure 1 illustrates the two paradigms of existing rag approaches for document level qa", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "table 1", + "entity_type": "TABLE", + "description": "table 1 lists two representative methods graphrag and raptor", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "document level qa", + "entity_type": "TASK_OR_PROBLEM", + "description": "document level qa is the specific task for which existing rag approaches are designed", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "plain text", + "entity_type": "MATERIAL", + "description": "plain text is the output format produced by ocr when converting documents", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "text based rag method", + "entity_type": "TECHNOLOGY", + "description": "text based rag methods are a category of approaches applied after ocr conversion", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "graph 
data", + "entity_type": "DATASET_OR_CORPUS", + "description": "graph data serves as an external knowledge source capturing semantic information and relational structures", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "knowledge graph", + "entity_type": "DATASET_OR_CORPUS", + "description": "a knowledge graph kg is constructed from a textual corpus by graphrag", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "textual corpus", + "entity_type": "DATASET_OR_CORPUS", + "description": "a textual corpus is the source material from which graphrag constructs a knowledge graph", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "hierarchical clusters", + "entity_type": "TASK_OR_PROBLEM", + "description": "hierarchical clusters are the result of applying the leiden community detection algorithm", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "summaries", + "entity_type": "PRODUCT", + "description": "summaries are generated for each community to provide a global overview of the corpus", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "recursive tree structure", + "entity_type": "TASK_OR_PROBLEM", + "description": "a recursive tree structure is built by raptor through iterative clustering and summarization", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "document chunks", + "entity_type": "DATASET_OR_CORPUS", + "description": "document chunks are the units iteratively clustered by raptor", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "fine grained semantic information", + "entity_type": "CONCEPT", + "description": "fine grained semantic information is a type of data captured by raptor across the corpus", + "source_ids": [ + 15 + ] + }, + { + "entity_name": "high level semantic information", + "entity_type": "CONCEPT", + "description": "high level semantic information is a type of data captured by raptor across the corpus", + "source_ids": [ + 15 + ] + } + ], + "relations": [ + { + "src_entity_name": "rag", + "tgt_entity_name": "ocr", 
+ "relation_name": "", + "weight": 8.0, + "description": "rag approaches generally rely on ocr to convert documents into plain text before application", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "rag", + "tgt_entity_name": "graph based rag", + "relation_name": "", + "weight": 9.0, + "description": "state of the art rag methods increasingly adopt graph based rag approaches", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "graphrag", + "tgt_entity_name": "leiden community detection algorithm", + "relation_name": "", + "weight": 10.0, + "description": "graphrag applies the leiden community detection algorithm to obtain hierarchical clusters", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "graphrag", + "tgt_entity_name": "table 1", + "relation_name": "", + "weight": 9.0, + "description": "graphrag is listed as a representative method in table 1", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "raptor", + "tgt_entity_name": "table 1", + "relation_name": "", + "weight": 9.0, + "description": "raptor is listed as a representative method in table 1", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "figure 1", + "tgt_entity_name": "rag", + "relation_name": "", + "weight": 8.0, + "description": "figure 1 illustrates the existing rag approaches for document level qa", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "rag", + "tgt_entity_name": "document level qa", + "relation_name": "", + "weight": 10.0, + "description": "rag approaches are designed for document level qa tasks", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "ocr", + "tgt_entity_name": "plain text", + "relation_name": "", + "weight": 10.0, + "description": "ocr converts documents into plain text", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "text based rag method", + "tgt_entity_name": "graph based rag", + "relation_name": "", + "weight": 9.0, + "description": "graph based rag is a specific type of text based rag method", + 
"source_ids": [ + 15 + ] + }, + { + "src_entity_name": "graph based rag", + "tgt_entity_name": "graph data", + "relation_name": "", + "weight": 9.0, + "description": "graph based rag uses graph data as an external knowledge source", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "graphrag", + "tgt_entity_name": "knowledge graph", + "relation_name": "", + "weight": 10.0, + "description": "graphrag constructs a knowledge graph from a textual corpus", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "graphrag", + "tgt_entity_name": "textual corpus", + "relation_name": "", + "weight": 9.0, + "description": "graphrag uses a textual corpus as the source for constructing a knowledge graph", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "leiden community detection algorithm", + "tgt_entity_name": "hierarchical clusters", + "relation_name": "", + "weight": 10.0, + "description": "the leiden community detection algorithm produces hierarchical clusters", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "graphrag", + "tgt_entity_name": "summaries", + "relation_name": "", + "weight": 9.0, + "description": "graphrag generates summaries for each community", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "raptor", + "tgt_entity_name": "recursive tree structure", + "relation_name": "", + "weight": 10.0, + "description": "raptor builds a recursive tree structure", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "raptor", + "tgt_entity_name": "document chunks", + "relation_name": "", + "weight": 10.0, + "description": "raptor iteratively clusters document chunks", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "raptor", + "tgt_entity_name": "fine grained semantic information", + "relation_name": "", + "weight": 9.0, + "description": "raptor captures fine grained semantic information across the corpus", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "raptor", + "tgt_entity_name": "high level semantic 
information", + "relation_name": "", + "weight": 9.0, + "description": "raptor captures high level semantic information across the corpus", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "table 1", + "tgt_entity_name": "graphrag", + "relation_name": "", + "weight": 9.0, + "description": "table 1 lists graphrag as a representative method", + "source_ids": [ + 15 + ] + }, + { + "src_entity_name": "table 1", + "tgt_entity_name": "raptor", + "relation_name": "", + "weight": 9.0, + "description": "table 1 lists raptor as a representative method", + "source_ids": [ + 15 + ] + } + ], + "node_idx": 15 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_150.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_150.json new file mode 100644 index 0000000..368566d --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_150.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "6.2 overall results", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Experiments' within the BookRAG paper, this section presents the aggregate performance metrics comparing the proposed method against baseline approaches on document QA tasks.", + "source_ids": [ + 150 + ] + } + ], + "relations": [], + "node_idx": 150 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_151.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_151.json new file mode 100644 index 0000000..db44e4f --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_151.json @@ -0,0 +1,115 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a system being evaluated for its qa performance retrieval effectiveness and query efficiency", + "source_ids": [ + 151 + ] + }, + { + "entity_name": "state of the art baselines", + "entity_type": "PRODUCT", + "description": "state of the art baselines are the existing systems used for 
comparison in the evaluation of bookrag", + "source_ids": [ + 151 + ] + }, + { + "entity_name": "qa", + "entity_type": "TASK_OR_PROBLEM", + "description": "qa refers to the question answering task being evaluated for performance", + "source_ids": [ + 151 + ] + }, + { + "entity_name": "retrieval", + "entity_type": "TASK_OR_PROBLEM", + "description": "retrieval refers to the effectiveness of retrieving information which is being evaluated", + "source_ids": [ + 151 + ] + }, + { + "entity_name": "query efficiency", + "entity_type": "TASK_OR_PROBLEM", + "description": "query efficiency is a metric being analyzed to determine the system s performance", + "source_ids": [ + 151 + ] + }, + { + "entity_name": "evaluation", + "entity_type": "EVENT", + "description": "evaluation is the comprehensive process of analyzing bookrag s performance described in the text", + "source_ids": [ + 151 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "state of the art baselines", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is being compared against state of the art baselines to analyze its performance", + "source_ids": [ + 151 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "qa", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is being analyzed for its complex qa performance", + "source_ids": [ + 151 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrieval", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is being analyzed for its retrieval effectiveness", + "source_ids": [ + 151 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "query efficiency", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is being analyzed for its query efficiency", + "source_ids": [ + 151 + ] + }, + { + "src_entity_name": "evaluation", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "the evaluation is the 
process being conducted on bookrag", + "source_ids": [ + 151 + ] + }, + { + "src_entity_name": "evaluation", + "tgt_entity_name": "state of the art baselines", + "relation_name": "", + "weight": 8.0, + "description": "the evaluation involves comparing bookrag to state of the art baselines", + "source_ids": [ + 151 + ] + } + ], + "node_idx": 151 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_152.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_152.json new file mode 100644 index 0000000..5659d00 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_152.json @@ -0,0 +1,417 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a system compared against baselines that achieves state of the art qa performance", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "table 5", + "entity_type": "TABLE", + "description": "table 5 is the location where the comparison of qa performance between bookrag and baselines is shown", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "m3docvqa", + "entity_type": "DATASET_OR_CORPUS", + "description": "m3docvqa is a dataset used to evaluate the exact match performance of bookrag", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "layout vanilla", + "entity_type": "PRODUCT", + "description": "layout vanilla is a baseline method that consistently outperforms vanilla rag", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "vanilla rag", + "entity_type": "PRODUCT", + "description": "vanilla rag is a baseline method that is outperformed by layout vanilla", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "tree traverse", + "entity_type": "PRODUCT", + "description": "tree traverse is a method highlighted for having suboptimal results due to limitations in hierarchical navigation", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "graphranker", + "entity_type": "PRODUCT", + "description": 
"graphranker is a method highlighted for having suboptimal results due to limitations in graph based reasoning", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "tree graph bookindex", + "entity_type": "PRODUCT", + "description": "tree graph bookindex is a component of bookrag that contributes to its superior performance", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "agent based planning", + "entity_type": "PRODUCT", + "description": "agent based planning is a component of bookrag that contributes to its superior performance", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "18 0", + "entity_type": "PERCENTAGE", + "description": "18 0 is the margin by which bookrag outperforms the top performing baseline in exact match on m3docvqa", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "qa performance", + "entity_type": "TASK_OR_PROBLEM", + "description": "qa performance is the specific task being evaluated and compared in the text", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "exact match", + "entity_type": "EVALUATION_METRIC", + "description": "exact match is the metric used to measure the performance of bookrag against baselines", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "hierarchical navigation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "hierarchical navigation is a method used by tree traverse that is noted for missing cross sectional context", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "graph based reasoning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "graph based reasoning is a method used by graphranker that is noted for drifting into irrelevant scopes", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "context fragmentation", + "entity_type": "TASK_OR_PROBLEM", + "description": "context fragmentation is a limitation of existing baselines that bookrag overcomes", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "static query workflow", + "entity_type": 
"TASK_OR_PROBLEM", + "description": "static query workflow is a limitation of existing baselines that bookrag overcomes", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "cross sectional context", + "entity_type": "CONCEPT", + "description": "cross sectional context is information often missed by methods relying solely on hierarchical navigation", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "irrelevant scopes", + "entity_type": "CONCEPT", + "description": "irrelevant scopes are areas that methods relying solely on graph based reasoning may drift into", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "retrieval", + "entity_type": "TASK_OR_PROBLEM", + "description": "retrieval is the process of finding evidence which is improved by layout parsing", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "generation is the process of creating output which is made accurate by bookrag", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "queries", + "entity_type": "CONCEPT", + "description": "queries are inputs that bookrag effectively classifies to configure optimal workflows", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "workflows", + "entity_type": "CONCEPT", + "description": "workflows are configured by bookrag to ensure precise evidence retrieval and accurate generation", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "baselines", + "entity_type": "PRODUCT", + "description": "baselines are the three categories of methods against which bookrag is compared", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "top performing baseline", + "entity_type": "PRODUCT", + "description": "top performing baseline is the specific baseline that bookrag substantially outperforms", + "source_ids": [ + 152 + ] + }, + { + "entity_name": "existing baselines", + "entity_type": "PRODUCT", + "description": "existing baselines are the methods that suffer from context 
fragmentation and static query workflows", + "source_ids": [ + 152 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "table 5", + "relation_name": "", + "weight": 9.0, + "description": "bookrag s qa performance is presented and compared in table 5", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "m3docvqa", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is evaluated on the m3docvqa dataset where it achieves a specific performance margin", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "18 0", + "relation_name": "", + "weight": 10.0, + "description": "bookrag outperforms the top baseline by 18 0 on the m3docvqa dataset", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "layout vanilla", + "tgt_entity_name": "vanilla rag", + "relation_name": "", + "weight": 9.0, + "description": "layout vanilla consistently outperforms vanilla rag in the comparison", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "tree graph bookindex", + "relation_name": "", + "weight": 10.0, + "description": "bookrag s superiority stems from the synergy of its unified tree graph bookindex", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "agent based planning", + "relation_name": "", + "weight": 10.0, + "description": "bookrag s superiority stems from the synergy of its agent based planning", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "tree traverse", + "tgt_entity_name": "graphranker", + "relation_name": "", + "weight": 7.0, + "description": "both tree traverse and graphranker are highlighted for having suboptimal results due to similar limitations", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "qa performance", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is the subject of the qa 
performance evaluation described in the text", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "exact match", + "relation_name": "", + "weight": 9.0, + "description": "bookrag s performance is measured using the exact match metric on m3docvqa", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrieval", + "relation_name": "", + "weight": 9.0, + "description": "bookrag ensures precise evidence retrieval by overcoming limitations of existing baselines", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "generation", + "relation_name": "", + "weight": 9.0, + "description": "bookrag ensures accurate generation by overcoming limitations of existing baselines", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "queries", + "relation_name": "", + "weight": 9.0, + "description": "bookrag effectively classifies queries to configure optimal workflows", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "workflows", + "relation_name": "", + "weight": 9.0, + "description": "bookrag configures optimal workflows to improve retrieval and generation", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "layout vanilla", + "tgt_entity_name": "retrieval", + "relation_name": "", + "weight": 8.0, + "description": "layout vanilla preserves essential structural information for better retrieval", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "tree traverse", + "tgt_entity_name": "hierarchical navigation", + "relation_name": "", + "weight": 9.0, + "description": "tree traverse relies on hierarchical navigation which leads to suboptimal results", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "graphranker", + "tgt_entity_name": "graph based reasoning", + "relation_name": "", + "weight": 9.0, + "description": "graphranker relies on graph based reasoning which 
leads to suboptimal results", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "tree traverse", + "tgt_entity_name": "cross sectional context", + "relation_name": "", + "weight": 8.0, + "description": "tree traverse often misses cross sectional context due to its reliance on hierarchical navigation", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "graphranker", + "tgt_entity_name": "irrelevant scopes", + "relation_name": "", + "weight": 8.0, + "description": "graphranker often drifts into irrelevant scopes due to its reliance on graph based reasoning", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "context fragmentation", + "relation_name": "", + "weight": 9.0, + "description": "bookrag overcomes the limitation of context fragmentation found in existing baselines", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "static query workflow", + "relation_name": "", + "weight": 9.0, + "description": "bookrag overcomes the limitation of static query workflow found in existing baselines", + "source_ids": [ + 152 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "top performing baseline", + "relation_name": "", + "weight": 10.0, + "description": "bookrag substantially outperforms the top performing baseline by 18 0", + "source_ids": [ + 152 + ] + } + ], + "node_idx": 152 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_153.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_153.json new file mode 100644 index 0000000..1a11591 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_153.json @@ -0,0 +1,229 @@ +{ + "entities": [ + { + "entity_name": "table 5", + "entity_type": "TABLE", + "description": "table 5 is a performance comparison table showing results of different methods on document qa tasks", + "source_ids": [ + 153 + ] + }, + { + "entity_name": "performance comparison", + 
"entity_type": "TASK_OR_PROBLEM", + "description": "performance comparison refers to the evaluation of different methods across various datasets", + "source_ids": [ + 153 + ] + }, + { + "entity_name": "different methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "different methods are the various approaches being compared in the table for solving document qa tasks", + "source_ids": [ + 153 + ] + }, + { + "entity_name": "datasets", + "entity_type": "DATASET_OR_CORPUS", + "description": "datasets are the various collections of data used to evaluate the performance of the methods", + "source_ids": [ + 153 + ] + }, + { + "entity_name": "complex document qa tasks", + "entity_type": "TASK_OR_PROBLEM", + "description": "complex document qa tasks are the specific problems being solved by the methods in the comparison", + "source_ids": [ + 153 + ] + }, + { + "entity_name": "best results", + "entity_type": "EVALUATION_METRIC", + "description": "best results refer to the top performing outcomes marked in bold in the table", + "source_ids": [ + 153 + ] + }, + { + "entity_name": "second best results", + "entity_type": "EVALUATION_METRIC", + "description": "second best results refer to the runner up outcomes marked in underlined in the table", + "source_ids": [ + 153 + ] + }, + { + "entity_name": "bold", + "entity_type": "COLOR", + "description": "bold refers to the text formatting style used to mark the best results in the table", + "source_ids": [ + 153 + ] + }, + { + "entity_name": "underlined", + "entity_type": "SHAPE", + "description": "underlined refers to the text formatting style used to mark the second best results in the table", + "source_ids": [ + 153 + ] + } + ], + "relations": [ + { + "src_entity_name": "table 5", + "tgt_entity_name": "performance comparison", + "relation_name": "", + "weight": 10.0, + "description": "table 5 presents the performance comparison of different methods", + "source_ids": [ + 153 + ] + }, + { + "src_entity_name": "table 
5", + "tgt_entity_name": "different methods", + "relation_name": "", + "weight": 10.0, + "description": "table 5 compares the performance of different methods", + "source_ids": [ + 153 + ] + }, + { + "src_entity_name": "table 5", + "tgt_entity_name": "datasets", + "relation_name": "", + "weight": 10.0, + "description": "table 5 evaluates methods across various datasets", + "source_ids": [ + 153 + ] + }, + { + "src_entity_name": "table 5", + "tgt_entity_name": "complex document qa tasks", + "relation_name": "", + "weight": 10.0, + "description": "table 5 focuses on solving complex document qa tasks", + "source_ids": [ + 153 + ] + }, + { + "src_entity_name": "table 5", + "tgt_entity_name": "best results", + "relation_name": "", + "weight": 9.0, + "description": "table 5 marks the best results in bold", + "source_ids": [ + 153 + ] + }, + { + "src_entity_name": "table 5", + "tgt_entity_name": "second best results", + "relation_name": "", + "weight": 9.0, + "description": "table 5 marks the second best results in underlined", + "source_ids": [ + 153 + ] + }, + { + "src_entity_name": "performance comparison", + "tgt_entity_name": "different methods", + "relation_name": "", + "weight": 8.0, + "description": "the performance comparison involves evaluating different methods", + "source_ids": [ + 153 + ] + }, + { + "src_entity_name": "performance comparison", + "tgt_entity_name": "datasets", + "relation_name": "", + "weight": 8.0, + "description": "the performance comparison is conducted across various datasets", + "source_ids": [ + 153 + ] + }, + { + "src_entity_name": "performance comparison", + "tgt_entity_name": "complex document qa tasks", + "relation_name": "", + "weight": 8.0, + "description": "the performance comparison is aimed at solving complex document qa tasks", + "source_ids": [ + 153 + ] + }, + { + "src_entity_name": "different methods", + "tgt_entity_name": "complex document qa tasks", + "relation_name": "", + "weight": 9.0, + "description": "different 
methods are used to solve complex document qa tasks", + "source_ids": [ + 153 + ] + }, + { + "src_entity_name": "datasets", + "tgt_entity_name": "complex document qa tasks", + "relation_name": "", + "weight": 8.0, + "description": "datasets are used to evaluate methods for complex document qa tasks", + "source_ids": [ + 153 + ] + }, + { + "src_entity_name": "best results", + "tgt_entity_name": "bold", + "relation_name": "", + "weight": 10.0, + "description": "the best results are identified by being marked in bold", + "source_ids": [ + 153 + ] + }, + { + "src_entity_name": "second best results", + "tgt_entity_name": "underlined", + "relation_name": "", + "weight": 10.0, + "description": "the second best results are identified by being marked as underlined", + "source_ids": [ + 153 + ] + }, + { + "src_entity_name": "table 5", + "tgt_entity_name": "bold", + "relation_name": "", + "weight": 9.0, + "description": "table 5 uses bold formatting to highlight specific results", + "source_ids": [ + 153 + ] + }, + { + "src_entity_name": "table 5", + "tgt_entity_name": "underlined", + "relation_name": "", + "weight": 9.0, + "description": "table 5 uses underlined formatting to highlight specific results", + "source_ids": [ + 153 + ] + } + ], + "node_idx": 153 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_154.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_154.json new file mode 100644 index 0000000..33da272 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_154.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "table: cref='#/texts/156'...", + "entity_type": "TABLE", + "description": "A data table described as: cref='#/texts/156'", + "source_ids": [ + 154 + ] + }, + { + "entity_name": "cref", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "A reference identifier or cross-reference key found in the description text, pointing to a specific text location ('#/texts/156').", + 
"source_ids": [ + 154 + ] + } + ], + "relations": [ + { + "src_entity_name": "table: cref='#/texts/156'...", + "tgt_entity_name": "cref", + "relation_name": "", + "weight": 9.0, + "description": "Table 'Table: cref='#/texts/156'...' contains data about 'cref'.", + "source_ids": [ + 154 + ] + } + ], + "node_idx": 154 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_155.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_155.json new file mode 100644 index 0000000..558147e --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_155.json @@ -0,0 +1,61 @@ +{ + "entities": [ + { + "entity_name": "table 6", + "entity_type": "TABLE", + "description": "table 6 is a table presenting a comparison of retrieval recall among layout based methods", + "source_ids": [ + 155 + ] + }, + { + "entity_name": "retrieval recall", + "entity_type": "EVALUATION_METRIC", + "description": "retrieval recall is the metric being compared among the layout based methods in the text", + "source_ids": [ + 155 + ] + }, + { + "entity_name": "layout based methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "layout based methods are the techniques being evaluated for their retrieval recall performance", + "source_ids": [ + 155 + ] + } + ], + "relations": [ + { + "src_entity_name": "table 6", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 10.0, + "description": "table 6 displays the comparison results for the retrieval recall metric", + "source_ids": [ + 155 + ] + }, + { + "src_entity_name": "table 6", + "tgt_entity_name": "layout based methods", + "relation_name": "", + "weight": 10.0, + "description": "table 6 compares the performance of various layout based methods", + "source_ids": [ + 155 + ] + }, + { + "src_entity_name": "retrieval recall", + "tgt_entity_name": "layout based methods", + "relation_name": "", + "weight": 9.0, + "description": "retrieval recall is the specific metric used to 
evaluate the layout based methods", + "source_ids": [ + 155 + ] + } + ], + "node_idx": 155 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_156.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_156.json new file mode 100644 index 0000000..2f1daeb --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_156.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "cref='#/texts/158'", + "entity_type": "TABLE", + "description": "A table entity identified by the reference string provided in the description, representing a specific text section or data block.", + "source_ids": [ + 156 + ] + } + ], + "relations": [], + "node_idx": 156 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_157.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_157.json new file mode 100644 index 0000000..df85f45 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_157.json @@ -0,0 +1,381 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a retrieval system evaluated for its performance against other baselines", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "m3docvqa", + "entity_type": "DATASET_OR_CORPUS", + "description": "m3docvqa is a dataset used to evaluate the retrieval recall of bookrag", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "graphranker", + "entity_type": "PRODUCT", + "description": "graphranker is a layout based baseline system compared against bookrag", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "ift inspired selector reasoner workflow", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the ift inspired selector reasoner workflow is the process used by bookrag to classify queries and analyze information", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "agent based planning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "agent 
based planning is a component of the workflow that classifies the query", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "selector", + "entity_type": "SOFTWARE", + "description": "the selector is a component that narrows the search to a precise information patch", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "reasoner", + "entity_type": "SOFTWARE", + "description": "the reasoner is a component that performs analysis on the selected information", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "skyline ranker", + "entity_type": "SOFTWARE", + "description": "skyline ranker is a process that retains a specific number of nodes after analysis", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "71 2", + "entity_type": "PERCENTAGE", + "description": "71 2 is the retrieval recall achieved by bookrag on the m3docvqa dataset", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "44 5", + "entity_type": "PERCENTAGE", + "description": "44 5 is the maximum retrieval recall achieved by the graphranker baseline", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "9 87", + "entity_type": "MEASUREMENT", + "description": "9 87 is the average number of retained nodes on one of the three datasets after the skyline ranker process", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "6 86", + "entity_type": "MEASUREMENT", + "description": "6 86 is the average number of retained nodes on another of the three datasets after the skyline ranker process", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "8 6", + "entity_type": "MEASUREMENT", + "description": "8 6 is the average number of retained nodes on the third dataset after the skyline ranker process", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "10", + "entity_type": "MEASUREMENT", + "description": "10 is the value of k used in the standard top k setting for comparison", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "retrieval performance", + "entity_type": "TASK_OR_PROBLEM", + 
"description": "retrieval performance is the specific metric being evaluated to validate the retrieval design of bookrag", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "retrieval recall", + "entity_type": "EVALUATION_METRIC", + "description": "retrieval recall is the specific performance metric used to compare bookrag against other baselines", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "ground truth layout blocks", + "entity_type": "DATASET_OR_CORPUS", + "description": "ground truth layout blocks are the reference data used to evaluate the retrieval recall", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "layout based baselines", + "entity_type": "PRODUCT", + "description": "layout based baselines are the group of systems against which bookrag is compared", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "query", + "entity_type": "TASK_OR_PROBLEM", + "description": "the query is the input that is classified by the agent based planning component", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "information patch", + "entity_type": "TASK_OR_PROBLEM", + "description": "the information patch is the precise data segment targeted by the selector component", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "candidate size", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "candidate size is the variable representing the number of candidates which is kept from inflating by the skyline ranker process", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "three datasets", + "entity_type": "DATASET_OR_CORPUS", + "description": "three datasets are the collective group of data used to measure the average number of retained nodes", + "source_ids": [ + 157 + ] + }, + { + "entity_name": "standard top k setting", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the standard top k setting is the baseline configuration used for comparison with the skyline ranker results", + "source_ids": [ + 157 + ] + } + ], + 
"relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "m3docvqa", + "relation_name": "", + "weight": 10.0, + "description": "bookrag achieves a 71 2 recall on the m3docvqa dataset", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "graphranker", + "relation_name": "", + "weight": 9.0, + "description": "bookrag significantly outperforms graphranker in retrieval recall", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "ift inspired selector reasoner workflow", + "relation_name": "", + "weight": 10.0, + "description": "the performance advantage of bookrag stems from its ift inspired selector reasoner workflow", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "ift inspired selector reasoner workflow", + "tgt_entity_name": "agent based planning", + "relation_name": "", + "weight": 9.0, + "description": "the workflow includes agent based planning which classifies the query", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "ift inspired selector reasoner workflow", + "tgt_entity_name": "selector", + "relation_name": "", + "weight": 9.0, + "description": "the workflow uses the selector to narrow the search to a precise information patch", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "ift inspired selector reasoner workflow", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 9.0, + "description": "the workflow uses the reasoner for analysis after the selector", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "9 87", + "relation_name": "", + "weight": 8.0, + "description": "the skyline ranker process results in an average of 9 87 retained nodes on one dataset", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "6 86", + "relation_name": "", + "weight": 8.0, + "description": "the skyline ranker process results in an average of 6 86 
retained nodes on another dataset", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "8 6", + "relation_name": "", + "weight": 8.0, + "description": "the skyline ranker process results in an average of 8 6 retained nodes on the third dataset", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "10", + "relation_name": "", + "weight": 7.0, + "description": "the number of retained nodes by skyline ranker is comparable to the standard top k setting where k 10", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrieval performance", + "relation_name": "", + "weight": 9.0, + "description": "bookrag s retrieval performance is the subject of the validation described in the text", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 10.0, + "description": "bookrag s retrieval recall is the specific metric measured to demonstrate its performance", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "ground truth layout blocks", + "relation_name": "", + "weight": 8.0, + "description": "bookrag is evaluated against ground truth layout blocks to validate its design", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "layout based baselines", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is evaluated against layout based baselines to demonstrate its superiority", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "ift inspired selector reasoner workflow", + "tgt_entity_name": "query", + "relation_name": "", + "weight": 8.0, + "description": "the workflow s agent based planning component classifies the query", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "information patch", + "relation_name": "", + 
"weight": 9.0, + "description": "the selector narrows the search to a precise information patch", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "candidate size", + "relation_name": "", + "weight": 8.0, + "description": "the skyline ranker process ensures high quality retrieval without inflating the candidate size", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "three datasets", + "relation_name": "", + "weight": 9.0, + "description": "the average number of retained nodes by skyline ranker is measured across three datasets", + "source_ids": [ + 157 + ] + }, + { + "src_entity_name": "skyline ranker", + "tgt_entity_name": "standard top k setting", + "relation_name": "", + "weight": 8.0, + "description": "the results of the skyline ranker process are compared to the standard top k setting", + "source_ids": [ + 157 + ] + } + ], + "node_idx": 157 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_158.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_158.json new file mode 100644 index 0000000..2efafbc --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_158.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "figure 5", + "entity_type": "IMAGE", + "description": "figure 5 is an image in the text that presents a comparison of query efficiency", + "source_ids": [ + 158 + ] + }, + { + "entity_name": "query efficiency", + "entity_type": "EVALUATION_METRIC", + "description": "query efficiency is a metric being compared in the text", + "source_ids": [ + 158 + ] + } + ], + "relations": [ + { + "src_entity_name": "figure 5", + "tgt_entity_name": "query efficiency", + "relation_name": "", + "weight": 9.0, + "description": "figure 5 displays a comparison of the query efficiency metric", + "source_ids": [ + 158 + ] + } + ], + "node_idx": 158 +} \ No newline at end of file diff --git 
a/e2e_test_output/kg_extractor_res/kg_extractor_res_159.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_159.json new file mode 100644 index 0000000..bcd7443 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_159.json @@ -0,0 +1,357 @@ +{ + "entities": [ + { + "entity_name": "figure 5", + "entity_type": "IMAGE", + "description": "A figure comparing the query efficiency of various RAG (Retrieval-Augmented Generation) methods across three datasets, displaying Query Time and Token cost metrics.", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "bm25", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A classical probabilistic ranking function used for information retrieval, listed in the chart legend.", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "vanilla rag", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The baseline Retrieval-Augmented Generation model without additional enhancements, listed in the chart legend.", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "layout + vanilla", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A variant of the vanilla RAG method that incorporates layout information, listed in the chart legend.", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "raptor", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Recursive Abstractive Processing for Tree-Organized Retrieval, a specific RAG approach listed in the chart legend.", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "graphrag-local", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A local graph-based retrieval method, listed in the chart legend.", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "graphrag-global", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A global graph-based retrieval method, listed in the chart legend.", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "mm-vanilla", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A 
multi-modal vanilla RAG baseline, listed in the chart legend.", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "tree-traverse", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A tree-traversal based retrieval or processing method, listed in the chart legend.", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "graphranker", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A ranking method utilizing graph structures, listed in the chart legend.", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "docetl", + "entity_type": "SOFTWARE", + "description": "Document Extraction, Transformation, and Loading tool, listed in the chart legend.", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "bookrag", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A RAG system specifically designed or optimized for book content, listed in the chart legend.", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "mmlongbench", + "entity_type": "DATASET_OR_CORPUS", + "description": "The first dataset evaluated in the comparison, labeled under section (a).", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "m3docvqa", + "entity_type": "DATASET_OR_CORPUS", + "description": "The second dataset evaluated in the comparison, labeled under section (b).", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "qasper", + "entity_type": "DATASET_OR_CORPUS", + "description": "The third dataset evaluated in the comparison, labeled under section (c).", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "query time", + "entity_type": "EVALUATION_METRIC", + "description": "A performance metric measuring the time taken to process a query, displayed on the x-axis of the left charts.", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "token cost", + "entity_type": "EVALUATION_METRIC", + "description": "A performance metric measuring the number of tokens consumed, displayed on the x-axis of the right charts.", + "source_ids": [ + 159 + ] + }, + { + 
"entity_name": "time (s)", + "entity_type": "MEASUREMENT", + "description": "The unit of measurement for the y-axis in the Query Time charts, representing seconds.", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "token (m)", + "entity_type": "MEASUREMENT", + "description": "The unit of measurement for the y-axis in the Token cost charts, representing millions of tokens.", + "source_ids": [ + 159 + ] + }, + { + "entity_name": "image cref='#/texts/161'", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 159 + ] + } + ], + "relations": [ + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "figure 5", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to Figure 5", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "bm25", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to BM25", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "vanilla rag", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to Vanilla RAG", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "layout + vanilla", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to Layout + Vanilla", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "raptor", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to RAPTOR", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "graphrag-local", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to GraphRAG-Local", + "source_ids": [ + 
159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "graphrag-global", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to GraphRAG-Global", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "mm-vanilla", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to MM-Vanilla", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "tree-traverse", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to Tree-Traverse", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "graphranker", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to GraphRanker", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "docetl", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to DocETL", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to BookRAG", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to MMLongBench", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "m3docvqa", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to M3DocVQA", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + 
"tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to Qasper", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "query time", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to Query Time", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "token cost", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to Token cost", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "time (s)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to time (s)", + "source_ids": [ + 159 + ] + }, + { + "src_entity_name": "image cref='#/texts/161'", + "tgt_entity_name": "token (m)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/161' related to token (M)", + "source_ids": [ + 159 + ] + } + ], + "node_idx": 159 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_16.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_16.json new file mode 100644 index 0000000..51659fc --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_16.json @@ -0,0 +1,51 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a method or system being compared against representative methods in the text", + "source_ids": [ + 16 + ] + }, + { + "entity_name": "table 1", + "entity_type": "TABLE", + "description": "table 1 is the section containing the comparison of methods and bookrag", + "source_ids": [ + 16 + ] + }, + { + "entity_name": "representative methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "representative methods are the 
existing techniques being compared against bookrag in the text", + "source_ids": [ + 16 + ] + } + ], + "relations": [ + { + "src_entity_name": "table 1", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "table 1 contains the comparison data for bookrag", + "source_ids": [ + 16 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "representative methods", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is being compared to representative methods in the text", + "source_ids": [ + 16 + ] + } + ], + "node_idx": 16 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_160.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_160.json new file mode 100644 index 0000000..9b36e87 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_160.json @@ -0,0 +1,205 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a multi modal rag method evaluated for efficiency in terms of query time and token consumption", + "source_ids": [ + 160 + ] + }, + { + "entity_name": "graph based rag methods", + "entity_type": "TECHNOLOGY", + "description": "graph based rag methods are existing methods used as a baseline for comparing bookrag s efficiency", + "source_ids": [ + 160 + ] + }, + { + "entity_name": "text based rag approaches", + "entity_type": "TECHNOLOGY", + "description": "text based rag approaches are methods that generally exhibit lower latency and token usage due to the absence of vlm processing", + "source_ids": [ + 160 + ] + }, + { + "entity_name": "vlm", + "entity_type": "TECHNOLOGY", + "description": "vlm refers to vision language models the processing component absent in purely text based rag approaches", + "source_ids": [ + 160 + ] + }, + { + "entity_name": "docetl", + "entity_type": "PRODUCT", + "description": "docetl is a baseline method against which bookrag s token consumption and query 
latency are compared", + "source_ids": [ + 160 + ] + }, + { + "entity_name": "mmlongbench", + "entity_type": "DATASET_OR_CORPUS", + "description": "mmlongbench is a dataset used to evaluate the token consumption of docetl and bookrag", + "source_ids": [ + 160 + ] + }, + { + "entity_name": "figure 5", + "entity_type": "IMAGE", + "description": "figure 5 is a visual illustration showing the efficiency evaluation of bookrag in terms of query time and token consumption", + "source_ids": [ + 160 + ] + }, + { + "entity_name": "53 million tokens", + "entity_type": "MEASUREMENT", + "description": "53 million tokens is the amount of token consumption recorded for docetl on the mmlongbench dataset", + "source_ids": [ + 160 + ] + }, + { + "entity_name": "5 million", + "entity_type": "MEASUREMENT", + "description": "5 million is the upper limit of token consumption required by bookrag on the mmlongbench dataset", + "source_ids": [ + 160 + ] + }, + { + "entity_name": "2", + "entity_type": "MEASUREMENT", + "description": "2 represents the speedup factor achieved by bookrag compared to docetl in query latency", + "source_ids": [ + 160 + ] + }, + { + "entity_name": "order of magnitude", + "entity_type": "MEASUREMENT", + "description": "order of magnitude describes the scale of reduction in token consumption by bookrag compared to docetl", + "source_ids": [ + 160 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "graph based rag methods", + "relation_name": "", + "weight": 9.0, + "description": "bookrag maintains time and token costs comparable to existing graph based rag methods", + "source_ids": [ + 160 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "text based rag approaches", + "relation_name": "", + "weight": 7.0, + "description": "bookrag maintains a balanced efficiency among multi modal methods compared to text based approaches which have lower latency", + "source_ids": [ + 160 + ] + }, + { + "src_entity_name": 
"bookrag", + "tgt_entity_name": "vlm", + "relation_name": "", + "weight": 8.0, + "description": "bookrag involves vlm processing for images unlike purely text based rag approaches", + "source_ids": [ + 160 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "docetl", + "relation_name": "", + "weight": 10.0, + "description": "bookrag reduces token consumption by an order of magnitude and achieves a speedup of up to 2x compared to docetl", + "source_ids": [ + 160 + ] + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "docetl consumes over 53 million tokens on the mmlongbench dataset", + "source_ids": [ + 160 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "bookrag requires less than 5 million tokens on the mmlongbench dataset", + "source_ids": [ + 160 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "figure 5", + "relation_name": "", + "weight": 8.0, + "description": "figure 5 illustrates the efficiency evaluation of bookrag", + "source_ids": [ + 160 + ] + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "53 million tokens", + "relation_name": "", + "weight": 10.0, + "description": "docetl consumes 53 million tokens on the mmlongbench dataset", + "source_ids": [ + 160 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "5 million", + "relation_name": "", + "weight": 10.0, + "description": "bookrag requires less than 5 million tokens on the mmlongbench dataset", + "source_ids": [ + 160 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "2", + "relation_name": "", + "weight": 10.0, + "description": "bookrag achieves a speedup of up to 2 compared to docetl", + "source_ids": [ + 160 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "order of magnitude", + "relation_name": "", + "weight": 9.0, + "description": "bookrag 
reduces token consumption by an order of magnitude compared to docetl", + "source_ids": [ + 160 + ] + } + ], + "node_idx": 160 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_161.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_161.json new file mode 100644 index 0000000..95802fe --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_161.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "10", + "entity_type": "MEASUREMENT", + "description": "10 is a numerical value mentioned in the text potentially representing a measurement or count", + "source_ids": [ + 161 + ] + } + ], + "relations": [], + "node_idx": 161 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_162.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_162.json new file mode 100644 index 0000000..fc33d96 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_162.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "6.3 detailed analysis", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Experiments' within the BookRAG paper, this section provides an in-depth comparative analysis of the proposed method against strong baseline methods, specifically focusing on efficiency and accuracy metrics for document QA tasks.", + "source_ids": [ + 162 + ] + } + ], + "relations": [], + "node_idx": 162 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_163.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_163.json new file mode 100644 index 0000000..ae751a4 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_163.json @@ -0,0 +1,141 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a system or product being examined in the text through an ablation study and experiments", + "source_ids": [ + 163 + ] + }, + { + 
"entity_name": "ablation study", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "ablation study is a method used to validate the contribution of each component of bookrag", + "source_ids": [ + 163 + ] + }, + { + "entity_name": "gradient based er", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "gradient based er is a method used in experiments to analyze its impact on qa performance", + "source_ids": [ + 163 + ] + }, + { + "entity_name": "qa performance", + "entity_type": "EVALUATION_METRIC", + "description": "qa performance is the metric being evaluated in the experiments across different query types", + "source_ids": [ + 163 + ] + }, + { + "entity_name": "entity resolution method", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "entity resolution method is a technique compared for effectiveness in the text", + "source_ids": [ + 163 + ] + }, + { + "entity_name": "case study", + "entity_type": "TASK_OR_PROBLEM", + "description": "case study is a specific analysis presented in the text", + "source_ids": [ + 163 + ] + }, + { + "entity_name": "query types", + "entity_type": "TASK_OR_PROBLEM", + "description": "query types are the different categories of queries used to evaluate qa performance in the experiments", + "source_ids": [ + 163 + ] + }, + { + "entity_name": "error analysis", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "error analysis is a comprehensive method performed to examine the results of the study", + "source_ids": [ + 163 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "ablation study", + "relation_name": "", + "weight": 9.0, + "description": "an ablation study is conducted on bookrag to validate its components", + "source_ids": [ + 163 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "gradient based er", + "relation_name": "", + "weight": 8.0, + "description": "experiments on bookrag involve analyzing the impact of gradient based er", + "source_ids": [ 
+ 163 + ] + }, + { + "src_entity_name": "gradient based er", + "tgt_entity_name": "qa performance", + "relation_name": "", + "weight": 9.0, + "description": "gradient based er is evaluated for its impact on qa performance", + "source_ids": [ + 163 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "entity resolution method", + "relation_name": "", + "weight": 8.0, + "description": "the effectiveness of the entity resolution method is compared in the context of bookrag", + "source_ids": [ + 163 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "case study", + "relation_name": "", + "weight": 7.0, + "description": "a case study is presented as part of the examination of bookrag", + "source_ids": [ + 163 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "query types", + "relation_name": "", + "weight": 8.0, + "description": "experiments on bookrag are conducted across different query types", + "source_ids": [ + 163 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "error analysis", + "relation_name": "", + "weight": 9.0, + "description": "a comprehensive error analysis is performed as part of the examination of bookrag", + "source_ids": [ + 163 + ] + } + ], + "node_idx": 163 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_164.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_164.json new file mode 100644 index 0000000..bb8f86e --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_164.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "ablation study", + "entity_type": "TASK_OR_PROBLEM", + "description": "ablation study is a task designed to evaluate the contribution of core components in bookrag", + "source_ids": [ + 164 + ] + }, + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a product or system whose core components are being evaluated through variants", + "source_ids": [ + 164 + ] + } 
+ ], + "relations": [ + { + "src_entity_name": "ablation study", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "the ablation study is conducted to evaluate the core components of bookrag", + "source_ids": [ + 164 + ] + } + ], + "node_idx": 164 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_165.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_165.json new file mode 100644 index 0000000..f714ac1 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_165.json @@ -0,0 +1,79 @@ +{ + "entities": [ + { + "entity_name": "gradient er", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "gradient er is a gradient based entity resolution method mentioned in the text", + "source_ids": [ + 165 + ] + }, + { + "entity_name": "basic er", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "basic er is a method used to merge same name entities replacing gradient er in the described scenario", + "source_ids": [ + 165 + ] + }, + { + "entity_name": "w o gradient er", + "entity_type": "TASK_OR_PROBLEM", + "description": "w o gradient er is a scenario or condition described where the gradient based entity resolution is replaced", + "source_ids": [ + 165 + ] + }, + { + "entity_name": "same name entities", + "entity_type": "TASK_OR_PROBLEM", + "description": "same name entities are the specific entities targeted for merging in the basic er process", + "source_ids": [ + 165 + ] + } + ], + "relations": [ + { + "src_entity_name": "gradient er", + "tgt_entity_name": "basic er", + "relation_name": "", + "weight": 10.0, + "description": "basic er replaces gradient er by merging same name entities", + "source_ids": [ + 165 + ] + }, + { + "src_entity_name": "w o gradient er", + "tgt_entity_name": "gradient er", + "relation_name": "", + "weight": 9.0, + "description": "the w o gradient er scenario involves the replacement of gradient er", + "source_ids": [ + 165 + ] + }, + { 
+ "src_entity_name": "w o gradient er", + "tgt_entity_name": "basic er", + "relation_name": "", + "weight": 9.0, + "description": "the w o gradient er scenario involves the use of basic er as the replacement method", + "source_ids": [ + 165 + ] + }, + { + "src_entity_name": "basic er", + "tgt_entity_name": "same name entities", + "relation_name": "", + "weight": 10.0, + "description": "basic er is the method used to merge same name entities", + "source_ids": [ + 165 + ] + } + ], + "node_idx": 165 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_166.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_166.json new file mode 100644 index 0000000..d8884bb --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_166.json @@ -0,0 +1,49 @@ +{ + "entities": [ + { + "entity_name": "agent based planning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "agent based planning is a method that is removed in the scenario described leading to a default workflow", + "source_ids": [ + 166 + ] + }, + { + "entity_name": "static standard workflow", + "entity_type": "TASK_OR_PROBLEM", + "description": "static standard workflow is the default process used for all queries when agent based planning is removed", + "source_ids": [ + 166 + ] + }, + { + "entity_name": "planning", + "entity_type": "TASK_OR_PROBLEM", + "description": "planning is the specific task or problem component that is removed in the described scenario", + "source_ids": [ + 166 + ] + }, + { + "entity_name": "queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "queries are the inputs for which the workflow is applied either with or without planning", + "source_ids": [ + 166 + ] + } + ], + "relations": [ + { + "src_entity_name": "agent based planning", + "tgt_entity_name": "static standard workflow", + "relation_name": "", + "weight": 9.0, + "description": "removing agent based planning results in the system defaulting to a static standard 
workflow", + "source_ids": [ + 166 + ] + } + ], + "node_idx": 166 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_167.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_167.json new file mode 100644 index 0000000..418ab66 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_167.json @@ -0,0 +1,69 @@ +{ + "entities": [ + { + "entity_name": "selector", + "entity_type": "TECHNOLOGY", + "description": "selector is a component or operator mentioned in the context of removing it to force reasoners to score all candidate nodes", + "source_ids": [ + 167 + ] + }, + { + "entity_name": "reasoners", + "entity_type": "TECHNOLOGY", + "description": "reasoners are systems or components that score candidate nodes affected by the removal of selector operators", + "source_ids": [ + 167 + ] + }, + { + "entity_name": "candidate nodes", + "entity_type": "TASK_OR_PROBLEM", + "description": "candidate nodes are the items being scored by reasoners when the selector operators are removed", + "source_ids": [ + 167 + ] + }, + { + "entity_name": "selector operators", + "entity_type": "TECHNOLOGY", + "description": "selector operators are specific components that can be removed to alter the behavior of reasoners", + "source_ids": [ + 167 + ] + } + ], + "relations": [ + { + "src_entity_name": "selector", + "tgt_entity_name": "reasoners", + "relation_name": "", + "weight": 9.0, + "description": "the removal of selector operators forces reasoners to score all candidate nodes indicating a direct operational dependency", + "source_ids": [ + 167 + ] + }, + { + "src_entity_name": "selector operators", + "tgt_entity_name": "reasoners", + "relation_name": "", + "weight": 9.0, + "description": "removing selector operators directly changes how reasoners operate by forcing them to score all candidate nodes", + "source_ids": [ + 167 + ] + }, + { + "src_entity_name": "reasoners", + "tgt_entity_name": "candidate nodes", + "relation_name": 
"", + "weight": 8.0, + "description": "reasoners perform the action of scoring candidate nodes especially when selector operators are absent", + "source_ids": [ + 167 + ] + } + ], + "node_idx": 167 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_168.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_168.json new file mode 100644 index 0000000..f0b7b9f --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_168.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "graph reasoning", + "entity_type": "TECHNOLOGY", + "description": "graph reasoning is an operator that when removed disables the skyline ranker", + "source_ids": [ + 168 + ] + }, + { + "entity_name": "skyline ranker", + "entity_type": "SOFTWARE", + "description": "skyline ranker is a component that is disabled when the graph reasoning operator is removed resulting in single dimensional scoring", + "source_ids": [ + 168 + ] + } + ], + "relations": [ + { + "src_entity_name": "graph reasoning", + "tgt_entity_name": "skyline ranker", + "relation_name": "", + "weight": 10.0, + "description": "the graph reasoning operator enables the skyline ranker removing it disables the skyline ranker", + "source_ids": [ + 168 + ] + } + ], + "node_idx": 168 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_169.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_169.json new file mode 100644 index 0000000..fc29d62 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_169.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "text reasoning", + "entity_type": "TASK_OR_PROBLEM", + "description": "text reasoning is an operator that is removed in the described scenario", + "source_ids": [ + 169 + ] + }, + { + "entity_name": "skyline ranker", + "entity_type": "SOFTWARE", + "description": "skyline ranker is a component that is disabled when text reasoning is removed relying on graph 
based scores", + "source_ids": [ + 169 + ] + } + ], + "relations": [ + { + "src_entity_name": "text reasoning", + "tgt_entity_name": "skyline ranker", + "relation_name": "", + "weight": 9.0, + "description": "the removal of the text reasoning operator causes the skyline ranker to be disabled", + "source_ids": [ + 169 + ] + } + ], + "node_idx": 169 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_17.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_17.json new file mode 100644 index 0000000..0a9d5ff --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_17.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "table: cref='#/texts/17'...", + "entity_type": "TABLE", + "description": "A data table described as: cref='#/texts/17'", + "source_ids": [ + 17 + ] + }, + { + "entity_name": "texts reference", + "entity_type": "SECTION_TITLE", + "description": "A reference identifier pointing to a specific text location within a document structure, indicated by the cref attribute '#/texts/17'.", + "source_ids": [ + 17 + ] + } + ], + "relations": [ + { + "src_entity_name": "table: cref='#/texts/17'...", + "tgt_entity_name": "texts reference", + "relation_name": "", + "weight": 9.0, + "description": "Table 'Table: cref='#/texts/17'...' 
contains data about 'Texts Reference'.", + "source_ids": [ + 17 + ] + } + ], + "node_idx": 17 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_170.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_170.json new file mode 100644 index 0000000..71519a0 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_170.json @@ -0,0 +1,153 @@ +{ + "entities": [ + { + "entity_name": "table 7", + "entity_type": "TABLE", + "description": "table 7 is a table comparing the qa performance of different variants of bookrag", + "source_ids": [ + 170 + ] + }, + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a product or system whose variants are being evaluated for qa performance", + "source_ids": [ + 170 + ] + }, + { + "entity_name": "em", + "entity_type": "EVALUATION_METRIC", + "description": "em stands for exact match an evaluation metric used to measure qa performance", + "source_ids": [ + 170 + ] + }, + { + "entity_name": "f1", + "entity_type": "EVALUATION_METRIC", + "description": "f1 denotes f1 score an evaluation metric used to measure qa performance", + "source_ids": [ + 170 + ] + }, + { + "entity_name": "qa", + "entity_type": "TASK_OR_PROBLEM", + "description": "qa refers to question answering the specific task being evaluated in the text", + "source_ids": [ + 170 + ] + }, + { + "entity_name": "exact match", + "entity_type": "EVALUATION_METRIC", + "description": "exact match is the full name for the metric abbreviated as em", + "source_ids": [ + 170 + ] + }, + { + "entity_name": "f1 score", + "entity_type": "EVALUATION_METRIC", + "description": "f1 score is the full name for the metric abbreviated as f1", + "source_ids": [ + 170 + ] + } + ], + "relations": [ + { + "src_entity_name": "table 7", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "table 7 compares the performance of different variants of bookrag", + "source_ids": [ + 170 
+ ] + }, + { + "src_entity_name": "table 7", + "tgt_entity_name": "em", + "relation_name": "", + "weight": 9.0, + "description": "table 7 uses em exact match as a metric to evaluate qa performance", + "source_ids": [ + 170 + ] + }, + { + "src_entity_name": "table 7", + "tgt_entity_name": "f1", + "relation_name": "", + "weight": 9.0, + "description": "table 7 uses f1 f1 score as a metric to evaluate qa performance", + "source_ids": [ + 170 + ] + }, + { + "src_entity_name": "em", + "tgt_entity_name": "f1", + "relation_name": "", + "weight": 8.0, + "description": "em and f1 are both evaluation metrics used together to compare qa performance in table 7", + "source_ids": [ + 170 + ] + }, + { + "src_entity_name": "qa", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "qa is the task performed by the different variants of bookrag being compared", + "source_ids": [ + 170 + ] + }, + { + "src_entity_name": "exact match", + "tgt_entity_name": "em", + "relation_name": "", + "weight": 10.0, + "description": "exact match is the definition of the abbreviation em", + "source_ids": [ + 170 + ] + }, + { + "src_entity_name": "f1 score", + "tgt_entity_name": "f1", + "relation_name": "", + "weight": 10.0, + "description": "f1 score is the definition of the abbreviation f1", + "source_ids": [ + 170 + ] + }, + { + "src_entity_name": "qa", + "tgt_entity_name": "em", + "relation_name": "", + "weight": 9.0, + "description": "em is used to measure the performance of the qa task", + "source_ids": [ + 170 + ] + }, + { + "src_entity_name": "qa", + "tgt_entity_name": "f1", + "relation_name": "", + "weight": 9.0, + "description": "f1 is used to measure the performance of the qa task", + "source_ids": [ + 170 + ] + } + ], + "node_idx": 170 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_171.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_171.json new file mode 100644 index 0000000..86d3a0b --- /dev/null 
+++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_171.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "table: cref='#/texts/220'...", + "entity_type": "TABLE", + "description": "A data table described as: cref='#/texts/220'", + "source_ids": [ + 171 + ] + }, + { + "entity_name": "cref", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "A reference identifier or cross-reference key found in the description text, pointing to a specific text location ('#/texts/220').", + "source_ids": [ + 171 + ] + } + ], + "relations": [ + { + "src_entity_name": "table: cref='#/texts/220'...", + "tgt_entity_name": "cref", + "relation_name": "", + "weight": 9.0, + "description": "Table 'Table: cref='#/texts/220'...' contains data about 'cref'.", + "source_ids": [ + 171 + ] + } + ], + "node_idx": 171 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_172.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_172.json new file mode 100644 index 0000000..6a47436 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_172.json @@ -0,0 +1,293 @@ +{ + "entities": [ + { + "entity_name": "kg", + "entity_type": "DATASET_OR_CORPUS", + "description": "kg refers to a knowledge graph used to support effective reasoning in the bookrag system", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "agent based planning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "agent based planning is a mechanism assessed for its necessity in the system s performance", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "ift inspired selection mechanism", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "ift inspired selection mechanism is a strategy evaluated for its role in the system s efficiency", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "multi dimensional reasoning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "multi dimensional reasoning is a strategy validated for its 
effectiveness in the system", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "dynamic skyline filtering strategy", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "dynamic skyline filtering strategy is a method validated for its effectiveness in the system", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "table 7", + "entity_type": "TABLE", + "description": "table 7 is a reference in the text showing performance degradation across variants", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is the system being evaluated in the text", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "w o gradient er variant", + "entity_type": "TASK_OR_PROBLEM", + "description": "the w o gradient er variant is a specific configuration used to test the role of the knowledge graph", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "planning mechanism", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the planning mechanism is a component whose removal causes significant performance loss", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "w o selector variant", + "entity_type": "TASK_OR_PROBLEM", + "description": "the w o selector variant is a configuration used to validate the efficiency of the selection strategy", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "qasper", + "entity_type": "DATASET_OR_CORPUS", + "description": "qasper is a dataset used to measure computational cost in tokens", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "tokens", + "entity_type": "MEASUREMENT", + "description": "tokens are the unit of measurement used to quantify computational cost", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "gradient er", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "gradient er is a specific component or technique whose removal in the w o variant highlights the role of the knowledge graph", + "source_ids": [ + 172 + ] + }, + { + 
"entity_name": "selector", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the selector is a component whose removal in the w o variant validates the efficiency of the selection strategy", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "narrow then reason strategy", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the narrow then reason strategy is the specific approach inspired by ift that is being validated for efficiency", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "static workflow", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "a static workflow is described as insufficient for handling diverse types of queries contrasting with the dynamic approach", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "queries are the diverse types of tasks that the system is designed to handle", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "retrieval performance", + "entity_type": "EVALUATION_METRIC", + "description": "retrieval performance is the metric used to evaluate the impact of kg quality", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "accuracy", + "entity_type": "EVALUATION_METRIC", + "description": "accuracy is a metric maintained by the w o selector variant despite high computational costs", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "computational cost", + "entity_type": "MEASUREMENT", + "description": "computational cost is a metric measured in tokens to evaluate the efficiency of the variants", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "performance degradation", + "entity_type": "EVALUATION_METRIC", + "description": "performance degradation is the observed outcome across all variants confirming the essential role of each module", + "source_ids": [ + 172 + ] + }, + { + "entity_name": "performance loss", + "entity_type": "EVALUATION_METRIC", + "description": "performance loss is the significant drop observed when 
the planning mechanism is removed", + "source_ids": [ + 172 + ] + } + ], + "relations": [ + { + "src_entity_name": "kg", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "the kg is a critical component within the bookrag system supporting effective reasoning", + "source_ids": [ + 172 + ] + }, + { + "src_entity_name": "agent based planning", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 8.0, + "description": "agent based planning is a mechanism assessed for its necessity within the bookrag system", + "source_ids": [ + 172 + ] + }, + { + "src_entity_name": "ift inspired selection mechanism", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 8.0, + "description": "the ift inspired selection mechanism is a strategy evaluated for its efficiency in the bookrag system", + "source_ids": [ + 172 + ] + }, + { + "src_entity_name": "multi dimensional reasoning", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 8.0, + "description": "multi dimensional reasoning is a strategy validated for its effectiveness in the bookrag system", + "source_ids": [ + 172 + ] + }, + { + "src_entity_name": "dynamic skyline filtering strategy", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 8.0, + "description": "the dynamic skyline filtering strategy is a method validated for its effectiveness in the bookrag system", + "source_ids": [ + 172 + ] + }, + { + "src_entity_name": "table 7", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 7.0, + "description": "table 7 presents data regarding the performance of the bookrag system variants", + "source_ids": [ + 172 + ] + }, + { + "src_entity_name": "w o gradient er variant", + "tgt_entity_name": "kg", + "relation_name": "", + "weight": 9.0, + "description": "the w o gradient er variant highlights the critical role of the kg in the system", + "source_ids": [ + 172 + ] + }, + { + "src_entity_name": "planning mechanism", + 
"tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "the planning mechanism is a component of bookrag whose removal causes significant performance loss", + "source_ids": [ + 172 + ] + }, + { + "src_entity_name": "w o selector variant", + "tgt_entity_name": "ift inspired selection mechanism", + "relation_name": "", + "weight": 8.0, + "description": "the w o selector variant validates the efficiency of the ift inspired selection mechanism", + "source_ids": [ + 172 + ] + }, + { + "src_entity_name": "w o selector variant", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 7.0, + "description": "the w o selector variant incurs a computational cost measured in tokens on the qasper dataset", + "source_ids": [ + 172 + ] + }, + { + "src_entity_name": "ift inspired selection mechanism", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 7.0, + "description": "the ift inspired selection mechanism s efficiency is validated using the qasper dataset", + "source_ids": [ + 172 + ] + } + ], + "node_idx": 172 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_173.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_173.json new file mode 100644 index 0000000..1994f68 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_173.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "11", + "entity_type": "NUMBER", + "description": "11 is a number mentioned in the text though its specific context or role is not defined", + "source_ids": [ + 173 + ] + } + ], + "relations": [], + "node_idx": 173 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_174.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_174.json new file mode 100644 index 0000000..6ca9470 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_174.json @@ -0,0 +1,115 @@ +{ + "entities": [ + { + "entity_name": "figure 6", + 
"entity_type": "IMAGE", + "description": "figure 6 is an image comparing graph statistics with values normalized to a basic setting", + "source_ids": [ + 174 + ] + }, + { + "entity_name": "basic setting", + "entity_type": "TASK_OR_PROBLEM", + "description": "the basic setting serves as the baseline 1 0 for normalizing graph statistics values", + "source_ids": [ + 174 + ] + }, + { + "entity_name": "3 6e 3", + "entity_type": "MEASUREMENT", + "description": "3 6e 3 is an abbreviated density value representing 3 6 10 3", + "source_ids": [ + 174 + ] + }, + { + "entity_name": "graph statistics", + "entity_type": "TASK_OR_PROBLEM", + "description": "graph statistics are the subject of comparison in figure 6", + "source_ids": [ + 174 + ] + }, + { + "entity_name": "absolute values", + "entity_type": "MEASUREMENT", + "description": "absolute values for the basic setting are annotated in the text", + "source_ids": [ + 174 + ] + }, + { + "entity_name": "density values", + "entity_type": "MEASUREMENT", + "description": "density values are a specific type of metric mentioned that are abbreviated in the text", + "source_ids": [ + 174 + ] + } + ], + "relations": [ + { + "src_entity_name": "figure 6", + "tgt_entity_name": "basic setting", + "relation_name": "", + "weight": 9.0, + "description": "figure 6 compares graph statistics by normalizing values to the basic setting", + "source_ids": [ + 174 + ] + }, + { + "src_entity_name": "figure 6", + "tgt_entity_name": "3 6e 3", + "relation_name": "", + "weight": 8.0, + "description": "figure 6 contains the density value 3 6e 3 as an example of abbreviated notation", + "source_ids": [ + 174 + ] + }, + { + "src_entity_name": "figure 6", + "tgt_entity_name": "graph statistics", + "relation_name": "", + "weight": 10.0, + "description": "figure 6 is a comparison of graph statistics", + "source_ids": [ + 174 + ] + }, + { + "src_entity_name": "figure 6", + "tgt_entity_name": "absolute values", + "relation_name": "", + "weight": 8.0, + 
"description": "figure 6 includes annotations of absolute values for the basic setting", + "source_ids": [ + 174 + ] + }, + { + "src_entity_name": "figure 6", + "tgt_entity_name": "density values", + "relation_name": "", + "weight": 9.0, + "description": "figure 6 illustrates how density values are abbreviated using 3 6e 3 as an example", + "source_ids": [ + 174 + ] + }, + { + "src_entity_name": "basic setting", + "tgt_entity_name": "absolute values", + "relation_name": "", + "weight": 7.0, + "description": "absolute values are specifically annotated for the basic setting", + "source_ids": [ + 174 + ] + } + ], + "node_idx": 174 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_175.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_175.json new file mode 100644 index 0000000..71448ca --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_175.json @@ -0,0 +1,357 @@ +{ + "entities": [ + { + "entity_name": "cref='#/texts/224'", + "entity_type": "IMAGE", + "description": "A figure containing two bar charts comparing 'Basic' and 'Gradient-based ER' performance metrics across '# Entity', 'Density', 'Diameter', and '# CC' for MMLongBench and Qasper datasets.", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "basic", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "The baseline method represented by blue bars in the legend, used as a comparison point against the Gradient-based ER approach.", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "gradient-based er", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "The proposed or specific method represented by red bars in the legend, evaluated on various metrics against the Basic model.", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "ratio", + "entity_type": "EVALUATION_METRIC", + "description": "The Y-axis label indicating the metric being measured, representing the ratio of performance between the compared methods.", + 
"source_ids": [ + 175 + ] + }, + { + "entity_name": "# entity", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "A metric measuring the number of entities, shown as the first category of bars in both charts.", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "density", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "A metric measuring graph density, showing significant variation between the Basic and Gradient-based ER methods.", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "diameter", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "A metric measuring the longest shortest path in the graph, presented as the third category of bars.", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "# cc", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "A metric likely representing the number of Connected Components, shown as the fourth category of bars.", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "mmlongbench", + "entity_type": "DATASET_OR_CORPUS", + "description": "The dataset or benchmark used for the evaluation in chart (a), located at the bottom left.", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "qasper", + "entity_type": "DATASET_OR_CORPUS", + "description": "The dataset or benchmark used for the evaluation in chart (b), located at the bottom right.", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "figure (a)", + "entity_type": "SECTION_TITLE", + "description": "The label identifying the left-hand chart which displays results for the MMLongBench dataset.", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "figure (b)", + "entity_type": "SECTION_TITLE", + "description": "The label identifying the right-hand chart which displays results for the Qasper dataset.", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "1327", + "entity_type": "MEASUREMENT", + "description": "A numerical value annotation above the '# Entity' bar for the Basic method in chart (a).", + "source_ids": [ + 
175 + ] + }, + { + "entity_name": "3.6e-3", + "entity_type": "MEASUREMENT", + "description": "A numerical value annotation above the 'Density' bar for the Basic method in chart (a).", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "14.8", + "entity_type": "MEASUREMENT", + "description": "A numerical value annotation above the 'Diameter' bar for the Basic method in chart (a).", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "169", + "entity_type": "MEASUREMENT", + "description": "A numerical value annotation above the '# CC' bar for the Basic method in chart (a).", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "531", + "entity_type": "MEASUREMENT", + "description": "A numerical value annotation above the '# Entity' bar for the Basic method in chart (b).", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "5.4e-3", + "entity_type": "MEASUREMENT", + "description": "A numerical value annotation above the 'Density' bar for the Basic method in chart (b).", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "15.0", + "entity_type": "MEASUREMENT", + "description": "A numerical value annotation above the 'Diameter' bar for the Basic method in chart (b).", + "source_ids": [ + 175 + ] + }, + { + "entity_name": "106", + "entity_type": "MEASUREMENT", + "description": "A numerical value annotation above the '# CC' bar for the Basic method in chart (b).", + "source_ids": [ + 175 + ] + } + ], + "relations": [ + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "basic", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to Basic", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "gradient-based er", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to Gradient-based ER", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "ratio", + 
"relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to Ratio", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "# entity", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to # Entity", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "density", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to Density", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "diameter", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to Diameter", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "# cc", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to # CC", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to MMLongBench", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to Qasper", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "figure (a)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to Figure (a)", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "figure (b)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to Figure (b)", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + 
"tgt_entity_name": "1327", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to 1327", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "3.6e-3", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to 3.6E-3", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "14.8", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to 14.8", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "169", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to 169", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "531", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to 531", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "5.4e-3", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to 5.4e-3", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "15.0", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to 15.0", + "source_ids": [ + 175 + ] + }, + { + "src_entity_name": "cref='#/texts/224'", + "tgt_entity_name": "106", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/224' related to 106", + "source_ids": [ + 175 + ] + } + ], + "node_idx": 175 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_176.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_176.json new file mode 100644 index 0000000..6d0e38a --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_176.json 
@@ -0,0 +1,249 @@ +{ + "entities": [ + { + "entity_name": "gradient based entity resolution", + "entity_type": "TASK_OR_PROBLEM", + "description": "gradient based entity resolution is a method used to evaluate the quality of a constructed knowledge graph kg", + "source_ids": [ + 176 + ] + }, + { + "entity_name": "basic kg construction", + "entity_type": "TASK_OR_PROBLEM", + "description": "basic kg construction is a standard practice using simple exact name matching for entity merging", + "source_ids": [ + 176 + ] + }, + { + "entity_name": "figure 6", + "entity_type": "IMAGE", + "description": "figure 6 presents the comparative results of the evaluation between the two methods", + "source_ids": [ + 176 + ] + }, + { + "entity_name": "entity count", + "entity_type": "EVALUATION_METRIC", + "description": "entity count is a metric used to measure the number of entities in the graph", + "source_ids": [ + 176 + ] + }, + { + "entity_name": "density", + "entity_type": "EVALUATION_METRIC", + "description": "density is a metric used to measure the connectivity of the graph", + "source_ids": [ + 176 + ] + }, + { + "entity_name": "diameter of the largest connected component", + "entity_type": "EVALUATION_METRIC", + "description": "diameter of the largest connected component is a metric measuring the longest shortest path in the largest connected part of the graph", + "source_ids": [ + 176 + ] + }, + { + "entity_name": "number of connected components", + "entity_type": "EVALUATION_METRIC", + "description": "number of connected components is a metric counting the separate parts of the graph", + "source_ids": [ + 176 + ] + }, + { + "entity_name": "basic baseline", + "entity_type": "BENCHMARK", + "description": "the basic baseline serves as the standard for comparison in the evaluation of the gradient based er method", + "source_ids": [ + 176 + ] + }, + { + "entity_name": "er module", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the er module is the component 
responsible for identifying conceptual entities with different names", + "source_ids": [ + 176 + ] + }, + { + "entity_name": "12", + "entity_type": "PERCENTAGE", + "description": "12 is the percentage reduction in the number of entities achieved by the gradient based er method", + "source_ids": [ + 176 + ] + }, + { + "entity_name": "20", + "entity_type": "PERCENTAGE", + "description": "20 is the percentage increase in graph density achieved by the gradient based er method across datasets", + "source_ids": [ + 176 + ] + }, + { + "entity_name": "many graph based methods", + "entity_type": "ORGANIZATION", + "description": "many graph based methods are a group of techniques that employ simple exact name matching for entity merging", + "source_ids": [ + 176 + ] + }, + { + "entity_name": "datasets", + "entity_type": "DATASET_OR_CORPUS", + "description": "datasets are the collections of data used to evaluate the performance of the gradient based er method", + "source_ids": [ + 176 + ] + }, + { + "entity_name": "graph reasoning", + "entity_type": "TASK_OR_PROBLEM", + "description": "graph reasoning is a task facilitated by the improved connectivity of the resulting graphs", + "source_ids": [ + 176 + ] + } + ], + "relations": [ + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "basic kg construction", + "relation_name": "", + "weight": 9.0, + "description": "gradient based entity resolution is compared against basic kg construction to evaluate quality", + "source_ids": [ + 176 + ] + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "figure 6", + "relation_name": "", + "weight": 8.0, + "description": "figure 6 presents the results of the comparison involving gradient based entity resolution", + "source_ids": [ + 176 + ] + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "entity count", + "relation_name": "", + "weight": 9.0, + "description": "gradient based entity 
resolution reduces the entity count by 12 compared to the baseline", + "source_ids": [ + 176 + ] + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "density", + "relation_name": "", + "weight": 9.0, + "description": "gradient based entity resolution boosts graph density by over 20 across datasets", + "source_ids": [ + 176 + ] + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "diameter of the largest connected component", + "relation_name": "", + "weight": 8.0, + "description": "gradient based entity resolution reduces the diameter of the largest connected component indicating a more compact graph", + "source_ids": [ + 176 + ] + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "number of connected components", + "relation_name": "", + "weight": 8.0, + "description": "gradient based entity resolution reduces the number of connected components mitigating fragmentation", + "source_ids": [ + 176 + ] + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "er module", + "relation_name": "", + "weight": 10.0, + "description": "the er module is the specific component of gradient based entity resolution that identifies entities", + "source_ids": [ + 176 + ] + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "basic baseline", + "relation_name": "", + "weight": 9.0, + "description": "gradient based entity resolution is evaluated against the basic baseline to demonstrate optimization", + "source_ids": [ + 176 + ] + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "12", + "relation_name": "", + "weight": 10.0, + "description": "the gradient based er method achieves a 12 reduction in entity count", + "source_ids": [ + 176 + ] + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "20", + "relation_name": "", + "weight": 10.0, + 
"description": "the gradient based er method achieves a boost of over 20 in graph density", + "source_ids": [ + 176 + ] + }, + { + "src_entity_name": "basic kg construction", + "tgt_entity_name": "many graph based methods", + "relation_name": "", + "weight": 9.0, + "description": "basic kg construction is standard practice in many graph based methods", + "source_ids": [ + 176 + ] + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "datasets", + "relation_name": "", + "weight": 8.0, + "description": "the gradient based er method s performance is evaluated across multiple datasets", + "source_ids": [ + 176 + ] + }, + { + "src_entity_name": "gradient based entity resolution", + "tgt_entity_name": "graph reasoning", + "relation_name": "", + "weight": 9.0, + "description": "the structural improvements from gradient based entity resolution facilitate better graph reasoning", + "source_ids": [ + 176 + ] + } + ], + "node_idx": 176 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_177.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_177.json new file mode 100644 index 0000000..5a3a5f0 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_177.json @@ -0,0 +1,301 @@ +{ + "entities": [ + { + "entity_name": "figure 7", + "entity_type": "IMAGE", + "description": "figure 7 is an image presenting a performance breakdown of qa by different query types", + "source_ids": [ + 177 + ] + }, + { + "entity_name": "single hop", + "entity_type": "TASK_OR_PROBLEM", + "description": "single hop is a type of query used in the qa performance breakdown", + "source_ids": [ + 177 + ] + }, + { + "entity_name": "multi hop", + "entity_type": "TASK_OR_PROBLEM", + "description": "multi hop is a type of query used in the qa performance breakdown", + "source_ids": [ + 177 + ] + }, + { + "entity_name": "global", + "entity_type": "TASK_OR_PROBLEM", + "description": "global is a type of query used in the 
qa performance breakdown", + "source_ids": [ + 177 + ] + }, + { + "entity_name": "exact match", + "entity_type": "EVALUATION_METRIC", + "description": "exact match is an evaluation metric represented by blue bars for mmlongbench", + "source_ids": [ + 177 + ] + }, + { + "entity_name": "accuracy", + "entity_type": "EVALUATION_METRIC", + "description": "accuracy is an evaluation metric represented by blue bars for qasper", + "source_ids": [ + 177 + ] + }, + { + "entity_name": "f1 score", + "entity_type": "EVALUATION_METRIC", + "description": "f1 score is an evaluation metric represented by red bars", + "source_ids": [ + 177 + ] + }, + { + "entity_name": "mmlongbench", + "entity_type": "DATASET_OR_CORPUS", + "description": "mmlongbench is a dataset for which exact match performance is measured", + "source_ids": [ + 177 + ] + }, + { + "entity_name": "qasper", + "entity_type": "DATASET_OR_CORPUS", + "description": "qasper is a dataset for which accuracy performance is measured", + "source_ids": [ + 177 + ] + }, + { + "entity_name": "qa", + "entity_type": "TASK_OR_PROBLEM", + "description": "qa refers to the question answering task being evaluated in the figure", + "source_ids": [ + 177 + ] + }, + { + "entity_name": "query types", + "entity_type": "TASK_OR_PROBLEM", + "description": "query types refers to the categories of queries single hop multi hop and global analyzed in the text", + "source_ids": [ + 177 + ] + }, + { + "entity_name": "blue bars", + "entity_type": "IMAGE", + "description": "blue bars represent the visual elements in the figure corresponding to exact match and accuracy metrics", + "source_ids": [ + 177 + ] + }, + { + "entity_name": "red bars", + "entity_type": "IMAGE", + "description": "red bars represent the visual elements in the figure corresponding to the f1 score metric", + "source_ids": [ + 177 + ] + } + ], + "relations": [ + { + "src_entity_name": "figure 7", + "tgt_entity_name": "single hop", + "relation_name": "", + "weight": 9.0, + 
"description": "figure 7 displays the performance breakdown for the single hop query type", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "multi hop", + "relation_name": "", + "weight": 9.0, + "description": "figure 7 displays the performance breakdown for the multi hop query type", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "global", + "relation_name": "", + "weight": 9.0, + "description": "figure 7 displays the performance breakdown for the global query type", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "exact match", + "relation_name": "", + "weight": 8.0, + "description": "figure 7 uses exact match as a metric represented by blue bars for mmlongbench", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "accuracy", + "relation_name": "", + "weight": 8.0, + "description": "figure 7 uses accuracy as a metric represented by blue bars for qasper", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "f1 score", + "relation_name": "", + "weight": 8.0, + "description": "figure 7 uses f1 score as a metric represented by red bars", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "exact match", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "exact match is the specific metric used to evaluate performance on the mmlongbench dataset in the figure", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "accuracy", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "accuracy is the specific metric used to evaluate performance on the qasper dataset in the figure", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "multi hop", + "relation_name": "", + "weight": 5.0, + "description": "both are listed as distinct query types in 
the performance breakdown", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "global", + "relation_name": "", + "weight": 5.0, + "description": "both are listed as distinct query types in the performance breakdown", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "multi hop", + "tgt_entity_name": "global", + "relation_name": "", + "weight": 5.0, + "description": "both are listed as distinct query types in the performance breakdown", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "qa", + "relation_name": "", + "weight": 9.0, + "description": "figure 7 presents the performance breakdown specifically for the qa task", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "query types", + "relation_name": "", + "weight": 9.0, + "description": "figure 7 breaks down performance by different query types", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "blue bars", + "relation_name": "", + "weight": 8.0, + "description": "figure 7 contains blue bars to represent specific metrics", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "figure 7", + "tgt_entity_name": "red bars", + "relation_name": "", + "weight": 8.0, + "description": "figure 7 contains red bars to represent specific metrics", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "exact match", + "tgt_entity_name": "blue bars", + "relation_name": "", + "weight": 9.0, + "description": "exact match is visually represented by the blue bars in the figure", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "accuracy", + "tgt_entity_name": "blue bars", + "relation_name": "", + "weight": 9.0, + "description": "accuracy is visually represented by the blue bars in the figure", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "f1 score", + "tgt_entity_name": "red bars", + "relation_name": "", + "weight": 9.0, + 
"description": "f1 score is visually represented by the red bars in the figure", + "source_ids": [ + 177 + ] + }, + { + "src_entity_name": "qa", + "tgt_entity_name": "query types", + "relation_name": "", + "weight": 8.0, + "description": "the qa task performance is analyzed across different query types", + "source_ids": [ + 177 + ] + } + ], + "node_idx": 177 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_178.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_178.json new file mode 100644 index 0000000..39a94a1 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_178.json @@ -0,0 +1,159 @@ +{ + "entities": [ + { + "entity_name": "cref='#/texts/259'", + "entity_type": "IMAGE", + "description": "A figure containing two bar charts comparing EM/Accuracy and F1-score across Single, Multi, and Global configurations for MMLongBench and Qasper datasets.", + "source_ids": [ + 178 + ] + }, + { + "entity_name": "em / accuracy", + "entity_type": "EVALUATION_METRIC", + "description": "Evaluation metric represented by blue bars in the chart, standing for Exact Match or Accuracy.", + "source_ids": [ + 178 + ] + }, + { + "entity_name": "f1-score", + "entity_type": "EVALUATION_METRIC", + "description": "Evaluation metric represented by red bars in the chart, representing the harmonic mean of precision and recall.", + "source_ids": [ + 178 + ] + }, + { + "entity_name": "score", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "The Y-axis label indicating the numerical value being measured in both charts.", + "source_ids": [ + 178 + ] + }, + { + "entity_name": "single", + "entity_type": "TASK_OR_PROBLEM", + "description": "A configuration category on the X-axis representing a single-task or single-passage setting.", + "source_ids": [ + 178 + ] + }, + { + "entity_name": "multi", + "entity_type": "TASK_OR_PROBLEM", + "description": "A configuration category on the X-axis representing a multi-task or 
multi-passage setting.", + "source_ids": [ + 178 + ] + }, + { + "entity_name": "global", + "entity_type": "TASK_OR_PROBLEM", + "description": "A configuration category on the X-axis representing a global or holistic setting.", + "source_ids": [ + 178 + ] + }, + { + "entity_name": "(a) mmlongbench", + "entity_type": "DATASET_OR_CORPUS", + "description": "The first dataset evaluated in the left chart, focusing on long-context benchmarks.", + "source_ids": [ + 178 + ] + }, + { + "entity_name": "(b) qasper", + "entity_type": "DATASET_OR_CORPUS", + "description": "The second dataset evaluated in the right chart, likely referring to the Question Answering in Scientific Papers with Reasoning dataset.", + "source_ids": [ + 178 + ] + } + ], + "relations": [ + { + "src_entity_name": "cref='#/texts/259'", + "tgt_entity_name": "em / accuracy", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/259' related to EM / Accuracy", + "source_ids": [ + 178 + ] + }, + { + "src_entity_name": "cref='#/texts/259'", + "tgt_entity_name": "f1-score", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/259' related to F1-score", + "source_ids": [ + 178 + ] + }, + { + "src_entity_name": "cref='#/texts/259'", + "tgt_entity_name": "score", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/259' related to Score", + "source_ids": [ + 178 + ] + }, + { + "src_entity_name": "cref='#/texts/259'", + "tgt_entity_name": "single", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/259' related to Single", + "source_ids": [ + 178 + ] + }, + { + "src_entity_name": "cref='#/texts/259'", + "tgt_entity_name": "multi", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/259' related to Multi", + "source_ids": [ + 178 + ] + }, + { + "src_entity_name": "cref='#/texts/259'", + "tgt_entity_name": "global", + "relation_name": "", + "weight": 9.0, + 
"description": "Image entity cref='#/texts/259' related to Global", + "source_ids": [ + 178 + ] + }, + { + "src_entity_name": "cref='#/texts/259'", + "tgt_entity_name": "(a) mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/259' related to (a) MMLongBench", + "source_ids": [ + 178 + ] + }, + { + "src_entity_name": "cref='#/texts/259'", + "tgt_entity_name": "(b) qasper", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/259' related to (b) Qasper", + "source_ids": [ + 178 + ] + } + ], + "node_idx": 178 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_179.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_179.json new file mode 100644 index 0000000..ab198e3 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_179.json @@ -0,0 +1,225 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a system whose performance is being evaluated across different query types", + "source_ids": [ + 179 + ] + }, + { + "entity_name": "figure 7", + "entity_type": "IMAGE", + "description": "figure 7 is a visual representation that breaks down the performance of bookrag", + "source_ids": [ + 179 + ] + }, + { + "entity_name": "single hop", + "entity_type": "TASK_OR_PROBLEM", + "description": "single hop is a type of query used to evaluate the performance of bookrag", + "source_ids": [ + 179 + ] + }, + { + "entity_name": "multihop", + "entity_type": "TASK_OR_PROBLEM", + "description": "multihop is a type of query that presents a greater challenge compared to single hop queries", + "source_ids": [ + 179 + ] + }, + { + "entity_name": "global aggregation", + "entity_type": "TASK_OR_PROBLEM", + "description": "global aggregation is a type of query used to evaluate the performance of bookrag", + "source_ids": [ + 179 + ] + }, + { + "entity_name": "agent based planning strategy", + 
"entity_type": "METHOD_OR_TECHNIQUE", + "description": "the agent based planning strategy is a method used to handle different query types separately", + "source_ids": [ + 179 + ] + }, + { + "entity_name": "qa performance", + "entity_type": "TASK_OR_PROBLEM", + "description": "qa performance refers to the quality of answers generated which is analyzed under different query types", + "source_ids": [ + 179 + ] + }, + { + "entity_name": "query types", + "entity_type": "TASK_OR_PROBLEM", + "description": "query types are the categories of questions used to evaluate the system s performance", + "source_ids": [ + 179 + ] + }, + { + "entity_name": "disjoint pieces of evidence", + "entity_type": "DATASET_OR_CORPUS", + "description": "disjoint pieces of evidence are the fragmented information sources that make reasoning difficult", + "source_ids": [ + 179 + ] + }, + { + "entity_name": "retrieving", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "retrieving is the process of finding information identified as a challenge in the text", + "source_ids": [ + 179 + ] + }, + { + "entity_name": "reasoning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "reasoning is the cognitive process of drawing conclusions identified as a challenge in the text", + "source_ids": [ + 179 + ] + } + ], + "relations": [ + { + "src_entity_name": "figure 7", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "figure 7 displays the performance breakdown of bookrag", + "source_ids": [ + 179 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "single hop", + "relation_name": "", + "weight": 9.0, + "description": "bookrag s performance is evaluated against single hop queries", + "source_ids": [ + 179 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "multihop", + "relation_name": "", + "weight": 9.0, + "description": "bookrag s performance is evaluated against multihop queries which present a greater challenge", + 
"source_ids": [ + 179 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "global aggregation", + "relation_name": "", + "weight": 9.0, + "description": "bookrag s performance is evaluated against global aggregation queries", + "source_ids": [ + 179 + ] + }, + { + "src_entity_name": "multihop", + "tgt_entity_name": "agent based planning strategy", + "relation_name": "", + "weight": 8.0, + "description": "the agent based planning strategy is validated by its ability to handle multihop queries", + "source_ids": [ + 179 + ] + }, + { + "src_entity_name": "agent based planning strategy", + "tgt_entity_name": "single hop", + "relation_name": "", + "weight": 8.0, + "description": "the agent based planning strategy handles single hop queries separately", + "source_ids": [ + 179 + ] + }, + { + "src_entity_name": "agent based planning strategy", + "tgt_entity_name": "global aggregation", + "relation_name": "", + "weight": 8.0, + "description": "the agent based planning strategy handles global aggregation queries separately", + "source_ids": [ + 179 + ] + }, + { + "src_entity_name": "qa performance", + "tgt_entity_name": "query types", + "relation_name": "", + "weight": 9.0, + "description": "qa performance is measured under different query types", + "source_ids": [ + 179 + ] + }, + { + "src_entity_name": "qa performance", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "qa performance is the metric used to evaluate bookrag s capabilities", + "source_ids": [ + 179 + ] + }, + { + "src_entity_name": "multihop", + "tgt_entity_name": "disjoint pieces of evidence", + "relation_name": "", + "weight": 9.0, + "description": "multihop queries are challenging because they require retrieving and reasoning over disjoint pieces of evidence", + "source_ids": [ + 179 + ] + }, + { + "src_entity_name": "retrieving", + "tgt_entity_name": "disjoint pieces of evidence", + "relation_name": "", + "weight": 8.0, + "description": "retrieving is 
the action performed on disjoint pieces of evidence", + "source_ids": [ + 179 + ] + }, + { + "src_entity_name": "reasoning", + "tgt_entity_name": "disjoint pieces of evidence", + "relation_name": "", + "weight": 8.0, + "description": "reasoning is the action performed on disjoint pieces of evidence", + "source_ids": [ + 179 + ] + }, + { + "src_entity_name": "agent based planning strategy", + "tgt_entity_name": "query types", + "relation_name": "", + "weight": 9.0, + "description": "the agent based planning strategy is designed to handle different query types separately", + "source_ids": [ + 179 + ] + } + ], + "node_idx": 179 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_18.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_18.json new file mode 100644 index 0000000..2195657 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_18.json @@ -0,0 +1,319 @@ +{ + "entities": [ + { + "entity_name": "layout aware segmentation", + "entity_type": "TASK_OR_PROBLEM", + "description": "layout aware segmentation is a paradigm that parses documents into structured blocks to preserve original layout and information", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "docetl", + "entity_type": "SOFTWARE", + "description": "docetl is a state of the art method providing a declarative interface for defining llm based processing pipelines", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "llm", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "llm refers to large language models used in processing pipelines for analyzing retrieved blocks", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "multimodal retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "multimodal retrieval is a technique applied to obtain relevant content from blocks with multimodal characteristics", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "paragraphs", + "entity_type": "TASK_OR_PROBLEM", + 
"description": "paragraphs are structural blocks within a document preserved by layout aware segmentation", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "tables", + "entity_type": "TASK_OR_PROBLEM", + "description": "tables are structural blocks within a document preserved by layout aware segmentation", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "figures", + "entity_type": "TASK_OR_PROBLEM", + "description": "figures are structural blocks within a document preserved by layout aware segmentation", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "equations", + "entity_type": "TASK_OR_PROBLEM", + "description": "equations are structural blocks within a document preserved by layout aware segmentation", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "first paradigm", + "entity_type": "TASK_OR_PROBLEM", + "description": "the first paradigm is a method that uses fixed chunk sizes often leading to fragmented information", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "second paradigm", + "entity_type": "TASK_OR_PROBLEM", + "description": "the second paradigm refers to layout aware segmentation which preserves document structure", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "declarative interface", + "entity_type": "SOFTWARE", + "description": "the declarative interface is a feature provided by docetl that allows users to define processing pipelines", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "processing pipelines", + "entity_type": "TASK_OR_PROBLEM", + "description": "processing pipelines are sequences of operations defined by users to analyze retrieved blocks", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "llm based processing pipelines", + "entity_type": "TASK_OR_PROBLEM", + "description": "llm based processing pipelines are pipelines that utilize large language models for analysis", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "llm powered operations", + "entity_type": "TASK_OR_PROBLEM", + 
"description": "llm powered operations are the specific tasks combined within the processing pipelines", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "task specific optimizations", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "task specific optimizations are enhancements applied to the pipelines for specific tasks", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "fixed chunk size", + "entity_type": "MEASUREMENT", + "description": "fixed chunk size is a parameter used in the first paradigm that can cause information fragmentation", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "document native structural information", + "entity_type": "CONCEPT", + "description": "document native structural information is the data retained by layout aware segmentation", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "relevant content", + "entity_type": "CONCEPT", + "description": "relevant content is the information obtained through multimodal retrieval to answer queries", + "source_ids": [ + 18 + ] + }, + { + "entity_name": "queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "queries are the questions or requests for which relevant content is retrieved", + "source_ids": [ + 18 + ] + } + ], + "relations": [ + { + "src_entity_name": "layout aware segmentation", + "tgt_entity_name": "paragraphs", + "relation_name": "", + "weight": 9.0, + "description": "layout aware segmentation parses documents into paragraphs to preserve their structure", + "source_ids": [ + 18 + ] + }, + { + "src_entity_name": "layout aware segmentation", + "tgt_entity_name": "tables", + "relation_name": "", + "weight": 9.0, + "description": "layout aware segmentation parses documents into tables to preserve their structure", + "source_ids": [ + 18 + ] + }, + { + "src_entity_name": "layout aware segmentation", + "tgt_entity_name": "figures", + "relation_name": "", + "weight": 9.0, + "description": "layout aware segmentation parses documents into figures to preserve 
their structure", + "source_ids": [ + 18 + ] + }, + { + "src_entity_name": "layout aware segmentation", + "tgt_entity_name": "equations", + "relation_name": "", + "weight": 9.0, + "description": "layout aware segmentation parses documents into equations to preserve their structure", + "source_ids": [ + 18 + ] + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 10.0, + "description": "docetl uses llm powered operations to create processing pipelines", + "source_ids": [ + 18 + ] + }, + { + "src_entity_name": "layout aware segmentation", + "tgt_entity_name": "multimodal retrieval", + "relation_name": "", + "weight": 8.0, + "description": "multimodal retrieval is a typical approach applied to blocks generated by layout aware segmentation", + "source_ids": [ + 18 + ] + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "layout aware segmentation", + "relation_name": "", + "weight": 9.0, + "description": "docetl is a state of the art method within the category of layout aware segmentation", + "source_ids": [ + 18 + ] + }, + { + "src_entity_name": "second paradigm", + "tgt_entity_name": "layout aware segmentation", + "relation_name": "", + "weight": 10.0, + "description": "the second paradigm is identified as layout aware segmentation in the text", + "source_ids": [ + 18 + ] + }, + { + "src_entity_name": "first paradigm", + "tgt_entity_name": "fixed chunk size", + "relation_name": "", + "weight": 9.0, + "description": "the first paradigm uses a fixed chunk size which leads to fragmented information", + "source_ids": [ + 18 + ] + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "declarative interface", + "relation_name": "", + "weight": 10.0, + "description": "docetl provides a declarative interface for users", + "source_ids": [ + 18 + ] + }, + { + "src_entity_name": "declarative interface", + "tgt_entity_name": "processing pipelines", + "relation_name": "", + "weight": 9.0, + "description": "the 
declarative interface allows users to define processing pipelines", + "source_ids": [ + 18 + ] + }, + { + "src_entity_name": "processing pipelines", + "tgt_entity_name": "llm powered operations", + "relation_name": "", + "weight": 9.0, + "description": "processing pipelines consist of llm powered operations", + "source_ids": [ + 18 + ] + }, + { + "src_entity_name": "processing pipelines", + "tgt_entity_name": "task specific optimizations", + "relation_name": "", + "weight": 9.0, + "description": "processing pipelines include task specific optimizations", + "source_ids": [ + 18 + ] + }, + { + "src_entity_name": "layout aware segmentation", + "tgt_entity_name": "document native structural information", + "relation_name": "", + "weight": 9.0, + "description": "layout aware segmentation retains document native structural information", + "source_ids": [ + 18 + ] + }, + { + "src_entity_name": "multimodal retrieval", + "tgt_entity_name": "relevant content", + "relation_name": "", + "weight": 9.0, + "description": "multimodal retrieval is used to obtain relevant content", + "source_ids": [ + 18 + ] + }, + { + "src_entity_name": "multimodal retrieval", + "tgt_entity_name": "queries", + "relation_name": "", + "weight": 8.0, + "description": "multimodal retrieval is applied to answer queries", + "source_ids": [ + 18 + ] + } + ], + "node_idx": 18 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_180.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_180.json new file mode 100644 index 0000000..f7a27be --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_180.json @@ -0,0 +1,97 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a system whose performance bottlenecks are being diagnosed through error analysis", + "source_ids": [ + 180 + ] + }, + { + "entity_name": "figure 9", + "entity_type": "IMAGE", + "description": "figure 9 is a visual representation 
showing the error propagation traced during the analysis", + "source_ids": [ + 180 + ] + }, + { + "entity_name": "error response analysis", + "entity_type": "TASK_OR_PROBLEM", + "description": "error response analysis is the specific task conducted to diagnose performance bottlenecks", + "source_ids": [ + 180 + ] + }, + { + "entity_name": "200 sampled queries", + "entity_type": "MEASUREMENT", + "description": "200 sampled queries refers to the quantity of queries from each dataset used for the analysis", + "source_ids": [ + 180 + ] + }, + { + "entity_name": "four types", + "entity_type": "MEASUREMENT", + "description": "four types refers to the number of categories into which failures are classified", + "source_ids": [ + 180 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "figure 9", + "relation_name": "", + "weight": 9.0, + "description": "figure 9 illustrates the error propagation traced while diagnosing the performance bottlenecks of bookrag", + "source_ids": [ + 180 + ] + }, + { + "src_entity_name": "error response analysis", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "error response analysis is performed on bookrag to diagnose its performance bottlenecks", + "source_ids": [ + 180 + ] + }, + { + "src_entity_name": "error response analysis", + "tgt_entity_name": "200 sampled queries", + "relation_name": "", + "weight": 9.0, + "description": "the analysis is conducted on 200 sampled queries from each dataset", + "source_ids": [ + 180 + ] + }, + { + "src_entity_name": "error response analysis", + "tgt_entity_name": "figure 9", + "relation_name": "", + "weight": 9.0, + "description": "the analysis traces error propagation as shown in figure 9", + "source_ids": [ + 180 + ] + }, + { + "src_entity_name": "error response analysis", + "tgt_entity_name": "four types", + "relation_name": "", + "weight": 8.0, + "description": "the analysis categorizes failures into four types", + 
"source_ids": [ + 180 + ] + } + ], + "node_idx": 180 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_181.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_181.json new file mode 100644 index 0000000..9c9285c --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_181.json @@ -0,0 +1,169 @@ +{ + "entities": [ + { + "entity_name": "figure 8", + "entity_type": "IMAGE", + "description": "figure 8 is an image presenting a case study of responses across different query types", + "source_ids": [ + 181 + ] + }, + { + "entity_name": "mmlongbench", + "entity_type": "DATASET_OR_CORPUS", + "description": "mmlongbench is a dataset or benchmark used to generate query types in the case study", + "source_ids": [ + 181 + ] + }, + { + "entity_name": "qasper", + "entity_type": "DATASET_OR_CORPUS", + "description": "qasper is a dataset or benchmark used to generate query types in the case study", + "source_ids": [ + 181 + ] + }, + { + "entity_name": "bookrag", + "entity_type": "SOFTWARE", + "description": "bookrag is a system or software that generated correct content highlighted in cyan text", + "source_ids": [ + 181 + ] + }, + { + "entity_name": "cyan text", + "entity_type": "COLOR", + "description": "cyan text refers to the color used to highlight correct content generated by bookrag in the figure", + "source_ids": [ + 181 + ] + }, + { + "entity_name": "gray text", + "entity_type": "COLOR", + "description": "gray text refers to the color used to describe the internal process in the figure", + "source_ids": [ + 181 + ] + }, + { + "entity_name": "case study", + "entity_type": "EVENT", + "description": "case study is the specific analysis of responses across different query types presented in the text", + "source_ids": [ + 181 + ] + }, + { + "entity_name": "query types", + "entity_type": "TASK_OR_PROBLEM", + "description": "query types are the different categories of questions used to evaluate the responses in the case 
study", + "source_ids": [ + 181 + ] + }, + { + "entity_name": "internal process", + "entity_type": "TASK_OR_PROBLEM", + "description": "internal process refers to the underlying mechanisms described in the gray text", + "source_ids": [ + 181 + ] + } + ], + "relations": [ + { + "src_entity_name": "figure 8", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "figure 8 presents a case study involving responses from mmlongbench", + "source_ids": [ + 181 + ] + }, + { + "src_entity_name": "figure 8", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "figure 8 presents a case study involving responses from qasper", + "source_ids": [ + 181 + ] + }, + { + "src_entity_name": "figure 8", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "figure 8 highlights content generated by bookrag", + "source_ids": [ + 181 + ] + }, + { + "src_entity_name": "cyan text", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "cyan text highlights the content generated by bookrag", + "source_ids": [ + 181 + ] + }, + { + "src_entity_name": "gray text", + "tgt_entity_name": "internal process", + "relation_name": "", + "weight": 10.0, + "description": "gray text describes the internal process", + "source_ids": [ + 181 + ] + }, + { + "src_entity_name": "case study", + "tgt_entity_name": "query types", + "relation_name": "", + "weight": 9.0, + "description": "the case study analyzes responses across different query types", + "source_ids": [ + 181 + ] + }, + { + "src_entity_name": "case study", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "the case study uses responses from mmlongbench", + "source_ids": [ + 181 + ] + }, + { + "src_entity_name": "case study", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "the case study uses responses from qasper", + 
"source_ids": [ + 181 + ] + }, + { + "src_entity_name": "figure 8", + "tgt_entity_name": "case study", + "relation_name": "", + "weight": 10.0, + "description": "figure 8 presents the case study", + "source_ids": [ + 181 + ] + } + ], + "node_idx": 181 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_182.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_182.json new file mode 100644 index 0000000..a58a727 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_182.json @@ -0,0 +1,447 @@ +{ + "entities": [ + { + "entity_name": "bookrag response of different query types", + "entity_type": "IMAGE", + "description": "A document illustrating BookRAG's responses to three distinct query types: Single-hop, Multi-hop, and Global Aggregation cases.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "single-hop case from qasper", + "entity_type": "SECTION_TITLE", + "description": "The title of the first section detailing a single-hop query example involving a reward model for reinforcement learning.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "qasper", + "entity_type": "DATASET_OR_CORPUS", + "description": "The source dataset used for the single-hop and multi-hop case studies presented in the image.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "agent-based planning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A planning strategy described in the text where operators are selected to decompose or handle specific queries.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "select_by_entity operator", + "entity_type": "SOFTWARE", + "description": "An operator that identifies relevant sub-trees (e.g., Introduction, Related work) to prune the reasoning space.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "graph_reasoning", + "entity_type": "TASK_OR_PROBLEM", + "description": "A reasoning step performed after the Select_by_Entity operator focuses on a specific 
scope.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "text_reasoning", + "entity_type": "TASK_OR_PROBLEM", + "description": "A reasoning step involved in retrieving nodes for the final response.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "skyline_ranker", + "entity_type": "SOFTWARE", + "description": "An operator used to retrieve 8 nodes for the final response based on focused scope.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "binary reward system", + "entity_type": "TECHNOLOGY", + "description": "A system that evaluates the success or failure of dialog interactions with a discount factor.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "discount factor", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "A variable used in the reward model calculation, specifically noted as 0.95 in the text.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "multi-hop case from qasper", + "entity_type": "SECTION_TITLE", + "description": "The title of the second section detailing a multi-hop query comparing interpretable systems and LSTM models.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "interpretable system", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "A system type compared against LSTM-ELMo, utilizing vectors and cosine distance.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "lstm with elmo system", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "A machine learning model mentioned in the comparison, achieving an accuracy of 0.6818.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "lstm-elmo net", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "Another reference to the Long Short-Term Memory network combined with ELMo embeddings.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "table 1", + "entity_type": "TABLE", + "description": "A table referenced in the text containing experimental results.", + "source_ids": [ + 182 + ] + }, + { + 
"entity_name": "diacritic swapping", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A method mentioned as showing remarkably poor performance in the context of the experiment.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "cross-entropy", + "entity_type": "EVALUATION_METRIC", + "description": "The loss measure used for the test results in the multi-hop query analysis.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "decompose operator", + "entity_type": "SOFTWARE", + "description": "An operator used in Agent-based Planning for multi-hop queries to break down the question.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "global aggregation case from mmlongbench", + "entity_type": "SECTION_TITLE", + "description": "The title of the third section detailing a global query about counting charts in a document.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "mmlongbench", + "entity_type": "DATASET_OR_CORPUS", + "description": "The benchmark or dataset used for the global aggregation case study.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "filter operators", + "entity_type": "SOFTWARE", + "description": "Operators applied to filter data based on specific criteria like page range or modality.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "filter_range", + "entity_type": "SOFTWARE", + "description": "A filter operator specifying a range of pages (e.g., '1-10') to search within.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "filter_modal", + "entity_type": "SOFTWARE", + "description": "A filter operator specifying the modality of content, such as 'image'.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "reduce", + "entity_type": "SOFTWARE", + "description": "A process step that synthesizes the final output after analyzing images.", + "source_ids": [ + 182 + ] + }, + { + "entity_name": "image cref='#/texts/282'", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 182 + ] + } 
+ ], + "relations": [ + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "bookrag response of different query types", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to BookRAG response of different query types", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "single-hop case from qasper", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Single-hop Case from Qasper", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Qasper", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "agent-based planning", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Agent-based Planning", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "select_by_entity operator", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Select_by_Entity operator", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "graph_reasoning", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Graph_Reasoning", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "text_reasoning", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Text_Reasoning", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "skyline_ranker", + "relation_name": "", + "weight": 9.0, + 
"description": "Image entity Image cref='#/texts/282' related to Skyline_Ranker", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "binary reward system", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to binary reward system", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "discount factor", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to discount factor", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "multi-hop case from qasper", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Multi-hop Case from Qasper", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "interpretable system", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Interpretable system", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "lstm with elmo system", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to LSTM with ELMo system", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "lstm-elmo net", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to LSTM-ELMo net", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "table 1", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Table 1", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "diacritic 
swapping", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Diacritic swapping", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "cross-entropy", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to cross-entropy", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "decompose operator", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Decompose operator", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "global aggregation case from mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Global Aggregation Case from MMLongBench", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to MMLongBench", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "filter operators", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Filter operators", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "filter_range", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Filter_Range", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image cref='#/texts/282'", + "tgt_entity_name": "filter_modal", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Filter_Modal", + "source_ids": [ + 182 + ] + }, + { + "src_entity_name": "image 
cref='#/texts/282'", + "tgt_entity_name": "reduce", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/282' related to Reduce", + "source_ids": [ + 182 + ] + } + ], + "node_idx": 182 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_183.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_183.json new file mode 100644 index 0000000..5c4674b --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_183.json @@ -0,0 +1,107 @@ +{ + "entities": [ + { + "entity_name": "figure 9", + "entity_type": "IMAGE", + "description": "figure 9 is an image presenting an error analysis on sampled queries", + "source_ids": [ + 183 + ] + }, + { + "entity_name": "mmlongbench", + "entity_type": "DATASET_OR_CORPUS", + "description": "mmlongbench is a dataset from which 200 sampled queries were taken for error analysis", + "source_ids": [ + 183 + ] + }, + { + "entity_name": "qasper", + "entity_type": "DATASET_OR_CORPUS", + "description": "qasper is a dataset from which 200 sampled queries were taken for error analysis", + "source_ids": [ + 183 + ] + }, + { + "entity_name": "200", + "entity_type": "MEASUREMENT", + "description": "200 is the number of sampled queries used in the error analysis", + "source_ids": [ + 183 + ] + }, + { + "entity_name": "error analysis", + "entity_type": "TASK_OR_PROBLEM", + "description": "error analysis is the task being performed on the sampled queries from the datasets", + "source_ids": [ + 183 + ] + } + ], + "relations": [ + { + "src_entity_name": "figure 9", + "tgt_entity_name": "mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "figure 9 presents an error analysis on queries sampled from the mmlongbench dataset", + "source_ids": [ + 183 + ] + }, + { + "src_entity_name": "figure 9", + "tgt_entity_name": "qasper", + "relation_name": "", + "weight": 9.0, + "description": "figure 9 presents an error analysis on queries sampled from the 
qasper dataset", + "source_ids": [ + 183 + ] + }, + { + "src_entity_name": "figure 9", + "tgt_entity_name": "error analysis", + "relation_name": "", + "weight": 10.0, + "description": "figure 9 displays the results of the error analysis", + "source_ids": [ + 183 + ] + }, + { + "src_entity_name": "error analysis", + "tgt_entity_name": "200", + "relation_name": "", + "weight": 9.0, + "description": "the error analysis was conducted on 200 sampled queries", + "source_ids": [ + 183 + ] + }, + { + "src_entity_name": "mmlongbench", + "tgt_entity_name": "200", + "relation_name": "", + "weight": 8.0, + "description": "200 sampled queries were taken from the mmlongbench dataset", + "source_ids": [ + 183 + ] + }, + { + "src_entity_name": "qasper", + "tgt_entity_name": "200", + "relation_name": "", + "weight": 8.0, + "description": "200 sampled queries were taken from the qasper dataset", + "source_ids": [ + 183 + ] + } + ], + "node_idx": 183 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_184.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_184.json new file mode 100644 index 0000000..bc23369 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_184.json @@ -0,0 +1,285 @@ +{ + "entities": [ + { + "entity_name": "cref='#/texts/348'", + "entity_type": "IMAGE", + "description": "A figure containing two funnel diagrams comparing error analysis for the MMLongBench and Qasper datasets.", + "source_ids": [ + 184 + ] + }, + { + "entity_name": "(a) mmlongbench", + "entity_type": "DATASET_OR_CORPUS", + "description": "The left diagram illustrating the breakdown of query processing results for the MMLongBench dataset.", + "source_ids": [ + 184 + ] + }, + { + "entity_name": "(b) qasper", + "entity_type": "DATASET_OR_CORPUS", + "description": "The right diagram illustrating the breakdown of query processing results for the Qasper dataset.", + "source_ids": [ + 184 + ] + }, + { + "entity_name": "all queries (200)", + 
"entity_type": "MEASUREMENT", + "description": "The initial total number of queries processed in both the MMLongBench and Qasper experiments.", + "source_ids": [ + 184 + ] + }, + { + "entity_name": "successful parsing (194)", + "entity_type": "MEASUREMENT", + "description": "The count of queries that were successfully parsed within the MMLongBench experiment.", + "source_ids": [ + 184 + ] + }, + { + "entity_name": "retrieval error (52)", + "entity_type": "TASK_OR_PROBLEM", + "description": "The count of errors attributed to retrieval failures in the MMLongBench experiment.", + "source_ids": [ + 184 + ] + }, + { + "entity_name": "generation error (36)", + "entity_type": "TASK_OR_PROBLEM", + "description": "The count of errors attributed to generation failures in the MMLongBench experiment.", + "source_ids": [ + 184 + ] + }, + { + "entity_name": "plan error (27)", + "entity_type": "TASK_OR_PROBLEM", + "description": "The count of errors attributed to planning failures in the MMLongBench experiment.", + "source_ids": [ + 184 + ] + }, + { + "entity_name": "parsing error (6)", + "entity_type": "TASK_OR_PROBLEM", + "description": "The count of errors attributed to parsing failures in the MMLongBench experiment.", + "source_ids": [ + 184 + ] + }, + { + "entity_name": "correct (79)", + "entity_type": "EVALUATION_METRIC", + "description": "The final count of correctly answered queries in the MMLongBench experiment.", + "source_ids": [ + 184 + ] + }, + { + "entity_name": "successful parsing (193)", + "entity_type": "MEASUREMENT", + "description": "The count of queries that were successfully parsed within the Qasper experiment.", + "source_ids": [ + 184 + ] + }, + { + "entity_name": "generation error (30)", + "entity_type": "TASK_OR_PROBLEM", + "description": "The count of errors attributed to generation failures in the Qasper experiment.", + "source_ids": [ + 184 + ] + }, + { + "entity_name": "retrieval error (26)", + "entity_type": "TASK_OR_PROBLEM", + "description": "The 
count of errors attributed to retrieval failures in the Qasper experiment.", + "source_ids": [ + 184 + ] + }, + { + "entity_name": "plan error (20)", + "entity_type": "TASK_OR_PROBLEM", + "description": "The count of errors attributed to planning failures in the Qasper experiment.", + "source_ids": [ + 184 + ] + }, + { + "entity_name": "parsing error (7)", + "entity_type": "TASK_OR_PROBLEM", + "description": "The count of errors attributed to parsing failures in the Qasper experiment.", + "source_ids": [ + 184 + ] + }, + { + "entity_name": "correct (117)", + "entity_type": "EVALUATION_METRIC", + "description": "The final count of correctly answered queries in the Qasper experiment.", + "source_ids": [ + 184 + ] + } + ], + "relations": [ + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "(a) mmlongbench", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to (a) MMLongBench", + "source_ids": [ + 184 + ] + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "(b) qasper", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to (b) Qasper", + "source_ids": [ + 184 + ] + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "all queries (200)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to All Queries (200)", + "source_ids": [ + 184 + ] + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "successful parsing (194)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Successful Parsing (194)", + "source_ids": [ + 184 + ] + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "retrieval error (52)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Retrieval Error (52)", + "source_ids": [ + 184 + ] + }, + { + "src_entity_name": 
"cref='#/texts/348'", + "tgt_entity_name": "generation error (36)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Generation Error (36)", + "source_ids": [ + 184 + ] + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "plan error (27)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Plan Error (27)", + "source_ids": [ + 184 + ] + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "parsing error (6)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Parsing Error (6)", + "source_ids": [ + 184 + ] + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "correct (79)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Correct (79)", + "source_ids": [ + 184 + ] + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "successful parsing (193)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Successful Parsing (193)", + "source_ids": [ + 184 + ] + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "generation error (30)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Generation Error (30)", + "source_ids": [ + 184 + ] + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "retrieval error (26)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Retrieval Error (26)", + "source_ids": [ + 184 + ] + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "plan error (20)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Plan Error (20)", + "source_ids": [ + 184 + ] + }, + { + "src_entity_name": "cref='#/texts/348'", + 
"tgt_entity_name": "parsing error (7)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Parsing Error (7)", + "source_ids": [ + 184 + ] + }, + { + "src_entity_name": "cref='#/texts/348'", + "tgt_entity_name": "correct (117)", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/348' related to Correct (117)", + "source_ids": [ + 184 + ] + } + ], + "node_idx": 184 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_185.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_185.json new file mode 100644 index 0000000..042e0b4 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_185.json @@ -0,0 +1,331 @@ +{ + "entities": [ + { + "entity_name": "pdf parsing", + "entity_type": "TASK_OR_PROBLEM", + "description": "pdf parsing is identified as a task or problem area within the context of the study", + "source_ids": [ + 185 + ] + }, + { + "entity_name": "plan", + "entity_type": "TASK_OR_PROBLEM", + "description": "plan refers to the planning aspect of the process where errors are analyzed", + "source_ids": [ + 185 + ] + }, + { + "entity_name": "retrieval", + "entity_type": "TASK_OR_PROBLEM", + "description": "retrieval is a task or problem area where errors are identified as the dominant failure mode", + "source_ids": [ + 185 + ] + }, + { + "entity_name": "generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "generation is a task or problem area where errors are identified as the second most common failure mode", + "source_ids": [ + 185 + ] + }, + { + "entity_name": "retrieval error", + "entity_type": "TASK_OR_PROBLEM", + "description": "retrieval error is the dominant failure mode identified in the results", + "source_ids": [ + 185 + ] + }, + { + "entity_name": "generation error", + "entity_type": "TASK_OR_PROBLEM", + "description": "generation error is the second most common failure mode identified in the 
results", + "source_ids": [ + 185 + ] + }, + { + "entity_name": "plan error", + "entity_type": "TASK_OR_PROBLEM", + "description": "plan error is a specific failure pattern where the planner over decomposes queries", + "source_ids": [ + 185 + ] + }, + { + "entity_name": "multimodal evidence", + "entity_type": "TASK_OR_PROBLEM", + "description": "multimodal evidence is the type of information that is challenging to locate and synthesize", + "source_ids": [ + 185 + ] + }, + { + "entity_name": "single hop queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "single hop queries are detailed queries that are incorrectly decomposed by the planner", + "source_ids": [ + 185 + ] + }, + { + "entity_name": "multi hop sub tasks", + "entity_type": "TASK_OR_PROBLEM", + "description": "multi hop sub tasks are unnecessary tasks created by the over decomposition of single hop queries", + "source_ids": [ + 185 + ] + }, + { + "entity_name": "disjointed retrieval paths", + "entity_type": "TASK_OR_PROBLEM", + "description": "disjointed retrieval paths are the result of fragmentation preventing cohesive synthesis", + "source_ids": [ + 185 + ] + }, + { + "entity_name": "cohesive final answer", + "entity_type": "TASK_OR_PROBLEM", + "description": "cohesive final answer is the desired outcome that is prevented by disjointed retrieval paths", + "source_ids": [ + 185 + ] + }, + { + "entity_name": "model", + "entity_type": "TASK_OR_PROBLEM", + "description": "the model is the entity attempting to synthesize answers from sub responses", + "source_ids": [ + 185 + ] + }, + { + "entity_name": "planner", + "entity_type": "TASK_OR_PROBLEM", + "description": "the planner is the component that tends to over decompose queries", + "source_ids": [ + 185 + ] + }, + { + "entity_name": "qualitative analysis", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "qualitative analysis is the method used to reveal specific failure patterns", + "source_ids": [ + 185 + ] + }, + { + "entity_name": 
"results", + "entity_type": "TASK_OR_PROBLEM", + "description": "the results are the findings that identify retrieval error as the dominant failure mode", + "source_ids": [ + 185 + ] + }, + { + "entity_name": "fragmentation", + "entity_type": "TASK_OR_PROBLEM", + "description": "fragmentation is the process leading to disjointed retrieval paths", + "source_ids": [ + 185 + ] + }, + { + "entity_name": "scattered sub responses", + "entity_type": "TASK_OR_PROBLEM", + "description": "scattered sub responses are the outputs that fail to form a cohesive answer", + "source_ids": [ + 185 + ] + } + ], + "relations": [ + { + "src_entity_name": "retrieval error", + "tgt_entity_name": "retrieval", + "relation_name": "", + "weight": 10.0, + "description": "retrieval error is the dominant failure mode associated with the retrieval task", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "generation error", + "tgt_entity_name": "generation", + "relation_name": "", + "weight": 10.0, + "description": "generation error is the second most common failure mode associated with the generation task", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "plan error", + "tgt_entity_name": "plan", + "relation_name": "", + "weight": 10.0, + "description": "plan error is a specific failure pattern occurring within the plan task", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "retrieval error", + "tgt_entity_name": "multimodal evidence", + "relation_name": "", + "weight": 8.0, + "description": "retrieval error reflects the challenge of locating multimodal evidence", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "generation error", + "tgt_entity_name": "multimodal evidence", + "relation_name": "", + "weight": 8.0, + "description": "generation error reflects the challenge of synthesizing multimodal evidence", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "plan error", + "tgt_entity_name": "single hop queries", + "relation_name": "", + "weight": 9.0, 
+ "description": "plan error involves the over decomposition of single hop queries", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "plan error", + "tgt_entity_name": "multi hop sub tasks", + "relation_name": "", + "weight": 9.0, + "description": "plan error leads to the creation of unnecessary multi hop sub tasks", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "plan error", + "tgt_entity_name": "disjointed retrieval paths", + "relation_name": "", + "weight": 9.0, + "description": "plan error causes fragmentation leading to disjointed retrieval paths", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "disjointed retrieval paths", + "tgt_entity_name": "cohesive final answer", + "relation_name": "", + "weight": 9.0, + "description": "disjointed retrieval paths prevent the model from synthesizing a cohesive final answer", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "planner", + "tgt_entity_name": "plan error", + "relation_name": "", + "weight": 10.0, + "description": "the planner is the agent responsible for the plan error failure pattern", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "planner", + "tgt_entity_name": "single hop queries", + "relation_name": "", + "weight": 9.0, + "description": "the planner acts upon single hop queries by over decomposing them", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "qualitative analysis", + "tgt_entity_name": "plan error", + "relation_name": "", + "weight": 9.0, + "description": "qualitative analysis reveals the specific failure pattern of plan error", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "results", + "tgt_entity_name": "retrieval error", + "relation_name": "", + "weight": 10.0, + "description": "the results identify retrieval error as the dominant failure mode", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "results", + "tgt_entity_name": "generation error", + "relation_name": "", + "weight": 10.0, + "description": "the 
results identify generation error as the second most common failure mode", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "fragmentation", + "tgt_entity_name": "disjointed retrieval paths", + "relation_name": "", + "weight": 9.0, + "description": "fragmentation leads directly to disjointed retrieval paths", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "model", + "tgt_entity_name": "cohesive final answer", + "relation_name": "", + "weight": 9.0, + "description": "the model attempts to synthesize a cohesive final answer but is prevented from doing so", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "model", + "tgt_entity_name": "scattered sub responses", + "relation_name": "", + "weight": 8.0, + "description": "the model receives scattered sub responses which it fails to synthesize", + "source_ids": [ + 185 + ] + }, + { + "src_entity_name": "retrieval error", + "tgt_entity_name": "generation error", + "relation_name": "", + "weight": 7.0, + "description": "retrieval error is the dominant failure mode followed by generation error", + "source_ids": [ + 185 + ] + } + ], + "node_idx": 185 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_186.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_186.json new file mode 100644 index 0000000..454021a --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_186.json @@ -0,0 +1,225 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a system that utilizes specific operators to answer queries and prune search spaces", + "source_ids": [ + 186 + ] + }, + { + "entity_name": "figure 8", + "entity_type": "IMAGE", + "description": "figure 8 is an illustration depicting bookrag s answering workflow across different query types", + "source_ids": [ + 186 + ] + }, + { + "entity_name": "single hop", + "entity_type": "TASK_OR_PROBLEM", + "description": "single hop refers to a type of 
query case where the reasoning space is reduced from 134 to 24 nodes", + "source_ids": [ + 186 + ] + }, + { + "entity_name": "multi hop", + "entity_type": "TASK_OR_PROBLEM", + "description": "multi hop is a type of query case handled by bookrag s answering workflow", + "source_ids": [ + 186 + ] + }, + { + "entity_name": "global queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "global queries are a type of query case processed by bookrag s answering workflow", + "source_ids": [ + 186 + ] + }, + { + "entity_name": "select", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "select is a specific operator leveraged by bookrag to prune search spaces", + "source_ids": [ + 186 + ] + }, + { + "entity_name": "decompose", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "decompose is a specific operator leveraged by bookrag to prune search spaces", + "source_ids": [ + 186 + ] + }, + { + "entity_name": "filter", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "filter is a specific operator leveraged by bookrag to prune search spaces", + "source_ids": [ + 186 + ] + }, + { + "entity_name": "134", + "entity_type": "MEASUREMENT", + "description": "134 represents the initial number of nodes in the reasoning space for the single hop case", + "source_ids": [ + 186 + ] + }, + { + "entity_name": "24", + "entity_type": "MEASUREMENT", + "description": "24 represents the reduced number of nodes in the reasoning space for the single hop case", + "source_ids": [ + 186 + ] + }, + { + "entity_name": "case study", + "entity_type": "TASK_OR_PROBLEM", + "description": "case study is the context or type of analysis being presented in the text", + "source_ids": [ + 186 + ] + }, + { + "entity_name": "answering workflow", + "entity_type": "TASK_OR_PROBLEM", + "description": "answering workflow is the process illustrated by figure 8 that bookrag uses to handle queries", + "source_ids": [ + 186 + ] + }, + { + "entity_name": "search spaces", + "entity_type": 
"TASK_OR_PROBLEM", + "description": "search spaces are the areas that bookrag prunes using specific operators to improve efficiency", + "source_ids": [ + 186 + ] + }, + { + "entity_name": "relevant evidence", + "entity_type": "TASK_OR_PROBLEM", + "description": "relevant evidence is the specific information that bookrag isolates from noise", + "source_ids": [ + 186 + ] + }, + { + "entity_name": "noise", + "entity_type": "TASK_OR_PROBLEM", + "description": "noise refers to irrelevant data from which bookrag isolates relevant evidence", + "source_ids": [ + 186 + ] + }, + { + "entity_name": "precise answer generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "precise answer generation is the outcome ensured by bookrag s ability to isolate relevant evidence", + "source_ids": [ + 186 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "figure 8", + "relation_name": "", + "weight": 10.0, + "description": "figure 8 illustrates the answering workflow of bookrag", + "source_ids": [ + 186 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "single hop", + "relation_name": "", + "weight": 9.0, + "description": "bookrag processes single hop queries reducing the reasoning space significantly", + "source_ids": [ + 186 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "multi hop", + "relation_name": "", + "weight": 9.0, + "description": "bookrag processes multi hop queries as part of its answering workflow", + "source_ids": [ + 186 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "global queries", + "relation_name": "", + "weight": 9.0, + "description": "bookrag processes global queries as part of its answering workflow", + "source_ids": [ + 186 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "select", + "relation_name": "", + "weight": 10.0, + "description": "bookrag leverages the select operator to prune search spaces", + "source_ids": [ + 186 + ] + }, + { + 
"src_entity_name": "bookrag", + "tgt_entity_name": "decompose", + "relation_name": "", + "weight": 10.0, + "description": "bookrag leverages the decompose operator to prune search spaces", + "source_ids": [ + 186 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "filter", + "relation_name": "", + "weight": 10.0, + "description": "bookrag leverages the filter operator to prune search spaces", + "source_ids": [ + 186 + ] + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "134", + "relation_name": "", + "weight": 8.0, + "description": "the single hop case starts with a reasoning space of 134 nodes", + "source_ids": [ + 186 + ] + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "24", + "relation_name": "", + "weight": 8.0, + "description": "the single hop case reduces the reasoning space to 24 nodes", + "source_ids": [ + 186 + ] + } + ], + "node_idx": 186 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_187.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_187.json new file mode 100644 index 0000000..85bf648 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_187.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "7 conclusion", + "entity_type": "SECTION_TITLE", + "description": "As the final substantive section of the paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section summarizes the key contributions, specifically the BookRAG framework and BookIndex structure, and highlights the state-of-the-art performance achieved in retrieval recall and QA accuracy.", + "source_ids": [ + 187 + ] + } + ], + "relations": [], + "node_idx": 187 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_188.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_188.json new file mode 100644 index 0000000..9df72b1 --- /dev/null +++ 
b/e2e_test_output/kg_extractor_res/kg_extractor_res_188.json @@ -0,0 +1,277 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a novel method proposed in the paper built upon book index", + "source_ids": [ + 188 + ] + }, + { + "entity_name": "book index", + "entity_type": "PRODUCT", + "description": "book index is a document native structured tree graph index designed to capture intricate relations of structural documents", + "source_ids": [ + 188 + ] + }, + { + "entity_name": "agent based method", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "an agent based method is employed to dynamically configure retrieval and reasoning operators", + "source_ids": [ + 188 + ] + }, + { + "entity_name": "retrieval precision", + "entity_type": "EVALUATION_METRIC", + "description": "retrieval precision is a metric where the proposed approach demonstrates significant superiority over existing baselines", + "source_ids": [ + 188 + ] + }, + { + "entity_name": "answer accuracy", + "entity_type": "EVALUATION_METRIC", + "description": "answer accuracy is a metric where the proposed approach demonstrates significant superiority over existing baselines", + "source_ids": [ + 188 + ] + }, + { + "entity_name": "benchmarks", + "entity_type": "BENCHMARK", + "description": "benchmarks are multiple tests on which the approach achieves state of the art performance", + "source_ids": [ + 188 + ] + }, + { + "entity_name": "document native database system", + "entity_type": "PRODUCT", + "description": "a document native database system is a future exploration goal that supports data formatting knowledge extraction and intelligent querying", + "source_ids": [ + 188 + ] + }, + { + "entity_name": "paper", + "entity_type": "PUBLICATION_VENUE", + "description": "the paper is the document in which the bookrag method is proposed", + "source_ids": [ + 188 + ] + }, + { + "entity_name": "tree graph index", + "entity_type": "TECHNOLOGY", + 
"description": "the tree graph index is the specific structure of the book index document native system", + "source_ids": [ + 188 + ] + }, + { + "entity_name": "retrieval operators", + "entity_type": "SOFTWARE", + "description": "retrieval operators are components dynamically configured by the agent based method", + "source_ids": [ + 188 + ] + }, + { + "entity_name": "reasoning operators", + "entity_type": "SOFTWARE", + "description": "reasoning operators are components dynamically configured by the agent based method", + "source_ids": [ + 188 + ] + }, + { + "entity_name": "existing baselines", + "entity_type": "PRODUCT", + "description": "existing baselines are the current methods that bookrag outperforms in performance", + "source_ids": [ + 188 + ] + }, + { + "entity_name": "data formatting", + "entity_type": "TASK_OR_PROBLEM", + "description": "data formatting is a capability supported by the future document native database system", + "source_ids": [ + 188 + ] + }, + { + "entity_name": "knowledge extraction", + "entity_type": "TASK_OR_PROBLEM", + "description": "knowledge extraction is a capability supported by the future document native database system", + "source_ids": [ + 188 + ] + }, + { + "entity_name": "intelligent querying", + "entity_type": "TASK_OR_PROBLEM", + "description": "intelligent querying is a capability supported by the future document native database system", + "source_ids": [ + 188 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "book index", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is built upon book index", + "source_ids": [ + 188 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "agent based method", + "relation_name": "", + "weight": 9.0, + "description": "bookrag employs an agent based method to configure operators", + "source_ids": [ + 188 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "benchmarks", + "relation_name": "", + "weight": 9.0, 
+ "description": "bookrag achieves state of the art performance on multiple benchmarks", + "source_ids": [ + 188 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrieval precision", + "relation_name": "", + "weight": 8.0, + "description": "bookrag demonstrates significant superiority in retrieval precision over existing baselines", + "source_ids": [ + 188 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "answer accuracy", + "relation_name": "", + "weight": 8.0, + "description": "bookrag demonstrates significant superiority in answer accuracy over existing baselines", + "source_ids": [ + 188 + ] + }, + { + "src_entity_name": "agent based method", + "tgt_entity_name": "retrieval precision", + "relation_name": "", + "weight": 6.0, + "description": "the agent based method is used to configure operators that affect retrieval precision", + "source_ids": [ + 188 + ] + }, + { + "src_entity_name": "agent based method", + "tgt_entity_name": "answer accuracy", + "relation_name": "", + "weight": 6.0, + "description": "the agent based method is used to configure operators that affect answer accuracy", + "source_ids": [ + 188 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "paper", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is proposed within the paper", + "source_ids": [ + 188 + ] + }, + { + "src_entity_name": "book index", + "tgt_entity_name": "tree graph index", + "relation_name": "", + "weight": 9.0, + "description": "book index is specifically a structured tree graph index", + "source_ids": [ + 188 + ] + }, + { + "src_entity_name": "agent based method", + "tgt_entity_name": "retrieval operators", + "relation_name": "", + "weight": 9.0, + "description": "the agent based method dynamically configures retrieval operators", + "source_ids": [ + 188 + ] + }, + { + "src_entity_name": "agent based method", + "tgt_entity_name": "reasoning operators", + "relation_name": "", + "weight": 9.0, + 
"description": "the agent based method dynamically configures reasoning operators", + "source_ids": [ + 188 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "existing baselines", + "relation_name": "", + "weight": 9.0, + "description": "bookrag demonstrates significant superiority over existing baselines", + "source_ids": [ + 188 + ] + }, + { + "src_entity_name": "document native database system", + "tgt_entity_name": "data formatting", + "relation_name": "", + "weight": 8.0, + "description": "the future database system supports data formatting", + "source_ids": [ + 188 + ] + }, + { + "src_entity_name": "document native database system", + "tgt_entity_name": "knowledge extraction", + "relation_name": "", + "weight": 8.0, + "description": "the future database system supports knowledge extraction", + "source_ids": [ + 188 + ] + }, + { + "src_entity_name": "document native database system", + "tgt_entity_name": "intelligent querying", + "relation_name": "", + "weight": 8.0, + "description": "the future database system supports intelligent querying", + "source_ids": [ + 188 + ] + } + ], + "node_idx": 188 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_189.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_189.json new file mode 100644 index 0000000..c3a5b00 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_189.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "12", + "entity_type": "MEASUREMENT", + "description": "12 is a numerical value mentioned in the text potentially representing a count or measurement", + "source_ids": [ + 189 + ] + } + ], + "relations": [], + "node_idx": 189 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_19.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_19.json new file mode 100644 index 0000000..747f169 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_19.json @@ -0,0 
+1,329 @@ +{ + "entities": [ + { + "entity_name": "l1", + "entity_type": "TASK_OR_PROBLEM", + "description": "l1 is a limitation of existing works described as the failure to capture the deep connection of document structure and semantics", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "l2", + "entity_type": "TASK_OR_PROBLEM", + "description": "l2 is a limitation of existing works described as the static nature of query workflows", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "text based approaches", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "text based approaches are methods that cannot capture the structural layout of the document", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "layout segmented methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "layout segmented methods are approaches that preserve document structure but fail to capture relationships between different blocks", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "real world qa scenarios", + "entity_type": "EVENT", + "description": "real world qa scenarios are contexts where user queries are highly heterogeneous", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "static or manually predefined workflows", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "static or manually predefined workflows are uniform strategies applied to diverse query needs", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "question decomposition", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "question decomposition is a method required for complex queries", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "document", + "entity_type": "PRODUCT", + "description": "document is the object whose structure and semantics are being analyzed in the text", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "tables", + "entity_type": "TABLE", + "description": "tables are examples of hierarchical blocks nested within a specific section of a 
document", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "section", + "entity_type": "SECTION_TITLE", + "description": "section is a part of a document where tables may be nested", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "user queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "user queries are inputs in real world qa scenarios that range from simple to complex", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "keyword lookups", + "entity_type": "TASK_OR_PROBLEM", + "description": "keyword lookups are simple types of user queries mentioned in the text", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "multi hop questions", + "entity_type": "TASK_OR_PROBLEM", + "description": "multi hop questions are complex queries requiring evidence synthesis across different document parts", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "evidence", + "entity_type": "CONCEPT", + "description": "evidence refers to information scattered across different parts of a document needed for multi hop reasoning", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "hierarchical blocks", + "entity_type": "CONCEPT", + "description": "hierarchical blocks are structural elements of a document containing relationships", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "multi hop reasoning", + "entity_type": "TASK_OR_PROBLEM", + "description": "multi hop reasoning is the capability limited by methods that cannot capture relationships between document blocks", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "overall performance", + "entity_type": "EVALUATION_METRIC", + "description": "overall performance is the metric affected by the limitations of existing methods", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "complex queries", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 19 + ] + }, + { + "entity_name": "simple queries", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 19 + ] + } + ], 
+ "relations": [ + { + "src_entity_name": "l1", + "tgt_entity_name": "text based approaches", + "relation_name": "", + "weight": 9.0, + "description": "l1 is caused by text based approaches failing to capture document structure", + "source_ids": [ + 19 + ] + }, + { + "src_entity_name": "l2", + "tgt_entity_name": "static or manually predefined workflows", + "relation_name": "", + "weight": 9.0, + "description": "l2 is caused by the application of static or manually predefined workflows to diverse query needs", + "source_ids": [ + 19 + ] + }, + { + "src_entity_name": "text based approaches", + "tgt_entity_name": "l1", + "relation_name": "", + "weight": 9.0, + "description": "text based approaches suffer from the limitation l1", + "source_ids": [ + 19 + ] + }, + { + "src_entity_name": "layout segmented methods", + "tgt_entity_name": "l2", + "relation_name": "", + "weight": 8.0, + "description": "layout segmented methods contribute to the limitation l2 by failing to capture relationships between blocks", + "source_ids": [ + 19 + ] + }, + { + "src_entity_name": "real world qa scenarios", + "tgt_entity_name": "static or manually predefined workflows", + "relation_name": "", + "weight": 8.0, + "description": "real world qa scenarios involve diverse queries that make static or manually predefined workflows inefficient", + "source_ids": [ + 19 + ] + }, + { + "src_entity_name": "complex queries", + "tgt_entity_name": "question decomposition", + "relation_name": "", + "weight": 7.0, + "description": "complex queries often require question decomposition", + "source_ids": [ + 19 + ] + }, + { + "src_entity_name": "l1", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 9.0, + "description": "l1 concerns the failure to capture the deep connection of document structure and semantics", + "source_ids": [ + 19 + ] + }, + { + "src_entity_name": "text based approaches", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 8.0, + "description": "text 
based approaches analyze the document but fail to capture its structural layout", + "source_ids": [ + 19 + ] + }, + { + "src_entity_name": "tables", + "tgt_entity_name": "section", + "relation_name": "", + "weight": 9.0, + "description": "tables are nested within a specific section of the document", + "source_ids": [ + 19 + ] + }, + { + "src_entity_name": "layout segmented methods", + "tgt_entity_name": "hierarchical blocks", + "relation_name": "", + "weight": 8.0, + "description": "layout segmented methods preserve hierarchical blocks but fail to capture relationships between them", + "source_ids": [ + 19 + ] + }, + { + "src_entity_name": "layout segmented methods", + "tgt_entity_name": "multi hop reasoning", + "relation_name": "", + "weight": 8.0, + "description": "layout segmented methods limit the capability for multi hop reasoning across blocks", + "source_ids": [ + 19 + ] + }, + { + "src_entity_name": "user queries", + "tgt_entity_name": "real world qa scenarios", + "relation_name": "", + "weight": 9.0, + "description": "user queries are the inputs found within real world qa scenarios", + "source_ids": [ + 19 + ] + }, + { + "src_entity_name": "user queries", + "tgt_entity_name": "keyword lookups", + "relation_name": "", + "weight": 8.0, + "description": "keyword lookups are a type of user query mentioned in the text", + "source_ids": [ + 19 + ] + }, + { + "src_entity_name": "user queries", + "tgt_entity_name": "multi hop questions", + "relation_name": "", + "weight": 8.0, + "description": "multi hop questions are a type of user query mentioned in the text", + "source_ids": [ + 19 + ] + }, + { + "src_entity_name": "multi hop questions", + "tgt_entity_name": "evidence", + "relation_name": "", + "weight": 9.0, + "description": "multi hop questions require synthesizing evidence scattered across the document", + "source_ids": [ + 19 + ] + }, + { + "src_entity_name": "static or manually predefined workflows", + "tgt_entity_name": "overall performance", + 
"relation_name": "", + "weight": 7.0, + "description": "applying static workflows to diverse needs affects the overall performance negatively", + "source_ids": [ + 19 + ] + }, + { + "src_entity_name": "simple queries", + "tgt_entity_name": "question decomposition", + "relation_name": "", + "weight": 7.0, + "description": "simple queries do not require question decomposition", + "source_ids": [ + 19 + ] + } + ], + "node_idx": 19 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_190.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_190.json new file mode 100644 index 0000000..64825ab --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_190.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "references", + "entity_type": "SECTION_TITLE", + "description": "As a top-level section following the main title 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section serves as the bibliography, listing all cited works and sources that support the research presented in the paper.", + "source_ids": [ + 190 + ] + } + ], + "relations": [], + "node_idx": 190 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_191.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_191.json new file mode 100644 index 0000000..0b90b1f --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_191.json @@ -0,0 +1,463 @@ +{ + "entities": [ + { + "entity_name": "simran arora", + "entity_type": "PERSON", + "description": "simran arora is listed as one of the authors of the paper", + "source_ids": [ + 191 + ] + }, + { + "entity_name": "brandon yang", + "entity_type": "PERSON", + "description": "brandon yang is listed as one of the authors of the paper", + "source_ids": [ + 191 + ] + }, + { + "entity_name": "sabri eyuboglu", + "entity_type": "PERSON", + "description": "sabri eyuboglu is listed as 
one of the authors of the paper", + "source_ids": [ + 191 + ] + }, + { + "entity_name": "avanika narayan", + "entity_type": "PERSON", + "description": "avanika narayan is listed as one of the authors of the paper", + "source_ids": [ + 191 + ] + }, + { + "entity_name": "andrew hojel", + "entity_type": "PERSON", + "description": "andrew hojel is listed as one of the authors of the paper", + "source_ids": [ + 191 + ] + }, + { + "entity_name": "immanuel trummer", + "entity_type": "PERSON", + "description": "immanuel trummer is listed as one of the authors of the paper", + "source_ids": [ + 191 + ] + }, + { + "entity_name": "christopher r", + "entity_type": "PERSON", + "description": "christopher r is listed as one of the authors of the paper", + "source_ids": [ + 191 + ] + }, + { + "entity_name": "language models", + "entity_type": "TECHNOLOGY", + "description": "language models are the technology enabling the simple systems described in the paper", + "source_ids": [ + 191 + ] + }, + { + "entity_name": "heterogeneous data lakes", + "entity_type": "DATASET_OR_CORPUS", + "description": "heterogeneous data lakes are the type of data being structured by the systems in the paper", + "source_ids": [ + 191 + ] + }, + { + "entity_name": "proceedings of the vldb endowment", + "entity_type": "PUBLICATION_VENUE", + "description": "proceedings of the vldb endowment is the publication venue where the paper appeared", + "source_ids": [ + 191 + ] + }, + { + "entity_name": "17", + "entity_type": "MEASUREMENT", + "description": "17 refers to the volume number of the publication", + "source_ids": [ + 191 + ] + }, + { + "entity_name": "2", + "entity_type": "MEASUREMENT", + "description": "2 refers to the issue number of the publication", + "source_ids": [ + 191 + ] + }, + { + "entity_name": "simple systems", + "entity_type": "PRODUCT", + "description": "simple systems are the systems generated by language models as described in the paper title", + "source_ids": [ + 191 + ] + }, + { + 
"entity_name": "structured views", + "entity_type": "PRODUCT", + "description": "structured views are the output generated for heterogeneous data lakes in the paper", + "source_ids": [ + 191 + ] + }, + { + "entity_name": "vldb endowment", + "entity_type": "ORGANIZATION", + "description": "vldb endowment is the organization associated with the publication venue", + "source_ids": [ + 191 + ] + }, + { + "entity_name": "2023", + "entity_type": "DATE", + "description": "2023 is the year of publication mentioned in the citation", + "source_ids": [ + 191 + ] + }, + { + "entity_name": "92 105", + "entity_type": "MEASUREMENT", + "description": "92 105 represents the page range of the article", + "source_ids": [ + 191 + ] + } + ], + "relations": [ + { + "src_entity_name": "simran arora", + "tgt_entity_name": "brandon yang", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "sabri eyuboglu", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "avanika narayan", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "andrew hojel", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "immanuel trummer", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "christopher r", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + 
"source_ids": [ + 191 + ] + }, + { + "src_entity_name": "brandon yang", + "tgt_entity_name": "sabri eyuboglu", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "brandon yang", + "tgt_entity_name": "avanika narayan", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "brandon yang", + "tgt_entity_name": "andrew hojel", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "brandon yang", + "tgt_entity_name": "immanuel trummer", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "brandon yang", + "tgt_entity_name": "christopher r", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "sabri eyuboglu", + "tgt_entity_name": "avanika narayan", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "sabri eyuboglu", + "tgt_entity_name": "andrew hojel", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "sabri eyuboglu", + "tgt_entity_name": "immanuel trummer", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "sabri eyuboglu", + "tgt_entity_name": "christopher r", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "avanika narayan", + 
"tgt_entity_name": "andrew hojel", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "avanika narayan", + "tgt_entity_name": "immanuel trummer", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "avanika narayan", + "tgt_entity_name": "christopher r", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "andrew hojel", + "tgt_entity_name": "immanuel trummer", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "andrew hojel", + "tgt_entity_name": "christopher r", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "immanuel trummer", + "tgt_entity_name": "christopher r", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "2023", + "relation_name": "", + "weight": 8.0, + "description": "simran arora is an author of a paper published in 2023", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "simran arora is an author of a paper published in the proceedings of the vldb endowment", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "language models", + "tgt_entity_name": "heterogeneous data lakes", + "relation_name": "", + "weight": 10.0, + "description": "language models enable the generation of structured views of heterogeneous data lakes", + "source_ids": [ + 191 
+ ] + }, + { + "src_entity_name": "language models", + "tgt_entity_name": "simple systems", + "relation_name": "", + "weight": 10.0, + "description": "language models enable the creation of simple systems", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "simple systems", + "tgt_entity_name": "structured views", + "relation_name": "", + "weight": 9.0, + "description": "simple systems are used for generating structured views", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "simple systems", + "tgt_entity_name": "heterogeneous data lakes", + "relation_name": "", + "weight": 9.0, + "description": "simple systems generate views of heterogeneous data lakes", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "simple systems", + "relation_name": "", + "weight": 8.0, + "description": "simran arora is an author of the paper describing simple systems", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "structured views", + "relation_name": "", + "weight": 8.0, + "description": "simran arora is an author of the paper describing structured views", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "simran arora", + "tgt_entity_name": "heterogeneous data lakes", + "relation_name": "", + "weight": 8.0, + "description": "simran arora is an author of the paper describing heterogeneous data lakes", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "proceedings of the vldb endowment", + "tgt_entity_name": "vldb endowment", + "relation_name": "", + "weight": 9.0, + "description": "proceedings of the vldb endowment is published by the vldb endowment organization", + "source_ids": [ + 191 + ] + }, + { + "src_entity_name": "2023", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "the proceedings of the vldb endowment volume 17 issue 2 was published in 2023", + "source_ids": [ + 191 + ] + } + ], + 
"node_idx": 191 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_192.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_192.json new file mode 100644 index 0000000..b61c31c --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_192.json @@ -0,0 +1,231 @@ +{ + "entities": [ + { + "entity_name": "akari asai", + "entity_type": "PERSON", + "description": "akari asai is listed as an author of the paper titled self rag", + "source_ids": [ + 192 + ] + }, + { + "entity_name": "zeqiu wu", + "entity_type": "PERSON", + "description": "zeqiu wu is listed as an author of the paper titled self rag", + "source_ids": [ + 192 + ] + }, + { + "entity_name": "yizhong wang", + "entity_type": "PERSON", + "description": "yizhong wang is listed as an author of the paper titled self rag", + "source_ids": [ + 192 + ] + }, + { + "entity_name": "self rag", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "self rag is a method for learning to retrieve generate and critique through self reflection", + "source_ids": [ + 192 + ] + }, + { + "entity_name": "international conference on learning representations", + "entity_type": "PUBLICATION_VENUE", + "description": "international conference on learning representations iclr is the venue where the paper was published", + "source_ids": [ + 192 + ] + }, + { + "entity_name": "2024", + "entity_type": "DATE", + "description": "2024 is the year the paper was published", + "source_ids": [ + 192 + ] + }, + { + "entity_name": "et al", + "entity_type": "PERSON", + "description": "et al refers to additional authors of the paper not explicitly named in the text", + "source_ids": [ + 192 + ] + }, + { + "entity_name": "iclr", + "entity_type": "PUBLICATION_VENUE", + "description": "iclr is the abbreviation for the international conference on learning representations where the paper was published", + "source_ids": [ + 192 + ] + } + ], + "relations": [ + { + "src_entity_name": "akari asai", + 
"tgt_entity_name": "self rag", + "relation_name": "", + "weight": 9.0, + "description": "akari asai is an author of the paper describing the self rag model", + "source_ids": [ + 192 + ] + }, + { + "src_entity_name": "zeqiu wu", + "tgt_entity_name": "self rag", + "relation_name": "", + "weight": 9.0, + "description": "zeqiu wu is an author of the paper describing the self rag model", + "source_ids": [ + 192 + ] + }, + { + "src_entity_name": "yizhong wang", + "tgt_entity_name": "self rag", + "relation_name": "", + "weight": 9.0, + "description": "yizhong wang is an author of the paper describing the self rag model", + "source_ids": [ + 192 + ] + }, + { + "src_entity_name": "et al", + "tgt_entity_name": "self rag", + "relation_name": "", + "weight": 8.0, + "description": "et al refers to co authors of the paper describing the self rag model", + "source_ids": [ + 192 + ] + }, + { + "src_entity_name": "self rag", + "tgt_entity_name": "international conference on learning representations", + "relation_name": "", + "weight": 10.0, + "description": "the self rag paper was published at the international conference on learning representations", + "source_ids": [ + 192 + ] + }, + { + "src_entity_name": "akari asai", + "tgt_entity_name": "international conference on learning representations", + "relation_name": "", + "weight": 8.0, + "description": "akari asai s paper was published at the international conference on learning representations", + "source_ids": [ + 192 + ] + }, + { + "src_entity_name": "zeqiu wu", + "tgt_entity_name": "international conference on learning representations", + "relation_name": "", + "weight": 8.0, + "description": "zeqiu wu s paper was published at the international conference on learning representations", + "source_ids": [ + 192 + ] + }, + { + "src_entity_name": "yizhong wang", + "tgt_entity_name": "international conference on learning representations", + "relation_name": "", + "weight": 8.0, + "description": "yizhong wang s paper was published at 
the international conference on learning representations", + "source_ids": [ + 192 + ] + }, + { + "src_entity_name": "et al", + "tgt_entity_name": "international conference on learning representations", + "relation_name": "", + "weight": 7.0, + "description": "the co authors referred to as et al published their paper at the international conference on learning representations", + "source_ids": [ + 192 + ] + }, + { + "src_entity_name": "akari asai", + "tgt_entity_name": "zeqiu wu", + "relation_name": "", + "weight": 8.0, + "description": "akari asai and zeqiu wu are co authors on the same paper", + "source_ids": [ + 192 + ] + }, + { + "src_entity_name": "akari asai", + "tgt_entity_name": "yizhong wang", + "relation_name": "", + "weight": 8.0, + "description": "akari asai and yizhong wang are co authors on the same paper", + "source_ids": [ + 192 + ] + }, + { + "src_entity_name": "akari asai", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "akari asai is listed alongside other authors et al on the same paper", + "source_ids": [ + 192 + ] + }, + { + "src_entity_name": "zeqiu wu", + "tgt_entity_name": "yizhong wang", + "relation_name": "", + "weight": 8.0, + "description": "zeqiu wu and yizhong wang are co authors on the same paper", + "source_ids": [ + 192 + ] + }, + { + "src_entity_name": "zeqiu wu", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "zeqiu wu is listed alongside other authors et al on the same paper", + "source_ids": [ + 192 + ] + }, + { + "src_entity_name": "yizhong wang", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "yizhong wang is listed alongside other authors et al on the same paper", + "source_ids": [ + 192 + ] + }, + { + "src_entity_name": "international conference on learning representations", + "tgt_entity_name": "iclr", + "relation_name": "", + "weight": 10.0, + "description": "iclr is the abbreviation used for the 
international conference on learning representations in the text", + "source_ids": [ + 192 + ] + } + ], + "node_idx": 192 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_193.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_193.json new file mode 100644 index 0000000..c34ca25 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_193.json @@ -0,0 +1,277 @@ +{ + "entities": [ + { + "entity_name": "akari asai", + "entity_type": "PERSON", + "description": "akari asai is one of the authors of the 2023 paper titled self rag", + "source_ids": [ + 193 + ] + }, + { + "entity_name": "zeqiu wu", + "entity_type": "PERSON", + "description": "zeqiu wu is one of the authors of the 2023 paper titled self rag", + "source_ids": [ + 193 + ] + }, + { + "entity_name": "yizhong wang", + "entity_type": "PERSON", + "description": "yizhong wang is one of the authors of the 2023 paper titled self rag", + "source_ids": [ + 193 + ] + }, + { + "entity_name": "avirup sil", + "entity_type": "PERSON", + "description": "avirup sil is one of the authors of the 2023 paper titled self rag", + "source_ids": [ + 193 + ] + }, + { + "entity_name": "hannaneh hajishirzi", + "entity_type": "PERSON", + "description": "hannaneh hajishirzi is one of the authors of the 2023 paper titled self rag", + "source_ids": [ + 193 + ] + }, + { + "entity_name": "self rag", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "self rag is a method described in the text for learning to retrieve generate and critique through self reflection", + "source_ids": [ + 193 + ] + }, + { + "entity_name": "2023", + "entity_type": "DATE", + "description": "2023 is the year the paper self rag was published", + "source_ids": [ + 193 + ] + }, + { + "entity_name": "arxiv preprint arxiv 2310 11511", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv preprint arxiv 2310 11511 is the specific identifier and venue for the publication of the paper", + 
"source_ids": [ + 193 + ] + }, + { + "entity_name": "learning to retrieve generate and critique through self reflection", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "this is the specific technique described in the text that the self rag model learns to perform", + "source_ids": [ + 193 + ] + }, + { + "entity_name": "arxiv", + "entity_type": "ORGANIZATION", + "description": "arxiv is the organization or platform hosting the preprint mentioned in the text", + "source_ids": [ + 193 + ] + } + ], + "relations": [ + { + "src_entity_name": "akari asai", + "tgt_entity_name": "self rag", + "relation_name": "", + "weight": 9.0, + "description": "akari asai is an author of the paper describing the self rag model", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "zeqiu wu", + "tgt_entity_name": "self rag", + "relation_name": "", + "weight": 9.0, + "description": "zeqiu wu is an author of the paper describing the self rag model", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "yizhong wang", + "tgt_entity_name": "self rag", + "relation_name": "", + "weight": 9.0, + "description": "yizhong wang is an author of the paper describing the self rag model", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "avirup sil", + "tgt_entity_name": "self rag", + "relation_name": "", + "weight": 9.0, + "description": "avirup sil is an author of the paper describing the self rag model", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "hannaneh hajishirzi", + "tgt_entity_name": "self rag", + "relation_name": "", + "weight": 9.0, + "description": "hannaneh hajishirzi is an author of the paper describing the self rag model", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "akari asai", + "tgt_entity_name": "zeqiu wu", + "relation_name": "", + "weight": 8.0, + "description": "akari asai and zeqiu wu are co authors on the same paper", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "akari asai", + "tgt_entity_name": "yizhong 
wang", + "relation_name": "", + "weight": 8.0, + "description": "akari asai and yizhong wang are co authors on the same paper", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "akari asai", + "tgt_entity_name": "avirup sil", + "relation_name": "", + "weight": 8.0, + "description": "akari asai and avirup sil are co authors on the same paper", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "akari asai", + "tgt_entity_name": "hannaneh hajishirzi", + "relation_name": "", + "weight": 8.0, + "description": "akari asai and hannaneh hajishirzi are co authors on the same paper", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "zeqiu wu", + "tgt_entity_name": "yizhong wang", + "relation_name": "", + "weight": 8.0, + "description": "zeqiu wu and yizhong wang are co authors on the same paper", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "zeqiu wu", + "tgt_entity_name": "avirup sil", + "relation_name": "", + "weight": 8.0, + "description": "zeqiu wu and avirup sil are co authors on the same paper", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "zeqiu wu", + "tgt_entity_name": "hannaneh hajishirzi", + "relation_name": "", + "weight": 8.0, + "description": "zeqiu wu and hannaneh hajishirzi are co authors on the same paper", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "yizhong wang", + "tgt_entity_name": "avirup sil", + "relation_name": "", + "weight": 8.0, + "description": "yizhong wang and avirup sil are co authors on the same paper", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "yizhong wang", + "tgt_entity_name": "hannaneh hajishirzi", + "relation_name": "", + "weight": 8.0, + "description": "yizhong wang and hannaneh hajishirzi are co authors on the same paper", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "avirup sil", + "tgt_entity_name": "hannaneh hajishirzi", + "relation_name": "", + "weight": 8.0, + "description": "avirup sil and hannaneh hajishirzi are co authors on the same 
paper", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "self rag", + "tgt_entity_name": "2023", + "relation_name": "", + "weight": 10.0, + "description": "the self rag paper was published in the year 2023", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "self rag", + "tgt_entity_name": "arxiv preprint arxiv 2310 11511", + "relation_name": "", + "weight": 10.0, + "description": "the self rag paper is identified by the arxiv preprint number arxiv 2310 11511", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "self rag", + "tgt_entity_name": "learning to retrieve generate and critique through self reflection", + "relation_name": "", + "weight": 10.0, + "description": "self rag is the model that implements the method of learning to retrieve generate and critique through self reflection", + "source_ids": [ + 193 + ] + }, + { + "src_entity_name": "arxiv preprint arxiv 2310 11511", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "the preprint is hosted by the arxiv organization", + "source_ids": [ + 193 + ] + } + ], + "node_idx": 193 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_194.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_194.json new file mode 100644 index 0000000..af95287 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_194.json @@ -0,0 +1,501 @@ +{ + "entities": [ + { + "entity_name": "shuai bai", + "entity_type": "PERSON", + "description": "shuai bai is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "entity_name": "keqin chen", + "entity_type": "PERSON", + "description": "keqin chen is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "entity_name": "xuejing liu", + "entity_type": "PERSON", + "description": "xuejing liu is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ 
+ 194 + ] + }, + { + "entity_name": "jialin wang", + "entity_type": "PERSON", + "description": "jialin wang is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "entity_name": "wenbin ge", + "entity_type": "PERSON", + "description": "wenbin ge is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "entity_name": "sibo song", + "entity_type": "PERSON", + "description": "sibo song is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "entity_name": "kai dang", + "entity_type": "PERSON", + "description": "kai dang is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "entity_name": "peng wang", + "entity_type": "PERSON", + "description": "peng wang is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "entity_name": "shijie wang", + "entity_type": "PERSON", + "description": "shijie wang is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "entity_name": "jun tang", + "entity_type": "PERSON", + "description": "jun tang is listed as one of the authors of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "entity_name": "qwen2 5 vl technical report", + "entity_type": "PUBLICATION_VENUE", + "description": "qwen2 5 vl technical report is the title of the document authored by the listed individuals", + "source_ids": [ + 194 + ] + }, + { + "entity_name": "arxiv", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv is the preprint server where the qwen2 5 vl technical report was published", + "source_ids": [ + 194 + ] + }, + { + "entity_name": "2025", + "entity_type": "DATE", + "description": "2025 is the year the qwen2 5 vl technical report was published", + "source_ids": [ + 194 + ] + }, + { + "entity_name": "arxiv 2502 
13923", + "entity_type": "FILE_TYPE", + "description": "arxiv 2502 13923 is the specific identifier for the preprint document", + "source_ids": [ + 194 + ] + }, + { + "entity_name": "et al", + "entity_type": "PERSON", + "description": "et al indicates additional authors not explicitly listed in the text", + "source_ids": [ + 194 + ] + }, + { + "entity_name": "qwen2 5 vl", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "qwen2 5 vl is the specific model or architecture discussed in the technical report", + "source_ids": [ + 194 + ] + }, + { + "entity_name": "technical report", + "entity_type": "PUBLICATION_VENUE", + "description": "technical report describes the type of document being referenced", + "source_ids": [ + 194 + ] + }, + { + "entity_name": "preprint", + "entity_type": "FILE_TYPE", + "description": "preprint indicates the document is a preliminary version of a research paper", + "source_ids": [ + 194 + ] + } + ], + "relations": [ + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "shuai bai is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "keqin chen", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "keqin chen is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "xuejing liu", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "xuejing liu is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "jialin wang", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "jialin wang is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "wenbin ge", + 
"tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "wenbin ge is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "sibo song", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "sibo song is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "kai dang", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "kai dang is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "peng wang", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "peng wang is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "shijie wang", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "shijie wang is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "jun tang", + "tgt_entity_name": "qwen2 5 vl technical report", + "relation_name": "", + "weight": 10.0, + "description": "jun tang is an author of the qwen2 5 vl technical report", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "qwen2 5 vl technical report", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "the qwen2 5 vl technical report was published as a preprint on arxiv", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "qwen2 5 vl technical report", + "tgt_entity_name": "2025", + "relation_name": "", + "weight": 9.0, + "description": "the qwen2 5 vl technical report was published in the year 2025", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "qwen2 5 vl technical report", + "tgt_entity_name": "arxiv 2502 13923", + 
"relation_name": "", + "weight": 9.0, + "description": "the qwen2 5 vl technical report is identified by the preprint number arxiv 2502 13923", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "keqin chen", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and keqin chen are co authors of the same document", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "xuejing liu", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and xuejing liu are co authors of the same document", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "jialin wang", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and jialin wang are co authors of the same document", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "wenbin ge", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and wenbin ge are co authors of the same document", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "sibo song", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and sibo song are co authors of the same document", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "kai dang", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and kai dang are co authors of the same document", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "peng wang", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and peng wang are co authors of the same document", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "shijie wang", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and shijie wang are co authors of the same document", + "source_ids": [ + 194 + ] + }, + { + 
"src_entity_name": "shuai bai", + "tgt_entity_name": "jun tang", + "relation_name": "", + "weight": 8.0, + "description": "shuai bai and jun tang are co authors of the same document", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "qwen2 5 vl technical report", + "tgt_entity_name": "qwen2 5 vl", + "relation_name": "", + "weight": 10.0, + "description": "the report is about the qwen2 5 vl model architecture", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "qwen2 5 vl technical report", + "tgt_entity_name": "technical report", + "relation_name": "", + "weight": 9.0, + "description": "the document is identified as a technical report", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "arxiv", + "tgt_entity_name": "preprint", + "relation_name": "", + "weight": 8.0, + "description": "arxiv is a platform for preprints", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "shuai bai", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "shuai bai is the first author listed before et al", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "keqin chen", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "keqin chen is listed as an author before et al", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "xuejing liu", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "xuejing liu is listed as an author before et al", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "jialin wang", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "jialin wang is listed as an author before et al", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "wenbin ge", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "wenbin ge is listed as an author before et al", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "sibo song", + "tgt_entity_name": 
"et al", + "relation_name": "", + "weight": 7.0, + "description": "sibo song is listed as an author before et al", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "kai dang", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "kai dang is listed as an author before et al", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "peng wang", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "peng wang is listed as an author before et al", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "shijie wang", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "shijie wang is listed as an author before et al", + "source_ids": [ + 194 + ] + }, + { + "src_entity_name": "jun tang", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 7.0, + "description": "jun tang is listed as an author before et al", + "source_ids": [ + 194 + ] + } + ], + "node_idx": 194 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_195.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_195.json new file mode 100644 index 0000000..92a1fa1 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_195.json @@ -0,0 +1,271 @@ +{ + "entities": [ + { + "entity_name": "camille barboule", + "entity_type": "PERSON", + "description": "camille barboule is one of the authors of the 2025 survey on question answering over visually rich documents", + "source_ids": [ + 195 + ] + }, + { + "entity_name": "benjamin piwowarski", + "entity_type": "PERSON", + "description": "benjamin piwowarski is one of the authors of the 2025 survey on question answering over visually rich documents", + "source_ids": [ + 195 + ] + }, + { + "entity_name": "yoan chabot", + "entity_type": "PERSON", + "description": "yoan chabot is one of the authors of the 2025 survey on question answering over visually rich documents", + "source_ids": [ + 195 
+ ] + }, + { + "entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "entity_type": "BOOK", + "description": "this is the title of the survey paper published in 2025", + "source_ids": [ + 195 + ] + }, + { + "entity_name": "arxiv", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv is the preprint server where the survey paper was published", + "source_ids": [ + 195 + ] + }, + { + "entity_name": "2025", + "entity_type": "DATE", + "description": "2025 is the year the survey paper was published", + "source_ids": [ + 195 + ] + }, + { + "entity_name": "arxiv 2501 02235", + "entity_type": "FILE_TYPE", + "description": "arxiv 2501 02235 is the specific identifier for the preprint version of the survey", + "source_ids": [ + 195 + ] + }, + { + "entity_name": "question answering", + "entity_type": "TASK_OR_PROBLEM", + "description": "question answering is the specific task addressed by the survey over visually rich documents", + "source_ids": [ + 195 + ] + }, + { + "entity_name": "visually rich documents", + "entity_type": "DATASET_OR_CORPUS", + "description": "visually rich documents are the type of documents analyzed in the survey", + "source_ids": [ + 195 + ] + }, + { + "entity_name": "methods", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "methods refers to the techniques discussed in the survey for handling visually rich documents", + "source_ids": [ + 195 + ] + }, + { + "entity_name": "challenges", + "entity_type": "TASK_OR_PROBLEM", + "description": "challenges refers to the difficulties identified in the field of question answering over visually rich documents", + "source_ids": [ + 195 + ] + }, + { + "entity_name": "trends", + "entity_type": "RESEARCH_FIELD", + "description": "trends refers to the current directions and future outlooks in the research area", + "source_ids": [ + 195 + ] + }, + { + "entity_name": "preprint", + "entity_type": "FILE_TYPE", + "description": "preprint indicates 
the document is a preliminary version of a research paper", + "source_ids": [ + 195 + ] + } + ], + "relations": [ + { + "src_entity_name": "camille barboule", + "tgt_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "relation_name": "", + "weight": 10.0, + "description": "camille barboule is an author of the survey paper", + "source_ids": [ + 195 + ] + }, + { + "src_entity_name": "benjamin piwowarski", + "tgt_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "relation_name": "", + "weight": 10.0, + "description": "benjamin piwowarski is an author of the survey paper", + "source_ids": [ + 195 + ] + }, + { + "src_entity_name": "yoan chabot", + "tgt_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "relation_name": "", + "weight": 10.0, + "description": "yoan chabot is an author of the survey paper", + "source_ids": [ + 195 + ] + }, + { + "src_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "the survey paper was published as a preprint on arxiv", + "source_ids": [ + 195 + ] + }, + { + "src_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "tgt_entity_name": "2025", + "relation_name": "", + "weight": 9.0, + "description": "the survey paper was published in the year 2025", + "source_ids": [ + 195 + ] + }, + { + "src_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "tgt_entity_name": "arxiv 2501 02235", + "relation_name": "", + "weight": 9.0, + "description": "the survey paper is identified by the preprint number arxiv 2501 02235", + "source_ids": [ + 195 + ] + }, + { + "src_entity_name": "camille barboule", + "tgt_entity_name": "benjamin 
piwowarski", + "relation_name": "", + "weight": 8.0, + "description": "camille barboule and benjamin piwowarski are co authors of the same survey paper", + "source_ids": [ + 195 + ] + }, + { + "src_entity_name": "camille barboule", + "tgt_entity_name": "yoan chabot", + "relation_name": "", + "weight": 8.0, + "description": "camille barboule and yoan chabot are co authors of the same survey paper", + "source_ids": [ + 195 + ] + }, + { + "src_entity_name": "benjamin piwowarski", + "tgt_entity_name": "yoan chabot", + "relation_name": "", + "weight": 8.0, + "description": "benjamin piwowarski and yoan chabot are co authors of the same survey paper", + "source_ids": [ + 195 + ] + }, + { + "src_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "tgt_entity_name": "question answering", + "relation_name": "", + "weight": 9.0, + "description": "the survey focuses on the task of question answering", + "source_ids": [ + 195 + ] + }, + { + "src_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "tgt_entity_name": "visually rich documents", + "relation_name": "", + "weight": 9.0, + "description": "the survey specifically addresses visually rich documents", + "source_ids": [ + 195 + ] + }, + { + "src_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "tgt_entity_name": "methods", + "relation_name": "", + "weight": 8.0, + "description": "the survey covers various methods used in the field", + "source_ids": [ + 195 + ] + }, + { + "src_entity_name": "survey on question answering over visually rich documents methods challenges and trends", + "tgt_entity_name": "challenges", + "relation_name": "", + "weight": 8.0, + "description": "the survey discusses the challenges present in the field", + "source_ids": [ + 195 + ] + }, + { + "src_entity_name": "survey on question answering over visually rich documents methods 
challenges and trends", + "tgt_entity_name": "trends", + "relation_name": "", + "weight": 8.0, + "description": "the survey outlines the trends in the research area", + "source_ids": [ + 195 + ] + }, + { + "src_entity_name": "arxiv 2501 02235", + "tgt_entity_name": "preprint", + "relation_name": "", + "weight": 9.0, + "description": "arxiv 2501 02235 is identified as a preprint document", + "source_ids": [ + 195 + ] + }, + { + "src_entity_name": "question answering", + "tgt_entity_name": "visually rich documents", + "relation_name": "", + "weight": 8.0, + "description": "question answering is performed over visually rich documents in the context of the survey", + "source_ids": [ + 195 + ] + } + ], + "node_idx": 195 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_196.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_196.json new file mode 100644 index 0000000..69aca28 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_196.json @@ -0,0 +1,533 @@ +{ + "entities": [ + { + "entity_name": "yukun cao", + "entity_type": "PERSON", + "description": "yukun cao is listed as one of the authors of the paper titled lego graphrag", + "source_ids": [ + 196 + ] + }, + { + "entity_name": "zengyi gao", + "entity_type": "PERSON", + "description": "zengyi gao is listed as one of the authors of the paper titled lego graphrag", + "source_ids": [ + 196 + ] + }, + { + "entity_name": "zhiyang li", + "entity_type": "PERSON", + "description": "zhiyang li is listed as one of the authors of the paper titled lego graphrag", + "source_ids": [ + 196 + ] + }, + { + "entity_name": "xike xie", + "entity_type": "PERSON", + "description": "xike xie is listed as one of the authors of the paper titled lego graphrag", + "source_ids": [ + 196 + ] + }, + { + "entity_name": "s kevin zhou", + "entity_type": "PERSON", + "description": "s kevin zhou is listed as one of the authors of the paper titled lego graphrag", + "source_ids": [ + 196 + 
] + }, + { + "entity_name": "jianliang xu", + "entity_type": "PERSON", + "description": "jianliang xu is listed as one of the authors of the paper titled lego graphrag", + "source_ids": [ + 196 + ] + }, + { + "entity_name": "lego graphrag", + "entity_type": "PRODUCT", + "description": "lego graphrag is a modularized graph based retrieval augmented generation system designed for design space exploration", + "source_ids": [ + 196 + ] + }, + { + "entity_name": "proc vldb endow", + "entity_type": "PUBLICATION_VENUE", + "description": "proc vldb endow is the publication venue where the paper was published", + "source_ids": [ + 196 + ] + }, + { + "entity_name": "june 2025", + "entity_type": "DATE", + "description": "june 2025 is the specific date of publication for the paper", + "source_ids": [ + 196 + ] + }, + { + "entity_name": "2025", + "entity_type": "DATE", + "description": "2025 is the year associated with the publication and the authors work", + "source_ids": [ + 196 + ] + }, + { + "entity_name": "3269 3283", + "entity_type": "MEASUREMENT", + "description": "3269 3283 represents the page range of the article in the publication", + "source_ids": [ + 196 + ] + }, + { + "entity_name": "10", + "entity_type": "MEASUREMENT", + "description": "10 is the issue number of the publication volume", + "source_ids": [ + 196 + ] + }, + { + "entity_name": "18", + "entity_type": "MEASUREMENT", + "description": "18 is the volume number of the publication", + "source_ids": [ + 196 + ] + }, + { + "entity_name": "design space exploration", + "entity_type": "TASK_OR_PROBLEM", + "description": "design space exploration is the specific problem domain that the lego graphrag system is designed to address", + "source_ids": [ + 196 + ] + }, + { + "entity_name": "graph based retrieval augmented generation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "graph based retrieval augmented generation is the underlying technique being modularized in the paper", + "source_ids": [ + 196 + 
] + }, + { + "entity_name": "modularizing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "modularizing is the specific method or approach applied to the graph based retrieval augmented generation system", + "source_ids": [ + 196 + ] + }, + { + "entity_name": "https doi org 10 14778 3748191 3748194", + "entity_type": "URL", + "description": "https doi org 10 14778 3748191 3748194 is the digital object identifier link for the paper", + "source_ids": [ + 196 + ] + } + ], + "relations": [ + { + "src_entity_name": "yukun cao", + "tgt_entity_name": "lego graphrag", + "relation_name": "", + "weight": 10.0, + "description": "yukun cao is an author of the paper describing lego graphrag", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "zengyi gao", + "tgt_entity_name": "lego graphrag", + "relation_name": "", + "weight": 10.0, + "description": "zengyi gao is an author of the paper describing lego graphrag", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "zhiyang li", + "tgt_entity_name": "lego graphrag", + "relation_name": "", + "weight": 10.0, + "description": "zhiyang li is an author of the paper describing lego graphrag", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "xike xie", + "tgt_entity_name": "lego graphrag", + "relation_name": "", + "weight": 10.0, + "description": "xike xie is an author of the paper describing lego graphrag", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "s kevin zhou", + "tgt_entity_name": "lego graphrag", + "relation_name": "", + "weight": 10.0, + "description": "s kevin zhou is an author of the paper describing lego graphrag", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "jianliang xu", + "tgt_entity_name": "lego graphrag", + "relation_name": "", + "weight": 10.0, + "description": "jianliang xu is an author of the paper describing lego graphrag", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "yukun cao", + "tgt_entity_name": "proc vldb endow", + "relation_name": 
"", + "weight": 9.0, + "description": "yukun cao is an author of a paper published in proc vldb endow", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "zengyi gao", + "tgt_entity_name": "proc vldb endow", + "relation_name": "", + "weight": 9.0, + "description": "zengyi gao is an author of a paper published in proc vldb endow", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "zhiyang li", + "tgt_entity_name": "proc vldb endow", + "relation_name": "", + "weight": 9.0, + "description": "zhiyang li is an author of a paper published in proc vldb endow", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "xike xie", + "tgt_entity_name": "proc vldb endow", + "relation_name": "", + "weight": 9.0, + "description": "xike xie is an author of a paper published in proc vldb endow", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "s kevin zhou", + "tgt_entity_name": "proc vldb endow", + "relation_name": "", + "weight": 9.0, + "description": "s kevin zhou is an author of a paper published in proc vldb endow", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "jianliang xu", + "tgt_entity_name": "proc vldb endow", + "relation_name": "", + "weight": 9.0, + "description": "jianliang xu is an author of a paper published in proc vldb endow", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "yukun cao", + "tgt_entity_name": "zengyi gao", + "relation_name": "", + "weight": 8.0, + "description": "yukun cao and zengyi gao are co authors on the same paper", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "yukun cao", + "tgt_entity_name": "zhiyang li", + "relation_name": "", + "weight": 8.0, + "description": "yukun cao and zhiyang li are co authors on the same paper", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "yukun cao", + "tgt_entity_name": "xike xie", + "relation_name": "", + "weight": 8.0, + "description": "yukun cao and xike xie are co authors on the same paper", + "source_ids": [ + 196 + ] + }, + { + 
"src_entity_name": "yukun cao", + "tgt_entity_name": "s kevin zhou", + "relation_name": "", + "weight": 8.0, + "description": "yukun cao and s kevin zhou are co authors on the same paper", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "yukun cao", + "tgt_entity_name": "jianliang xu", + "relation_name": "", + "weight": 8.0, + "description": "yukun cao and jianliang xu are co authors on the same paper", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "zengyi gao", + "tgt_entity_name": "zhiyang li", + "relation_name": "", + "weight": 8.0, + "description": "zengyi gao and zhiyang li are co authors on the same paper", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "zengyi gao", + "tgt_entity_name": "xike xie", + "relation_name": "", + "weight": 8.0, + "description": "zengyi gao and xike xie are co authors on the same paper", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "zengyi gao", + "tgt_entity_name": "s kevin zhou", + "relation_name": "", + "weight": 8.0, + "description": "zengyi gao and s kevin zhou are co authors on the same paper", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "zengyi gao", + "tgt_entity_name": "jianliang xu", + "relation_name": "", + "weight": 8.0, + "description": "zengyi gao and jianliang xu are co authors on the same paper", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "zhiyang li", + "tgt_entity_name": "xike xie", + "relation_name": "", + "weight": 8.0, + "description": "zhiyang li and xike xie are co authors on the same paper", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "zhiyang li", + "tgt_entity_name": "s kevin zhou", + "relation_name": "", + "weight": 8.0, + "description": "zhiyang li and s kevin zhou are co authors on the same paper", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "zhiyang li", + "tgt_entity_name": "jianliang xu", + "relation_name": "", + "weight": 8.0, + "description": "zhiyang li and jianliang xu are co authors on the same 
paper", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "xike xie", + "tgt_entity_name": "s kevin zhou", + "relation_name": "", + "weight": 8.0, + "description": "xike xie and s kevin zhou are co authors on the same paper", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "xike xie", + "tgt_entity_name": "jianliang xu", + "relation_name": "", + "weight": 8.0, + "description": "xike xie and jianliang xu are co authors on the same paper", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "s kevin zhou", + "tgt_entity_name": "jianliang xu", + "relation_name": "", + "weight": 8.0, + "description": "s kevin zhou and jianliang xu are co authors on the same paper", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "lego graphrag", + "tgt_entity_name": "proc vldb endow", + "relation_name": "", + "weight": 10.0, + "description": "lego graphrag is the subject of a paper published in proc vldb endow", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "lego graphrag", + "tgt_entity_name": "june 2025", + "relation_name": "", + "weight": 9.0, + "description": "lego graphrag was published in june 2025", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "lego graphrag", + "tgt_entity_name": "2025", + "relation_name": "", + "weight": 9.0, + "description": "lego graphrag was published in the year 2025", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "proc vldb endow", + "tgt_entity_name": "june 2025", + "relation_name": "", + "weight": 9.0, + "description": "proc vldb endow published the paper in june 2025", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "proc vldb endow", + "tgt_entity_name": "2025", + "relation_name": "", + "weight": 9.0, + "description": "proc vldb endow published the paper in 2025", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "proc vldb endow", + "tgt_entity_name": "18", + "relation_name": "", + "weight": 8.0, + "description": "the paper was published in volume 18 of proc vldb 
endow", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "proc vldb endow", + "tgt_entity_name": "10", + "relation_name": "", + "weight": 8.0, + "description": "the paper was published in issue 10 of proc vldb endow", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "proc vldb endow", + "tgt_entity_name": "3269 3283", + "relation_name": "", + "weight": 8.0, + "description": "the paper appears on pages 3269 3283 of proc vldb endow", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "lego graphrag", + "tgt_entity_name": "design space exploration", + "relation_name": "", + "weight": 10.0, + "description": "lego graphrag is developed specifically for design space exploration", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "lego graphrag", + "tgt_entity_name": "graph based retrieval augmented generation", + "relation_name": "", + "weight": 10.0, + "description": "lego graphrag is a modularized version of graph based retrieval augmented generation", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "lego graphrag", + "tgt_entity_name": "modularizing", + "relation_name": "", + "weight": 9.0, + "description": "the paper describes the process of modularizing graph based retrieval augmented generation to create lego graphrag", + "source_ids": [ + 196 + ] + }, + { + "src_entity_name": "lego graphrag", + "tgt_entity_name": "https doi org 10 14778 3748191 3748194", + "relation_name": "", + "weight": 10.0, + "description": "the paper describing lego graphrag is accessible via the provided doi link", + "source_ids": [ + 196 + ] + } + ], + "node_idx": 196 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_197.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_197.json new file mode 100644 index 0000000..e3b26ae --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_197.json @@ -0,0 +1,467 @@ +{ + "entities": [ + { + "entity_name": "chengliang chai", + "entity_type": 
"PERSON", + "description": "chengliang chai is an author of the paper titled doctopus budget aware structural table extraction from unstructured documents", + "source_ids": [ + 197 + ] + }, + { + "entity_name": "jiajun li", + "entity_type": "PERSON", + "description": "jiajun li is an author of the paper titled doctopus budget aware structural table extraction from unstructured documents", + "source_ids": [ + 197 + ] + }, + { + "entity_name": "yuhao deng", + "entity_type": "PERSON", + "description": "yuhao deng is an author of the paper titled doctopus budget aware structural table extraction from unstructured documents", + "source_ids": [ + 197 + ] + }, + { + "entity_name": "yuanhao zhong", + "entity_type": "PERSON", + "description": "yuanhao zhong is an author of the paper titled doctopus budget aware structural table extraction from unstructured documents", + "source_ids": [ + 197 + ] + }, + { + "entity_name": "ye yuan", + "entity_type": "PERSON", + "description": "ye yuan is an author of the paper titled doctopus budget aware structural table extraction from unstructured documents", + "source_ids": [ + 197 + ] + }, + { + "entity_name": "guoren wang", + "entity_type": "PERSON", + "description": "guoren wang is an author of the paper titled doctopus budget aware structural table extraction from unstructured documents", + "source_ids": [ + 197 + ] + }, + { + "entity_name": "lei cao", + "entity_type": "PERSON", + "description": "lei cao is an author of the paper titled doctopus budget aware structural table extraction from unstructured documents", + "source_ids": [ + 197 + ] + }, + { + "entity_name": "doctopus", + "entity_type": "PRODUCT", + "description": "doctopus is a system or method for budget aware structural table extraction from unstructured documents", + "source_ids": [ + 197 + ] + }, + { + "entity_name": "proceedings of the vldb endowment", + "entity_type": "PUBLICATION_VENUE", + "description": "proceedings of the vldb endowment is the publication venue 
where the paper was published in 2025", + "source_ids": [ + 197 + ] + }, + { + "entity_name": "2025", + "entity_type": "DATE", + "description": "2025 is the year the paper was published and the year associated with the volume and issue number", + "source_ids": [ + 197 + ] + }, + { + "entity_name": "budget aware structural table extraction", + "entity_type": "TASK_OR_PROBLEM", + "description": "budget aware structural table extraction is the specific task addressed by the doctopus system described in the text", + "source_ids": [ + 197 + ] + }, + { + "entity_name": "unstructured documents", + "entity_type": "DATASET_OR_CORPUS", + "description": "unstructured documents are the source material from which structural tables are extracted in the described work", + "source_ids": [ + 197 + ] + }, + { + "entity_name": "18", + "entity_type": "MEASUREMENT", + "description": "18 is the volume number of the proceedings of the vldb endowment where the paper was published", + "source_ids": [ + 197 + ] + }, + { + "entity_name": "11", + "entity_type": "MEASUREMENT", + "description": "11 is the issue number of the proceedings of the vldb endowment where the paper was published", + "source_ids": [ + 197 + ] + }, + { + "entity_name": "3695 3707", + "entity_type": "MEASUREMENT", + "description": "3695 3707 represents the page range of the paper within the publication", + "source_ids": [ + 197 + ] + } + ], + "relations": [ + { + "src_entity_name": "chengliang chai", + "tgt_entity_name": "doctopus", + "relation_name": "", + "weight": 9.0, + "description": "chengliang chai is an author of the paper describing the doctopus system", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "jiajun li", + "tgt_entity_name": "doctopus", + "relation_name": "", + "weight": 9.0, + "description": "jiajun li is an author of the paper describing the doctopus system", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "yuhao deng", + "tgt_entity_name": "doctopus", + "relation_name": "", + 
"weight": 9.0, + "description": "yuhao deng is an author of the paper describing the doctopus system", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "yuanhao zhong", + "tgt_entity_name": "doctopus", + "relation_name": "", + "weight": 9.0, + "description": "yuanhao zhong is an author of the paper describing the doctopus system", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "ye yuan", + "tgt_entity_name": "doctopus", + "relation_name": "", + "weight": 9.0, + "description": "ye yuan is an author of the paper describing the doctopus system", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "guoren wang", + "tgt_entity_name": "doctopus", + "relation_name": "", + "weight": 9.0, + "description": "guoren wang is an author of the paper describing the doctopus system", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "lei cao", + "tgt_entity_name": "doctopus", + "relation_name": "", + "weight": 9.0, + "description": "lei cao is an author of the paper describing the doctopus system", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "chengliang chai", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "chengliang chai is an author of a paper published in the proceedings of the vldb endowment", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "jiajun li", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "jiajun li is an author of a paper published in the proceedings of the vldb endowment", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "yuhao deng", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "yuhao deng is an author of a paper published in the proceedings of the vldb endowment", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "yuanhao zhong", + "tgt_entity_name": "proceedings of the vldb endowment", 
+ "relation_name": "", + "weight": 8.0, + "description": "yuanhao zhong is an author of a paper published in the proceedings of the vldb endowment", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "ye yuan", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "ye yuan is an author of a paper published in the proceedings of the vldb endowment", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "guoren wang", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "guoren wang is an author of a paper published in the proceedings of the vldb endowment", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "lei cao", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 8.0, + "description": "lei cao is an author of a paper published in the proceedings of the vldb endowment", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "doctopus", + "tgt_entity_name": "proceedings of the vldb endowment", + "relation_name": "", + "weight": 10.0, + "description": "the doctopus paper was published in the proceedings of the vldb endowment", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "doctopus", + "tgt_entity_name": "budget aware structural table extraction", + "relation_name": "", + "weight": 10.0, + "description": "doctopus is the system designed to perform budget aware structural table extraction", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "doctopus", + "tgt_entity_name": "unstructured documents", + "relation_name": "", + "weight": 10.0, + "description": "doctopus processes unstructured documents to extract structural tables", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "proceedings of the vldb endowment", + "tgt_entity_name": "18", + "relation_name": "", + "weight": 9.0, + "description": "the proceedings of the vldb endowment volume 18 contains the 
paper", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "proceedings of the vldb endowment", + "tgt_entity_name": "11", + "relation_name": "", + "weight": 9.0, + "description": "the proceedings of the vldb endowment issue 11 contains the paper", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "proceedings of the vldb endowment", + "tgt_entity_name": "3695 3707", + "relation_name": "", + "weight": 9.0, + "description": "the paper appears on pages 3695 3707 of the proceedings of the vldb endowment", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "chengliang chai", + "tgt_entity_name": "budget aware structural table extraction", + "relation_name": "", + "weight": 8.0, + "description": "chengliang chai is an author of the work on budget aware structural table extraction", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "jiajun li", + "tgt_entity_name": "budget aware structural table extraction", + "relation_name": "", + "weight": 8.0, + "description": "jiajun li is an author of the work on budget aware structural table extraction", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "yuhao deng", + "tgt_entity_name": "budget aware structural table extraction", + "relation_name": "", + "weight": 8.0, + "description": "yuhao deng is an author of the work on budget aware structural table extraction", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "yuanhao zhong", + "tgt_entity_name": "budget aware structural table extraction", + "relation_name": "", + "weight": 8.0, + "description": "yuanhao zhong is an author of the work on budget aware structural table extraction", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "ye yuan", + "tgt_entity_name": "budget aware structural table extraction", + "relation_name": "", + "weight": 8.0, + "description": "ye yuan is an author of the work on budget aware structural table extraction", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "guoren wang", + 
"tgt_entity_name": "budget aware structural table extraction", + "relation_name": "", + "weight": 8.0, + "description": "guoren wang is an author of the work on budget aware structural table extraction", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "lei cao", + "tgt_entity_name": "budget aware structural table extraction", + "relation_name": "", + "weight": 8.0, + "description": "lei cao is an author of the work on budget aware structural table extraction", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "chengliang chai", + "tgt_entity_name": "unstructured documents", + "relation_name": "", + "weight": 8.0, + "description": "chengliang chai is an author of the work involving unstructured documents", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "jiajun li", + "tgt_entity_name": "unstructured documents", + "relation_name": "", + "weight": 8.0, + "description": "jiajun li is an author of the work involving unstructured documents", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "yuhao deng", + "tgt_entity_name": "unstructured documents", + "relation_name": "", + "weight": 8.0, + "description": "yuhao deng is an author of the work involving unstructured documents", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "yuanhao zhong", + "tgt_entity_name": "unstructured documents", + "relation_name": "", + "weight": 8.0, + "description": "yuanhao zhong is an author of the work involving unstructured documents", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "ye yuan", + "tgt_entity_name": "unstructured documents", + "relation_name": "", + "weight": 8.0, + "description": "ye yuan is an author of the work involving unstructured documents", + "source_ids": [ + 197 + ] + }, + { + "src_entity_name": "guoren wang", + "tgt_entity_name": "unstructured documents", + "relation_name": "", + "weight": 8.0, + "description": "guoren wang is an author of the work involving unstructured documents", + "source_ids": [ + 197 + 
] + }, + { + "src_entity_name": "lei cao", + "tgt_entity_name": "unstructured documents", + "relation_name": "", + "weight": 8.0, + "description": "lei cao is an author of the work involving unstructured documents", + "source_ids": [ + 197 + ] + } + ], + "node_idx": 197 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_198.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_198.json new file mode 100644 index 0000000..aba4d69 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_198.json @@ -0,0 +1,277 @@ +{ + "entities": [ + { + "entity_name": "ilias chalkidis", + "entity_type": "PERSON", + "description": "ilias chalkidis is one of the authors of the 2020 arxiv preprint titled legal bert", + "source_ids": [ + 198 + ] + }, + { + "entity_name": "manos fergadiotis", + "entity_type": "PERSON", + "description": "manos fergadiotis is one of the authors of the 2020 arxiv preprint titled legal bert", + "source_ids": [ + 198 + ] + }, + { + "entity_name": "prodromos malakasiotis", + "entity_type": "PERSON", + "description": "prodromos malakasiotis is one of the authors of the 2020 arxiv preprint titled legal bert", + "source_ids": [ + 198 + ] + }, + { + "entity_name": "nikolaos aletras", + "entity_type": "PERSON", + "description": "nikolaos aletras is one of the authors of the 2020 arxiv preprint titled legal bert", + "source_ids": [ + 198 + ] + }, + { + "entity_name": "ion androutsopoulos", + "entity_type": "PERSON", + "description": "ion androutsopoulos is one of the authors of the 2020 arxiv preprint titled legal bert", + "source_ids": [ + 198 + ] + }, + { + "entity_name": "legal bert", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "legal bert is a model described as the muppets straight out of law school in the text", + "source_ids": [ + 198 + ] + }, + { + "entity_name": "arxiv preprint arxiv 2010 02559", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv preprint arxiv 2010 02559 
is the specific publication venue and identifier for the paper", + "source_ids": [ + 198 + ] + }, + { + "entity_name": "2020", + "entity_type": "DATE", + "description": "2020 is the year the paper was published", + "source_ids": [ + 198 + ] + }, + { + "entity_name": "muppets", + "entity_type": "PRODUCT", + "description": "muppets is a metaphorical term used in the text to describe the legal bert model", + "source_ids": [ + 198 + ] + }, + { + "entity_name": "law school", + "entity_type": "LOCATION", + "description": "law school is a location mentioned metaphorically to indicate the origin or training context of the legal bert model", + "source_ids": [ + 198 + ] + } + ], + "relations": [ + { + "src_entity_name": "ilias chalkidis", + "tgt_entity_name": "legal bert", + "relation_name": "", + "weight": 9.0, + "description": "ilias chalkidis is an author of the paper introducing the legal bert model", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "manos fergadiotis", + "tgt_entity_name": "legal bert", + "relation_name": "", + "weight": 9.0, + "description": "manos fergadiotis is an author of the paper introducing the legal bert model", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "prodromos malakasiotis", + "tgt_entity_name": "legal bert", + "relation_name": "", + "weight": 9.0, + "description": "prodromos malakasiotis is an author of the paper introducing the legal bert model", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "nikolaos aletras", + "tgt_entity_name": "legal bert", + "relation_name": "", + "weight": 9.0, + "description": "nikolaos aletras is an author of the paper introducing the legal bert model", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "ion androutsopoulos", + "tgt_entity_name": "legal bert", + "relation_name": "", + "weight": 9.0, + "description": "ion androutsopoulos is an author of the paper introducing the legal bert model", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "legal bert", + 
"tgt_entity_name": "arxiv preprint arxiv 2010 02559", + "relation_name": "", + "weight": 10.0, + "description": "legal bert is the subject of the publication arxiv preprint arxiv 2010 02559", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "legal bert", + "tgt_entity_name": "2020", + "relation_name": "", + "weight": 8.0, + "description": "legal bert was published in the year 2020", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "ilias chalkidis", + "tgt_entity_name": "manos fergadiotis", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "ilias chalkidis", + "tgt_entity_name": "prodromos malakasiotis", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "ilias chalkidis", + "tgt_entity_name": "nikolaos aletras", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "ilias chalkidis", + "tgt_entity_name": "ion androutsopoulos", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "manos fergadiotis", + "tgt_entity_name": "prodromos malakasiotis", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "manos fergadiotis", + "tgt_entity_name": "nikolaos aletras", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "manos fergadiotis", + "tgt_entity_name": "ion androutsopoulos", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ] + }, + { + 
"src_entity_name": "prodromos malakasiotis", + "tgt_entity_name": "nikolaos aletras", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "prodromos malakasiotis", + "tgt_entity_name": "ion androutsopoulos", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "nikolaos aletras", + "tgt_entity_name": "ion androutsopoulos", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "legal bert", + "tgt_entity_name": "muppets", + "relation_name": "", + "weight": 9.0, + "description": "legal bert is described as being straight out of the muppets in the text", + "source_ids": [ + 198 + ] + }, + { + "src_entity_name": "legal bert", + "tgt_entity_name": "law school", + "relation_name": "", + "weight": 9.0, + "description": "legal bert is described as coming straight out of law school in the text", + "source_ids": [ + 198 + ] + } + ], + "node_idx": 198 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_199.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_199.json new file mode 100644 index 0000000..caa8795 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_199.json @@ -0,0 +1,701 @@ +{ + "entities": [ + { + "entity_name": "sibei chen", + "entity_type": "PERSON", + "description": "sibei chen is listed as an author of the paper titled auto formula", + "source_ids": [ + 199 + ] + }, + { + "entity_name": "yeye he", + "entity_type": "PERSON", + "description": "yeye he is listed as an author of the paper titled auto formula", + "source_ids": [ + 199 + ] + }, + { + "entity_name": "weiwei cui", + "entity_type": "PERSON", + "description": "weiwei cui is listed as an author of the paper titled auto 
formula", + "source_ids": [ + 199 + ] + }, + { + "entity_name": "ju fan", + "entity_type": "PERSON", + "description": "ju fan is listed as an author of the paper titled auto formula", + "source_ids": [ + 199 + ] + }, + { + "entity_name": "song ge", + "entity_type": "PERSON", + "description": "song ge is listed as an author of the paper titled auto formula", + "source_ids": [ + 199 + ] + }, + { + "entity_name": "haidong zhang", + "entity_type": "PERSON", + "description": "haidong zhang is listed as an author of the paper titled auto formula", + "source_ids": [ + 199 + ] + }, + { + "entity_name": "dongmei zhang", + "entity_type": "PERSON", + "description": "dongmei zhang is listed as an author of the paper titled auto formula", + "source_ids": [ + 199 + ] + }, + { + "entity_name": "surajit chaudhuri", + "entity_type": "PERSON", + "description": "surajit chaudhuri is listed as an author of the paper titled auto formula", + "source_ids": [ + 199 + ] + }, + { + "entity_name": "auto formula", + "entity_type": "PRODUCT", + "description": "auto formula is a system or method recommended in the paper for recommending formulas in spreadsheets using contrastive learning", + "source_ids": [ + 199 + ] + }, + { + "entity_name": "proceedings of the acm on management of data", + "entity_type": "PUBLICATION_VENUE", + "description": "proceedings of the acm on management of data is the venue where the paper was published in 2024", + "source_ids": [ + 199 + ] + }, + { + "entity_name": "2024", + "entity_type": "DATE", + "description": "2024 is the year the paper was published", + "source_ids": [ + 199 + ] + }, + { + "entity_name": "1 27", + "entity_type": "MEASUREMENT", + "description": "1 27 represents the page range of the article in the publication", + "source_ids": [ + 199 + ] + }, + { + "entity_name": "table representations", + "entity_type": "DATASET_OR_CORPUS", + "description": "table representations is the subject of the contrastive learning method used in the paper", + 
"source_ids": [ + 199 + ] + }, + { + "entity_name": "contrastive learning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "contrastive learning is the technique used to recommend formulas in spreadsheets", + "source_ids": [ + 199 + ] + }, + { + "entity_name": "spreadsheets", + "entity_type": "PRODUCT", + "description": "spreadsheets are the application domain where the auto formula system recommends formulas", + "source_ids": [ + 199 + ] + }, + { + "entity_name": "formulas", + "entity_type": "PRODUCT", + "description": "formulas are the specific items being recommended by the auto formula system", + "source_ids": [ + 199 + ] + }, + { + "entity_name": "2", + "entity_type": "MEASUREMENT", + "description": "2 is the volume number of the publication", + "source_ids": [ + 199 + ] + }, + { + "entity_name": "3", + "entity_type": "MEASUREMENT", + "description": "3 is the issue number of the publication", + "source_ids": [ + 199 + ] + } + ], + "relations": [ + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "auto formula", + "relation_name": "", + "weight": 9.0, + "description": "sibei chen is an author of the paper describing the auto formula system", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "yeye he", + "tgt_entity_name": "auto formula", + "relation_name": "", + "weight": 9.0, + "description": "yeye he is an author of the paper describing the auto formula system", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "weiwei cui", + "tgt_entity_name": "auto formula", + "relation_name": "", + "weight": 9.0, + "description": "weiwei cui is an author of the paper describing the auto formula system", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "auto formula", + "relation_name": "", + "weight": 9.0, + "description": "ju fan is an author of the paper describing the auto formula system", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "song ge", + "tgt_entity_name": "auto formula", + 
"relation_name": "", + "weight": 9.0, + "description": "song ge is an author of the paper describing the auto formula system", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "haidong zhang", + "tgt_entity_name": "auto formula", + "relation_name": "", + "weight": 9.0, + "description": "haidong zhang is an author of the paper describing the auto formula system", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "dongmei zhang", + "tgt_entity_name": "auto formula", + "relation_name": "", + "weight": 9.0, + "description": "dongmei zhang is an author of the paper describing the auto formula system", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "surajit chaudhuri", + "tgt_entity_name": "auto formula", + "relation_name": "", + "weight": 9.0, + "description": "surajit chaudhuri is an author of the paper describing the auto formula system", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen is an author of a paper published in this venue", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "yeye he", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 8.0, + "description": "yeye he is an author of a paper published in this venue", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "weiwei cui", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 8.0, + "description": "weiwei cui is an author of a paper published in this venue", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 8.0, + "description": "ju fan is an author of a paper published in this venue", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "song ge", + "tgt_entity_name": 
"proceedings of the acm on management of data", + "relation_name": "", + "weight": 8.0, + "description": "song ge is an author of a paper published in this venue", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "haidong zhang", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 8.0, + "description": "haidong zhang is an author of a paper published in this venue", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "dongmei zhang", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 8.0, + "description": "dongmei zhang is an author of a paper published in this venue", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "surajit chaudhuri", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 8.0, + "description": "surajit chaudhuri is an author of a paper published in this venue", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "auto formula", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 9.0, + "description": "auto formula is the subject of a paper published in this venue", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "auto formula", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "the paper about auto formula was published in 2024", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "yeye he", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and yeye he are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "weiwei cui", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and weiwei cui are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "ju fan", 
+ "relation_name": "", + "weight": 8.0, + "description": "sibei chen and ju fan are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "song ge", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and song ge are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "haidong zhang", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and haidong zhang are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "dongmei zhang", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and dongmei zhang are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "surajit chaudhuri", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and surajit chaudhuri are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "yeye he", + "tgt_entity_name": "weiwei cui", + "relation_name": "", + "weight": 8.0, + "description": "yeye he and weiwei cui are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "yeye he", + "tgt_entity_name": "ju fan", + "relation_name": "", + "weight": 8.0, + "description": "yeye he and ju fan are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "yeye he", + "tgt_entity_name": "song ge", + "relation_name": "", + "weight": 8.0, + "description": "yeye he and song ge are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "yeye he", + "tgt_entity_name": "haidong zhang", + "relation_name": "", + "weight": 8.0, + "description": "yeye he and haidong zhang are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "yeye he", + 
"tgt_entity_name": "dongmei zhang", + "relation_name": "", + "weight": 8.0, + "description": "yeye he and dongmei zhang are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "yeye he", + "tgt_entity_name": "surajit chaudhuri", + "relation_name": "", + "weight": 8.0, + "description": "yeye he and surajit chaudhuri are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "weiwei cui", + "tgt_entity_name": "ju fan", + "relation_name": "", + "weight": 8.0, + "description": "weiwei cui and ju fan are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "weiwei cui", + "tgt_entity_name": "song ge", + "relation_name": "", + "weight": 8.0, + "description": "weiwei cui and song ge are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "weiwei cui", + "tgt_entity_name": "haidong zhang", + "relation_name": "", + "weight": 8.0, + "description": "weiwei cui and haidong zhang are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "weiwei cui", + "tgt_entity_name": "dongmei zhang", + "relation_name": "", + "weight": 8.0, + "description": "weiwei cui and dongmei zhang are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "weiwei cui", + "tgt_entity_name": "surajit chaudhuri", + "relation_name": "", + "weight": 8.0, + "description": "weiwei cui and surajit chaudhuri are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "song ge", + "relation_name": "", + "weight": 8.0, + "description": "ju fan and song ge are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "haidong zhang", + "relation_name": "", + "weight": 8.0, + "description": "ju fan and haidong zhang are co authors on the same paper", + "source_ids": [ + 199 + ] + }, 
+ { + "src_entity_name": "ju fan", + "tgt_entity_name": "dongmei zhang", + "relation_name": "", + "weight": 8.0, + "description": "ju fan and dongmei zhang are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "surajit chaudhuri", + "relation_name": "", + "weight": 8.0, + "description": "ju fan and surajit chaudhuri are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "song ge", + "tgt_entity_name": "haidong zhang", + "relation_name": "", + "weight": 8.0, + "description": "song ge and haidong zhang are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "song ge", + "tgt_entity_name": "dongmei zhang", + "relation_name": "", + "weight": 8.0, + "description": "song ge and dongmei zhang are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "song ge", + "tgt_entity_name": "surajit chaudhuri", + "relation_name": "", + "weight": 8.0, + "description": "song ge and surajit chaudhuri are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "haidong zhang", + "tgt_entity_name": "dongmei zhang", + "relation_name": "", + "weight": 8.0, + "description": "haidong zhang and dongmei zhang are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "haidong zhang", + "tgt_entity_name": "surajit chaudhuri", + "relation_name": "", + "weight": 8.0, + "description": "haidong zhang and surajit chaudhuri are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "dongmei zhang", + "tgt_entity_name": "surajit chaudhuri", + "relation_name": "", + "weight": 8.0, + "description": "dongmei zhang and surajit chaudhuri are co authors on the same paper", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "auto formula", + "tgt_entity_name": "contrastive learning", + "relation_name": "", + "weight": 10.0, 
+ "description": "auto formula uses contrastive learning as its core method", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "auto formula", + "tgt_entity_name": "spreadsheets", + "relation_name": "", + "weight": 9.0, + "description": "auto formula operates within the context of spreadsheets", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "auto formula", + "tgt_entity_name": "formulas", + "relation_name": "", + "weight": 10.0, + "description": "auto formula is designed to recommend formulas", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "auto formula", + "tgt_entity_name": "table representations", + "relation_name": "", + "weight": 9.0, + "description": "auto formula relies on table representations for its learning process", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "proceedings of the acm on management of data", + "tgt_entity_name": "2", + "relation_name": "", + "weight": 10.0, + "description": "the publication volume is 2", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "proceedings of the acm on management of data", + "tgt_entity_name": "3", + "relation_name": "", + "weight": 10.0, + "description": "the publication issue is 3", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "proceedings of the acm on management of data", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 10.0, + "description": "the publication year is 2024", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "proceedings of the acm on management of data", + "tgt_entity_name": "1 27", + "relation_name": "", + "weight": 10.0, + "description": "the publication page range is 1 27", + "source_ids": [ + 199 + ] + }, + { + "src_entity_name": "contrastive learning", + "tgt_entity_name": "table representations", + "relation_name": "", + "weight": 9.0, + "description": "contrastive learning is applied to table representations", + "source_ids": [ + 199 + ] + } + ], + "node_idx": 199 +} \ No newline at end of file 
diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_2.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_2.json new file mode 100644 index 0000000..ad463fe --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_2.json @@ -0,0 +1,411 @@ +{ + "entities": [ + { + "entity_name": "shu wang", + "entity_type": "PERSON", + "description": "shu wang is an author of the paper affiliated with the chinese university of hong kong shenzhen", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "yingli zhou", + "entity_type": "PERSON", + "description": "yingli zhou is an author of the paper affiliated with the chinese university of hong kong shenzhen", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "yixiang fang", + "entity_type": "PERSON", + "description": "yixiang fang is an author of the paper affiliated with the chinese university of hong kong shenzhen", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "the chinese university of hong kong shenzhen", + "entity_type": "ORGANIZATION", + "description": "the chinese university of hong kong shenzhen is the institution where the authors are affiliated", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "large language models", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "large language models are the models whose performance is being boosted by the proposed method", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "retrievalaugmented generation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "retrievalaugmented generation is a method that queries external documents to boost llm performance", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "bookrag", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "bookrag is a novel rag approach targeted for documents with hierarchical structures", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "bookindex", + "entity_type": "SOFTWARE", + "description": "bookindex is a novel index structure built by extracting a 
hierarchical tree from documents", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "information foraging theory", + "entity_type": "SCIENTIFIC_THEORY", + "description": "information foraging theory is the theory inspiring the agent based query method", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "question answering", + "entity_type": "TASK_OR_PROBLEM", + "description": "question answering is the task where the proposed methods aim to improve performance", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "books", + "entity_type": "BOOK", + "description": "books are examples of real world documents with hierarchical structures", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "booklets", + "entity_type": "BOOK", + "description": "booklets are examples of real world documents with hierarchical structures", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "handbooks", + "entity_type": "BOOK", + "description": "handbooks are examples of real world documents with hierarchical structures", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "three widely adopted benchmarks", + "entity_type": "BENCHMARK", + "description": "three widely adopted benchmarks were used to demonstrate the performance of bookrag", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "industry", + "entity_type": "ORGANIZATION", + "description": "industry is a sector that has attracted attention to retrievalaugmented generation", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "academia", + "entity_type": "ORGANIZATION", + "description": "academia is a sector that has attracted attention to retrievalaugmented generation", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "graph", + "entity_type": "SOFTWARE", + "description": "a graph is used to capture intricate relationships between entities in the bookindex", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "tree", + "entity_type": "SOFTWARE", + "description": "a hierarchical tree is extracted from documents to serve 
as the role of a table of contents", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "table of contents", + "entity_type": "SOFTWARE", + "description": "the table of contents is the role served by the hierarchical tree in the bookindex", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "retrieval recall", + "entity_type": "EVALUATION_METRIC", + "description": "retrieval recall is a metric where bookrag significantly outperforms baselines", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "qa accuracy", + "entity_type": "EVALUATION_METRIC", + "description": "qa accuracy is a metric where bookrag significantly outperforms baselines", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "efficiency", + "entity_type": "EVALUATION_METRIC", + "description": "efficiency is a metric where bookrag maintains competitive performance", + "source_ids": [ + 2 + ] + }, + { + "entity_name": "baselines", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "baselines are existing methods that bookrag outperforms in retrieval recall and qa accuracy", + "source_ids": [ + 2 + ] + } + ], + "relations": [ + { + "src_entity_name": "shu wang", + "tgt_entity_name": "the chinese university of hong kong shenzhen", + "relation_name": "", + "weight": 10.0, + "description": "shu wang is affiliated with the chinese university of hong kong shenzhen", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "yingli zhou", + "tgt_entity_name": "the chinese university of hong kong shenzhen", + "relation_name": "", + "weight": 10.0, + "description": "yingli zhou is affiliated with the chinese university of hong kong shenzhen", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "yixiang fang", + "tgt_entity_name": "the chinese university of hong kong shenzhen", + "relation_name": "", + "weight": 10.0, + "description": "yixiang fang is affiliated with the chinese university of hong kong shenzhen", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "retrievalaugmented 
generation", + "tgt_entity_name": "large language models", + "relation_name": "", + "weight": 9.0, + "description": "retrievalaugmented generation is used to boost the performance of large language models", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrievalaugmented generation", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is a novel approach within the category of retrievalaugmented generation", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "question answering", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is designed to improve performance on the question answering task", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "books", + "relation_name": "", + "weight": 8.0, + "description": "bookrag is specifically targeted for documents like books that have hierarchical structures", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "bookrag utilizes the bookindex structure to exploit logical hierarchies and trace entity relations", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "books", + "relation_name": "", + "weight": 8.0, + "description": "bookindex is built by extracting a hierarchical tree from documents such as books", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "information foraging theory", + "relation_name": "", + "weight": 8.0, + "description": "the agent based query method in bookrag is inspired by information foraging theory", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "three widely adopted benchmarks", + "relation_name": "", + "weight": 9.0, + "description": "bookrag was evaluated and demonstrated state of the art performance on three widely adopted 
benchmarks", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "retrievalaugmented generation", + "tgt_entity_name": "industry", + "relation_name": "", + "weight": 7.0, + "description": "industry has attracted attention to retrievalaugmented generation", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "retrievalaugmented generation", + "tgt_entity_name": "academia", + "relation_name": "", + "weight": 7.0, + "description": "academia has attracted attention to retrievalaugmented generation", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "booklets", + "relation_name": "", + "weight": 8.0, + "description": "bookrag is specifically targeted for documents like booklets that have hierarchical structures", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "handbooks", + "relation_name": "", + "weight": 8.0, + "description": "bookrag is specifically targeted for documents like handbooks that have hierarchical structures", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "tree", + "relation_name": "", + "weight": 9.0, + "description": "bookindex is built by extracting a hierarchical tree from the document", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "graph", + "relation_name": "", + "weight": 8.0, + "description": "bookindex uses a graph to capture the intricate relationships between entities", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "tree", + "tgt_entity_name": "table of contents", + "relation_name": "", + "weight": 8.0, + "description": "the hierarchical tree serves as the role of the table of contents", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 9.0, + "description": "bookrag significantly outperforms baselines in retrieval recall", + "source_ids": [ + 2 + ] + }, + { + 
"src_entity_name": "bookrag", + "tgt_entity_name": "qa accuracy", + "relation_name": "", + "weight": 9.0, + "description": "bookrag significantly outperforms baselines in qa accuracy", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "efficiency", + "relation_name": "", + "weight": 7.0, + "description": "bookrag maintains competitive efficiency", + "source_ids": [ + 2 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "baselines", + "relation_name": "", + "weight": 9.0, + "description": "bookrag significantly outperforms baselines in both retrieval recall and qa accuracy", + "source_ids": [ + 2 + ] + } + ], + "node_idx": 2 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_20.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_20.json new file mode 100644 index 0000000..4be4f4d --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_20.json @@ -0,0 +1,197 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "TECHNOLOGY", + "description": "bookrag is a retrieval augmented generation method introduced to bridge a gap in document qa tasks", + "source_ids": [ + 20 + ] + }, + { + "entity_name": "bookindex", + "entity_type": "PRODUCT", + "description": "bookindex is a document native structure used by bookrag to organize information through hierarchical and graph based methods", + "source_ids": [ + 20 + ] + }, + { + "entity_name": "document qa tasks", + "entity_type": "TASK_OR_PROBLEM", + "description": "document qa tasks are the specific problems that bookrag and bookindex are designed to address", + "source_ids": [ + 20 + ] + }, + { + "entity_name": "hierarchical tree structure", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the hierarchical tree structure is a method used to preserve the document s native logical hierarchy by organizing parsed content blocks", + "source_ids": [ + 20 + ] + }, + { + "entity_name": "kg", + 
"entity_type": "TECHNOLOGY", + "description": "kg refers to a knowledge graph constructed to capture intricate relations within document blocks", + "source_ids": [ + 20 + ] + }, + { + "entity_name": "table of contents", + "entity_type": "PRODUCT", + "description": "the table of contents is the role served by the hierarchical tree structure in organizing the document s logical hierarchy", + "source_ids": [ + 20 + ] + }, + { + "entity_name": "parsed content blocks", + "entity_type": "MATERIAL", + "description": "parsed content blocks are the units of document content organized into a hierarchical tree structure", + "source_ids": [ + 20 + ] + }, + { + "entity_name": "fine grained entities", + "entity_type": "DATASET_OR_CORPUS", + "description": "fine grained entities are the specific data points contained within the document blocks that are captured by the knowledge graph", + "source_ids": [ + 20 + ] + }, + { + "entity_name": "relation", + "entity_type": "CONCEPT", + "description": "the relation refers to the deep connections within the document that the method aims to capture", + "source_ids": [ + 20 + ] + }, + { + "entity_name": "tree nodes", + "entity_type": "PRODUCT", + "description": "tree nodes are the specific components of the hierarchical tree structure to which kg entities are mapped", + "source_ids": [ + 20 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is built upon the document native bookindex", + "source_ids": [ + 20 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "document qa tasks", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is designed specifically for document qa tasks", + "source_ids": [ + 20 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "hierarchical tree structure", + "relation_name": "", + "weight": 9.0, + "description": "bookindex organizes information using a 
hierarchical tree structure to preserve logical hierarchy", + "source_ids": [ + 20 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "kg", + "relation_name": "", + "weight": 9.0, + "description": "bookindex constructs a kg to capture intricate relations within document blocks", + "source_ids": [ + 20 + ] + }, + { + "src_entity_name": "hierarchical tree structure", + "tgt_entity_name": "table of contents", + "relation_name": "", + "weight": 8.0, + "description": "the hierarchical tree structure serves the role of the document s table of contents", + "source_ids": [ + 20 + ] + }, + { + "src_entity_name": "kg", + "tgt_entity_name": "hierarchical tree structure", + "relation_name": "", + "weight": 8.0, + "description": "the kg entities are mapped to their corresponding tree nodes to unify the two structures", + "source_ids": [ + 20 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "parsed content blocks", + "relation_name": "", + "weight": 9.0, + "description": "bookindex organizes parsed content blocks into a hierarchical tree structure", + "source_ids": [ + 20 + ] + }, + { + "src_entity_name": "kg", + "tgt_entity_name": "fine grained entities", + "relation_name": "", + "weight": 9.0, + "description": "the kg is constructed containing fine grained entities to capture intricate relations", + "source_ids": [ + 20 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "relation", + "relation_name": "", + "weight": 8.0, + "description": "bookrag is designed to capture the deep connection of the relation in the document", + "source_ids": [ + 20 + ] + }, + { + "src_entity_name": "kg", + "tgt_entity_name": "tree nodes", + "relation_name": "", + "weight": 9.0, + "description": "kg entities are mapped to their corresponding tree nodes to unify the structures", + "source_ids": [ + 20 + ] + }, + { + "src_entity_name": "parsed content blocks", + "tgt_entity_name": "tree nodes", + "relation_name": "", + "weight": 7.0, + "description": 
"parsed content blocks are organized into the hierarchical tree structure which consists of tree nodes", + "source_ids": [ + 20 + ] + } + ], + "node_idx": 20 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_200.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_200.json new file mode 100644 index 0000000..c30b306 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_200.json @@ -0,0 +1,545 @@ +{ + "entities": [ + { + "entity_name": "sibei chen", + "entity_type": "PERSON", + "description": "sibei chen is listed as one of the authors of the paper titled haipipe", + "source_ids": [ + 200 + ] + }, + { + "entity_name": "nan tang", + "entity_type": "PERSON", + "description": "nan tang is listed as one of the authors of the paper titled haipipe", + "source_ids": [ + 200 + ] + }, + { + "entity_name": "ju fan", + "entity_type": "PERSON", + "description": "ju fan is listed as one of the authors of the paper titled haipipe", + "source_ids": [ + 200 + ] + }, + { + "entity_name": "xuemi yan", + "entity_type": "PERSON", + "description": "xuemi yan is listed as one of the authors of the paper titled haipipe", + "source_ids": [ + 200 + ] + }, + { + "entity_name": "chengliang chai", + "entity_type": "PERSON", + "description": "chengliang chai is listed as one of the authors of the paper titled haipipe", + "source_ids": [ + 200 + ] + }, + { + "entity_name": "guoliang li", + "entity_type": "PERSON", + "description": "guoliang li is listed as one of the authors of the paper titled haipipe", + "source_ids": [ + 200 + ] + }, + { + "entity_name": "xiaoyong du", + "entity_type": "PERSON", + "description": "xiaoyong du is listed as one of the authors of the paper titled haipipe", + "source_ids": [ + 200 + ] + }, + { + "entity_name": "haipipe", + "entity_type": "PRODUCT", + "description": "haipipe is a system or method described in the paper that combines human generated and machine generated pipelines for data preparation", 
+ "source_ids": [ + 200 + ] + }, + { + "entity_name": "2023", + "entity_type": "DATE", + "description": "2023 is the year the paper was published", + "source_ids": [ + 200 + ] + }, + { + "entity_name": "proceedings of the acm on management of data", + "entity_type": "PUBLICATION_VENUE", + "description": "proceedings of the acm on management of data is the venue where the paper was published", + "source_ids": [ + 200 + ] + }, + { + "entity_name": "1 26", + "entity_type": "MEASUREMENT", + "description": "1 26 refers to the page range of the paper in the publication", + "source_ids": [ + 200 + ] + }, + { + "entity_name": "acm", + "entity_type": "ORGANIZATION", + "description": "acm is the organization associated with the publication venue mentioned in the text", + "source_ids": [ + 200 + ] + }, + { + "entity_name": "data preparation", + "entity_type": "TASK_OR_PROBLEM", + "description": "data preparation is the specific task addressed by the haipipe system described in the text", + "source_ids": [ + 200 + ] + }, + { + "entity_name": "human generated pipelines", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "human generated pipelines are a type of pipeline combined with machine generated ones in the haipipe system", + "source_ids": [ + 200 + ] + }, + { + "entity_name": "machine generated pipelines", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "machine generated pipelines are a type of pipeline combined with human generated ones in the haipipe system", + "source_ids": [ + 200 + ] + }, + { + "entity_name": "1", + "entity_type": "MEASUREMENT", + "description": "1 refers to the page count or a specific metric mentioned in the context of the publication details", + "source_ids": [ + 200 + ] + } + ], + "relations": [ + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "haipipe", + "relation_name": "", + "weight": 9.0, + "description": "sibei chen is an author of the paper describing haipipe", + "source_ids": [ + 200 + ] + }, + { + 
"src_entity_name": "nan tang", + "tgt_entity_name": "haipipe", + "relation_name": "", + "weight": 9.0, + "description": "nan tang is an author of the paper describing haipipe", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "haipipe", + "relation_name": "", + "weight": 9.0, + "description": "ju fan is an author of the paper describing haipipe", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "xuemi yan", + "tgt_entity_name": "haipipe", + "relation_name": "", + "weight": 9.0, + "description": "xuemi yan is an author of the paper describing haipipe", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "chengliang chai", + "tgt_entity_name": "haipipe", + "relation_name": "", + "weight": 9.0, + "description": "chengliang chai is an author of the paper describing haipipe", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "guoliang li", + "tgt_entity_name": "haipipe", + "relation_name": "", + "weight": 9.0, + "description": "guoliang li is an author of the paper describing haipipe", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "xiaoyong du", + "tgt_entity_name": "haipipe", + "relation_name": "", + "weight": 9.0, + "description": "xiaoyong du is an author of the paper describing haipipe", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "haipipe", + "tgt_entity_name": "proceedings of the acm on management of data", + "relation_name": "", + "weight": 10.0, + "description": "haipipe is published in the proceedings of the acm on management of data", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "nan tang", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and nan tang are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "ju fan", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and ju fan are co authors on the same paper", + 
"source_ids": [ + 200 + ] + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "xuemi yan", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and xuemi yan are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "chengliang chai", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and chengliang chai are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "guoliang li", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and guoliang li are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "xiaoyong du", + "relation_name": "", + "weight": 8.0, + "description": "sibei chen and xiaoyong du are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "nan tang", + "tgt_entity_name": "ju fan", + "relation_name": "", + "weight": 8.0, + "description": "nan tang and ju fan are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "nan tang", + "tgt_entity_name": "xuemi yan", + "relation_name": "", + "weight": 8.0, + "description": "nan tang and xuemi yan are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "nan tang", + "tgt_entity_name": "chengliang chai", + "relation_name": "", + "weight": 8.0, + "description": "nan tang and chengliang chai are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "nan tang", + "tgt_entity_name": "guoliang li", + "relation_name": "", + "weight": 8.0, + "description": "nan tang and guoliang li are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "nan tang", + "tgt_entity_name": "xiaoyong du", + "relation_name": "", + "weight": 8.0, + "description": "nan tang and xiaoyong du are co 
authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "xuemi yan", + "relation_name": "", + "weight": 8.0, + "description": "ju fan and xuemi yan are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "chengliang chai", + "relation_name": "", + "weight": 8.0, + "description": "ju fan and chengliang chai are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "guoliang li", + "relation_name": "", + "weight": 8.0, + "description": "ju fan and guoliang li are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "xiaoyong du", + "relation_name": "", + "weight": 8.0, + "description": "ju fan and xiaoyong du are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "xuemi yan", + "tgt_entity_name": "chengliang chai", + "relation_name": "", + "weight": 8.0, + "description": "xuemi yan and chengliang chai are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "xuemi yan", + "tgt_entity_name": "guoliang li", + "relation_name": "", + "weight": 8.0, + "description": "xuemi yan and guoliang li are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "xuemi yan", + "tgt_entity_name": "xiaoyong du", + "relation_name": "", + "weight": 8.0, + "description": "xuemi yan and xiaoyong du are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "chengliang chai", + "tgt_entity_name": "guoliang li", + "relation_name": "", + "weight": 8.0, + "description": "chengliang chai and guoliang li are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "chengliang chai", + "tgt_entity_name": "xiaoyong du", + "relation_name": "", + "weight": 8.0, + 
"description": "chengliang chai and xiaoyong du are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "guoliang li", + "tgt_entity_name": "xiaoyong du", + "relation_name": "", + "weight": 8.0, + "description": "guoliang li and xiaoyong du are co authors on the same paper", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "haipipe", + "tgt_entity_name": "2023", + "relation_name": "", + "weight": 9.0, + "description": "haipipe was published in the year 2023", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "haipipe", + "tgt_entity_name": "data preparation", + "relation_name": "", + "weight": 10.0, + "description": "haipipe is a system designed for data preparation", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "haipipe", + "tgt_entity_name": "human generated pipelines", + "relation_name": "", + "weight": 9.0, + "description": "haipipe combines human generated pipelines as part of its methodology", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "haipipe", + "tgt_entity_name": "machine generated pipelines", + "relation_name": "", + "weight": 9.0, + "description": "haipipe combines machine generated pipelines as part of its methodology", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "haipipe", + "tgt_entity_name": "acm", + "relation_name": "", + "weight": 8.0, + "description": "haipipe is published by the acm organization", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "sibei chen", + "tgt_entity_name": "acm", + "relation_name": "", + "weight": 7.0, + "description": "sibei chen is an author of a paper published by the acm", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "nan tang", + "tgt_entity_name": "acm", + "relation_name": "", + "weight": 7.0, + "description": "nan tang is an author of a paper published by the acm", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "ju fan", + "tgt_entity_name": "acm", + "relation_name": "", + "weight": 7.0, 
+ "description": "ju fan is an author of a paper published by the acm", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "xuemi yan", + "tgt_entity_name": "acm", + "relation_name": "", + "weight": 7.0, + "description": "xuemi yan is an author of a paper published by the acm", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "chengliang chai", + "tgt_entity_name": "acm", + "relation_name": "", + "weight": 7.0, + "description": "chengliang chai is an author of a paper published by the acm", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "guoliang li", + "tgt_entity_name": "acm", + "relation_name": "", + "weight": 7.0, + "description": "guoliang li is an author of a paper published by the acm", + "source_ids": [ + 200 + ] + }, + { + "src_entity_name": "xiaoyong du", + "tgt_entity_name": "acm", + "relation_name": "", + "weight": 7.0, + "description": "xiaoyong du is an author of a paper published by the acm", + "source_ids": [ + 200 + ] + } + ], + "node_idx": 200 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_201.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_201.json new file mode 100644 index 0000000..ca50895 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_201.json @@ -0,0 +1,323 @@ +{ + "entities": [ + { + "entity_name": "jaemin cho", + "entity_type": "PERSON", + "description": "jaemin cho is an author of the 2024 arxiv preprint titled m3docrag", + "source_ids": [ + 201 + ] + }, + { + "entity_name": "debanjan mahata", + "entity_type": "PERSON", + "description": "debanjan mahata is an author of the 2024 arxiv preprint titled m3docrag", + "source_ids": [ + 201 + ] + }, + { + "entity_name": "ozan irsoy", + "entity_type": "PERSON", + "description": "ozan irsoy is an author of the 2024 arxiv preprint titled m3docrag", + "source_ids": [ + 201 + ] + }, + { + "entity_name": "yujie he", + "entity_type": "PERSON", + "description": "yujie he is an author of the 2024 
arxiv preprint titled m3docrag", + "source_ids": [ + 201 + ] + }, + { + "entity_name": "mohit bansal", + "entity_type": "PERSON", + "description": "mohit bansal is an author of the 2024 arxiv preprint titled m3docrag", + "source_ids": [ + 201 + ] + }, + { + "entity_name": "m3docrag", + "entity_type": "PRODUCT", + "description": "m3docrag is a multi modal retrieval system designed for multi page multidocument understanding", + "source_ids": [ + 201 + ] + }, + { + "entity_name": "arxiv", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv is the platform where the preprint m3docrag was published", + "source_ids": [ + 201 + ] + }, + { + "entity_name": "2024", + "entity_type": "DATE", + "description": "2024 is the year the m3docrag preprint was published", + "source_ids": [ + 201 + ] + }, + { + "entity_name": "arxiv 2411 04952", + "entity_type": "FILE_TYPE", + "description": "arxiv 2411 04952 is the specific identifier for the m3docrag preprint", + "source_ids": [ + 201 + ] + }, + { + "entity_name": "multi modal retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "multi modal retrieval is the technique described as what is needed for multi page multidocument understanding in the m3docrag paper", + "source_ids": [ + 201 + ] + }, + { + "entity_name": "multi page multidocument understanding", + "entity_type": "TASK_OR_PROBLEM", + "description": "multi page multidocument understanding is the specific task or problem that the m3docrag system addresses", + "source_ids": [ + 201 + ] + }, + { + "entity_name": "arxiv preprint", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv preprint is the type of publication venue where the m3docrag paper was released", + "source_ids": [ + 201 + ] + } + ], + "relations": [ + { + "src_entity_name": "jaemin cho", + "tgt_entity_name": "m3docrag", + "relation_name": "", + "weight": 10.0, + "description": "jaemin cho is an author of the m3docrag preprint", + "source_ids": [ + 201 + ] + }, + { + 
"src_entity_name": "debanjan mahata", + "tgt_entity_name": "m3docrag", + "relation_name": "", + "weight": 10.0, + "description": "debanjan mahata is an author of the m3docrag preprint", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "ozan irsoy", + "tgt_entity_name": "m3docrag", + "relation_name": "", + "weight": 10.0, + "description": "ozan irsoy is an author of the m3docrag preprint", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "yujie he", + "tgt_entity_name": "m3docrag", + "relation_name": "", + "weight": 10.0, + "description": "yujie he is an author of the m3docrag preprint", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "mohit bansal", + "tgt_entity_name": "m3docrag", + "relation_name": "", + "weight": 10.0, + "description": "mohit bansal is an author of the m3docrag preprint", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "m3docrag", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "m3docrag was published as a preprint on arxiv", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "m3docrag", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "m3docrag was published in the year 2024", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "m3docrag", + "tgt_entity_name": "arxiv 2411 04952", + "relation_name": "", + "weight": 9.0, + "description": "m3docrag is identified by the file type arxiv 2411 04952", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "jaemin cho", + "tgt_entity_name": "debanjan mahata", + "relation_name": "", + "weight": 8.0, + "description": "jaemin cho and debanjan mahata are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "jaemin cho", + "tgt_entity_name": "ozan irsoy", + "relation_name": "", + "weight": 8.0, + "description": "jaemin cho and ozan irsoy are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": 
"jaemin cho", + "tgt_entity_name": "yujie he", + "relation_name": "", + "weight": 8.0, + "description": "jaemin cho and yujie he are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "jaemin cho", + "tgt_entity_name": "mohit bansal", + "relation_name": "", + "weight": 8.0, + "description": "jaemin cho and mohit bansal are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "debanjan mahata", + "tgt_entity_name": "ozan irsoy", + "relation_name": "", + "weight": 8.0, + "description": "debanjan mahata and ozan irsoy are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "debanjan mahata", + "tgt_entity_name": "yujie he", + "relation_name": "", + "weight": 8.0, + "description": "debanjan mahata and yujie he are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "debanjan mahata", + "tgt_entity_name": "mohit bansal", + "relation_name": "", + "weight": 8.0, + "description": "debanjan mahata and mohit bansal are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "ozan irsoy", + "tgt_entity_name": "yujie he", + "relation_name": "", + "weight": 8.0, + "description": "ozan irsoy and yujie he are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "ozan irsoy", + "tgt_entity_name": "mohit bansal", + "relation_name": "", + "weight": 8.0, + "description": "ozan irsoy and mohit bansal are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "yujie he", + "tgt_entity_name": "mohit bansal", + "relation_name": "", + "weight": 8.0, + "description": "yujie he and mohit bansal are co authors on the m3docrag preprint", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "m3docrag", + "tgt_entity_name": "multi modal retrieval", + "relation_name": "", + "weight": 9.0, + 
"description": "m3docrag utilizes multi modal retrieval as its core technique", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "m3docrag", + "tgt_entity_name": "multi page multidocument understanding", + "relation_name": "", + "weight": 10.0, + "description": "m3docrag is designed to solve the problem of multi page multidocument understanding", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "m3docrag", + "tgt_entity_name": "arxiv preprint", + "relation_name": "", + "weight": 9.0, + "description": "m3docrag was published as an arxiv preprint", + "source_ids": [ + 201 + ] + }, + { + "src_entity_name": "multi modal retrieval", + "tgt_entity_name": "multi page multidocument understanding", + "relation_name": "", + "weight": 8.0, + "description": "multi modal retrieval is identified as the necessary method for achieving multi page multidocument understanding", + "source_ids": [ + 201 + ] + } + ], + "node_idx": 201 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_202.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_202.json new file mode 100644 index 0000000..fc023aa --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_202.json @@ -0,0 +1,469 @@ +{ + "entities": [ + { + "entity_name": "vassilis christophides", + "entity_type": "PERSON", + "description": "vassilis christophides is an author of the 2020 paper on end to end entity resolution for big data", + "source_ids": [ + 202 + ] + }, + { + "entity_name": "vasilis efthymiou", + "entity_type": "PERSON", + "description": "vasilis efthymiou is an author of the 2020 paper on end to end entity resolution for big data", + "source_ids": [ + 202 + ] + }, + { + "entity_name": "themis palpanas", + "entity_type": "PERSON", + "description": "themis palpanas is an author of the 2020 paper on end to end entity resolution for big data", + "source_ids": [ + 202 + ] + }, + { + "entity_name": "george papadakis", + "entity_type": "PERSON", + 
"description": "george papadakis is an author of the 2020 paper on end to end entity resolution for big data", + "source_ids": [ + 202 + ] + }, + { + "entity_name": "kostas stefanidis", + "entity_type": "PERSON", + "description": "kostas stefanidis is an author of the 2020 paper on end to end entity resolution for big data", + "source_ids": [ + 202 + ] + }, + { + "entity_name": "2020", + "entity_type": "DATE", + "description": "2020 is the year the paper was published", + "source_ids": [ + 202 + ] + }, + { + "entity_name": "acm computing surveys", + "entity_type": "PUBLICATION_VENUE", + "description": "acm computing surveys is the journal where the paper was published", + "source_ids": [ + 202 + ] + }, + { + "entity_name": "an overview of end to end entity resolution for big data", + "entity_type": "BOOK", + "description": "an overview of end to end entity resolution for big data is the title of the paper discussed in the text", + "source_ids": [ + 202 + ] + }, + { + "entity_name": "csur", + "entity_type": "PUBLICATION_VENUE", + "description": "csur is the abbreviation for acm computing surveys the journal where the paper was published", + "source_ids": [ + 202 + ] + }, + { + "entity_name": "53", + "entity_type": "MEASUREMENT", + "description": "53 is the volume number of the journal acm computing surveys where the paper was published", + "source_ids": [ + 202 + ] + }, + { + "entity_name": "6", + "entity_type": "MEASUREMENT", + "description": "6 is the issue number of the journal acm computing surveys where the paper was published", + "source_ids": [ + 202 + ] + }, + { + "entity_name": "1 42", + "entity_type": "MEASUREMENT", + "description": "1 42 represents the page range of the paper within the journal", + "source_ids": [ + 202 + ] + }, + { + "entity_name": "end to end entity resolution", + "entity_type": "TASK_OR_PROBLEM", + "description": "end to end entity resolution is the specific technical problem addressed in the paper", + "source_ids": [ + 202 + ] + }, + 
{ + "entity_name": "big data", + "entity_type": "DATASET_OR_CORPUS", + "description": "big data is the domain or subject matter discussed in the paper", + "source_ids": [ + 202 + ] + } + ], + "relations": [ + { + "src_entity_name": "vassilis christophides", + "tgt_entity_name": "vasilis efthymiou", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "vassilis christophides", + "tgt_entity_name": "themis palpanas", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "vassilis christophides", + "tgt_entity_name": "george papadakis", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "vassilis christophides", + "tgt_entity_name": "kostas stefanidis", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "vassilis christophides", + "tgt_entity_name": "2020", + "relation_name": "", + "weight": 8.0, + "description": "vassilis christophides is an author of a paper published in 2020", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "vassilis christophides", + "tgt_entity_name": "acm computing surveys", + "relation_name": "", + "weight": 8.0, + "description": "vassilis christophides is an author of a paper published in acm computing surveys", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "vassilis christophides", + "tgt_entity_name": "an overview of end to end entity resolution for big data", + "relation_name": "", + "weight": 10.0, + "description": "vassilis christophides is an author of the paper titled an overview of end to end entity resolution for big data", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "vasilis efthymiou", 
+ "tgt_entity_name": "themis palpanas", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "vasilis efthymiou", + "tgt_entity_name": "george papadakis", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "vasilis efthymiou", + "tgt_entity_name": "kostas stefanidis", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "vasilis efthymiou", + "tgt_entity_name": "2020", + "relation_name": "", + "weight": 8.0, + "description": "vasilis efthymiou is an author of a paper published in 2020", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "vasilis efthymiou", + "tgt_entity_name": "acm computing surveys", + "relation_name": "", + "weight": 8.0, + "description": "vasilis efthymiou is an author of a paper published in acm computing surveys", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "vasilis efthymiou", + "tgt_entity_name": "an overview of end to end entity resolution for big data", + "relation_name": "", + "weight": 10.0, + "description": "vasilis efthymiou is an author of the paper titled an overview of end to end entity resolution for big data", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "themis palpanas", + "tgt_entity_name": "george papadakis", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "themis palpanas", + "tgt_entity_name": "kostas stefanidis", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "themis palpanas", + "tgt_entity_name": "2020", + "relation_name": "", + "weight": 8.0, + "description": 
"themis palpanas is an author of a paper published in 2020", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "themis palpanas", + "tgt_entity_name": "acm computing surveys", + "relation_name": "", + "weight": 8.0, + "description": "themis palpanas is an author of a paper published in acm computing surveys", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "themis palpanas", + "tgt_entity_name": "an overview of end to end entity resolution for big data", + "relation_name": "", + "weight": 10.0, + "description": "themis palpanas is an author of the paper titled an overview of end to end entity resolution for big data", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "george papadakis", + "tgt_entity_name": "kostas stefanidis", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "george papadakis", + "tgt_entity_name": "2020", + "relation_name": "", + "weight": 8.0, + "description": "george papadakis is an author of a paper published in 2020", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "george papadakis", + "tgt_entity_name": "acm computing surveys", + "relation_name": "", + "weight": 8.0, + "description": "george papadakis is an author of a paper published in acm computing surveys", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "george papadakis", + "tgt_entity_name": "an overview of end to end entity resolution for big data", + "relation_name": "", + "weight": 10.0, + "description": "george papadakis is an author of the paper titled an overview of end to end entity resolution for big data", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "kostas stefanidis", + "tgt_entity_name": "2020", + "relation_name": "", + "weight": 8.0, + "description": "kostas stefanidis is an author of a paper published in 2020", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "kostas stefanidis", + 
"tgt_entity_name": "acm computing surveys", + "relation_name": "", + "weight": 8.0, + "description": "kostas stefanidis is an author of a paper published in acm computing surveys", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "kostas stefanidis", + "tgt_entity_name": "an overview of end to end entity resolution for big data", + "relation_name": "", + "weight": 10.0, + "description": "kostas stefanidis is an author of the paper titled an overview of end to end entity resolution for big data", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "2020", + "tgt_entity_name": "acm computing surveys", + "relation_name": "", + "weight": 9.0, + "description": "the paper was published in acm computing surveys in the year 2020", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "2020", + "tgt_entity_name": "an overview of end to end entity resolution for big data", + "relation_name": "", + "weight": 10.0, + "description": "the paper titled an overview of end to end entity resolution for big data was published in 2020", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "acm computing surveys", + "tgt_entity_name": "an overview of end to end entity resolution for big data", + "relation_name": "", + "weight": 10.0, + "description": "acm computing surveys is the publication venue for the paper titled an overview of end to end entity resolution for big data", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "acm computing surveys", + "tgt_entity_name": "csur", + "relation_name": "", + "weight": 10.0, + "description": "csur is the abbreviation used for the publication venue acm computing surveys", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "acm computing surveys", + "tgt_entity_name": "53", + "relation_name": "", + "weight": 9.0, + "description": "the paper was published in volume 53 of acm computing surveys", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "acm computing surveys", + "tgt_entity_name": "6", + 
"relation_name": "", + "weight": 9.0, + "description": "the paper was published in issue 6 of acm computing surveys", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "acm computing surveys", + "tgt_entity_name": "1 42", + "relation_name": "", + "weight": 9.0, + "description": "the paper spans pages 1 42 in acm computing surveys", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "an overview of end to end entity resolution for big data", + "tgt_entity_name": "end to end entity resolution", + "relation_name": "", + "weight": 10.0, + "description": "the paper title indicates it provides an overview of the task of end to end entity resolution", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "an overview of end to end entity resolution for big data", + "tgt_entity_name": "big data", + "relation_name": "", + "weight": 10.0, + "description": "the paper title indicates it discusses the application of entity resolution to big data", + "source_ids": [ + 202 + ] + }, + { + "src_entity_name": "end to end entity resolution", + "tgt_entity_name": "big data", + "relation_name": "", + "weight": 8.0, + "description": "the text links the task of end to end entity resolution with the domain of big data", + "source_ids": [ + 202 + ] + } + ], + "node_idx": 202 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_203.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_203.json new file mode 100644 index 0000000..cfe9861 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_203.json @@ -0,0 +1,475 @@ +{ + "entities": [ + { + "entity_name": "gheorghe comanici", + "entity_type": "PERSON", + "description": "gheorghe comanici is listed as one of the authors of the paper", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "eric bieber", + "entity_type": "PERSON", + "description": "eric bieber is listed as one of the authors of the paper", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "mike 
schaekermann", + "entity_type": "PERSON", + "description": "mike schaekermann is listed as one of the authors of the paper", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "ice pasupat", + "entity_type": "PERSON", + "description": "ice pasupat is listed as one of the authors of the paper", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "noveen sachdeva", + "entity_type": "PERSON", + "description": "noveen sachdeva is listed as one of the authors of the paper", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "inderjit dhillon", + "entity_type": "PERSON", + "description": "inderjit dhillon is listed as one of the authors of the paper", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "marcel blistein", + "entity_type": "PERSON", + "description": "marcel blistein is listed as one of the authors of the paper", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "ori ram", + "entity_type": "PERSON", + "description": "ori ram is listed as one of the authors of the paper", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "dan zhang", + "entity_type": "PERSON", + "description": "dan zhang is listed as one of the authors of the paper", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "evan rosen", + "entity_type": "PERSON", + "description": "evan rosen is listed as one of the authors of the paper", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "gemini 2 5", + "entity_type": "PRODUCT", + "description": "gemini 2 5 is a product described as pushing the frontier with advanced reasoning multimodality long context and next generation agentic capabilities", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "arxiv", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv is the venue where the preprint is published", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "2025", + "entity_type": "DATE", + "description": "2025 is the year the paper was published", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "arxiv 2507 
06261", + "entity_type": "FILE_TYPE", + "description": "arxiv 2507 06261 is the specific identifier for the preprint document", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "et al", + "entity_type": "PERSON", + "description": "et al indicates additional authors not explicitly listed in the text", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "advanced reasoning", + "entity_type": "TASK_OR_PROBLEM", + "description": "advanced reasoning is a capability of gemini 2 5 mentioned in the text", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "multimodality", + "entity_type": "TASK_OR_PROBLEM", + "description": "multimodality is a capability of gemini 2 5 mentioned in the text", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "long context", + "entity_type": "TASK_OR_PROBLEM", + "description": "long context is a capability of gemini 2 5 mentioned in the text", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "next generation agentic capabilities", + "entity_type": "TASK_OR_PROBLEM", + "description": "next generation agentic capabilities are capabilities of gemini 2 5 mentioned in the text", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "gemini 2 5 pushing the frontier with advanced reasoning multimodality long context and next generation agentic capabilities", + "entity_type": "BOOK", + "description": "gemini 2 5 pushing the frontier with advanced reasoning multimodality long context and next generation agentic capabilities is the title of the paper", + "source_ids": [ + 203 + ] + }, + { + "entity_name": "arxiv preprint", + "entity_type": "FILE_TYPE", + "description": "arxiv preprint describes the type of document published", + "source_ids": [ + 203 + ] + } + ], + "relations": [ + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "gheorghe comanici is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ] + }, + { + 
"src_entity_name": "eric bieber", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "eric bieber is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "mike schaekermann", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "mike schaekermann is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "ice pasupat", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "ice pasupat is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "noveen sachdeva", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "noveen sachdeva is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "inderjit dhillon", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "inderjit dhillon is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "marcel blistein", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "marcel blistein is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "ori ram", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "ori ram is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "dan zhang", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "dan zhang is an author of the paper describing gemini 2 5", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "evan rosen", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 9.0, + "description": "evan rosen is an author 
of the paper describing gemini 2 5", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici is an author of the paper published on arxiv", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gemini 2 5", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "the paper describing gemini 2 5 is published on arxiv", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gemini 2 5", + "tgt_entity_name": "2025", + "relation_name": "", + "weight": 8.0, + "description": "gemini 2 5 is the subject of a paper published in 2025", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "arxiv 2507 06261", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 10.0, + "description": "arxiv 2507 06261 is the specific identifier for the paper on arxiv", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "eric bieber", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and eric bieber are co authors on the same paper", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "mike schaekermann", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and mike schaekermann are co authors on the same paper", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "ice pasupat", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and ice pasupat are co authors on the same paper", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "noveen sachdeva", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and noveen sachdeva are co authors on the same paper", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gheorghe 
comanici", + "tgt_entity_name": "inderjit dhillon", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and inderjit dhillon are co authors on the same paper", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "marcel blistein", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and marcel blistein are co authors on the same paper", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "ori ram", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and ori ram are co authors on the same paper", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "dan zhang", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and dan zhang are co authors on the same paper", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "evan rosen", + "relation_name": "", + "weight": 8.0, + "description": "gheorghe comanici and evan rosen are co authors on the same paper", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gheorghe comanici", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 9.0, + "description": "gheorghe comanici is listed before et al indicating they are among the authors represented by the abbreviation", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gemini 2 5", + "tgt_entity_name": "advanced reasoning", + "relation_name": "", + "weight": 10.0, + "description": "gemini 2 5 is described as having advanced reasoning capabilities", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gemini 2 5", + "tgt_entity_name": "multimodality", + "relation_name": "", + "weight": 10.0, + "description": "gemini 2 5 is described as having multimodality capabilities", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gemini 2 5", + "tgt_entity_name": "long 
context", + "relation_name": "", + "weight": 10.0, + "description": "gemini 2 5 is described as having long context capabilities", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gemini 2 5", + "tgt_entity_name": "next generation agentic capabilities", + "relation_name": "", + "weight": 10.0, + "description": "gemini 2 5 is described as having next generation agentic capabilities", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "gemini 2 5 pushing the frontier with advanced reasoning multimodality long context and next generation agentic capabilities", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 10.0, + "description": "the title refers to the product gemini 2 5", + "source_ids": [ + 203 + ] + }, + { + "src_entity_name": "arxiv 2507 06261", + "tgt_entity_name": "arxiv preprint", + "relation_name": "", + "weight": 10.0, + "description": "arxiv 2507 06261 is identified as an arxiv preprint", + "source_ids": [ + 203 + ] + } + ], + "node_idx": 203 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_204.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_204.json new file mode 100644 index 0000000..f5c4695 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_204.json @@ -0,0 +1,273 @@ +{ + "entities": [ + { + "entity_name": "pradeep dasigi", + "entity_type": "PERSON", + "description": "pradeep dasigi is one of the authors of the 2021 arxiv preprint", + "source_ids": [ + 204 + ] + }, + { + "entity_name": "kyle lo", + "entity_type": "PERSON", + "description": "kyle lo is one of the authors of the 2021 arxiv preprint", + "source_ids": [ + 204 + ] + }, + { + "entity_name": "iz beltagy", + "entity_type": "PERSON", + "description": "iz beltagy is one of the authors of the 2021 arxiv preprint", + "source_ids": [ + 204 + ] + }, + { + "entity_name": "arman cohan", + "entity_type": "PERSON", + "description": "arman cohan is one of the authors of the 2021 arxiv preprint", 
+ "source_ids": [ + 204 + ] + }, + { + "entity_name": "noah a smith", + "entity_type": "PERSON", + "description": "noah a smith is one of the authors of the 2021 arxiv preprint", + "source_ids": [ + 204 + ] + }, + { + "entity_name": "matt gardner", + "entity_type": "PERSON", + "description": "matt gardner is one of the authors of the 2021 arxiv preprint", + "source_ids": [ + 204 + ] + }, + { + "entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "entity_type": "PRODUCT", + "description": "a dataset of information seeking questions and answers anchored in research papers is the title of the work described in the text", + "source_ids": [ + 204 + ] + }, + { + "entity_name": "arxiv preprint arxiv 2105 03011", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv preprint arxiv 2105 03011 is the specific publication venue and identifier for the work", + "source_ids": [ + 204 + ] + }, + { + "entity_name": "2021", + "entity_type": "DATE", + "description": "2021 is the year the preprint was published", + "source_ids": [ + 204 + ] + }, + { + "entity_name": "research papers", + "entity_type": "DATASET_OR_CORPUS", + "description": "research papers are the source material from which the information seeking questions and answers are anchored", + "source_ids": [ + 204 + ] + }, + { + "entity_name": "information seeking questions", + "entity_type": "TASK_OR_PROBLEM", + "description": "information seeking questions are the specific type of queries included in the dataset", + "source_ids": [ + 204 + ] + }, + { + "entity_name": "answers", + "entity_type": "TASK_OR_PROBLEM", + "description": "answers are the responses paired with the questions in the dataset", + "source_ids": [ + 204 + ] + } + ], + "relations": [ + { + "src_entity_name": "pradeep dasigi", + "tgt_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "relation_name": "", + "weight": 10.0, + "description": "pradeep 
dasigi is an author of the dataset work", + "source_ids": [ + 204 + ] + }, + { + "src_entity_name": "kyle lo", + "tgt_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "relation_name": "", + "weight": 10.0, + "description": "kyle lo is an author of the dataset work", + "source_ids": [ + 204 + ] + }, + { + "src_entity_name": "iz beltagy", + "tgt_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "relation_name": "", + "weight": 10.0, + "description": "iz beltagy is an author of the dataset work", + "source_ids": [ + 204 + ] + }, + { + "src_entity_name": "arman cohan", + "tgt_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "relation_name": "", + "weight": 10.0, + "description": "arman cohan is an author of the dataset work", + "source_ids": [ + 204 + ] + }, + { + "src_entity_name": "noah a smith", + "tgt_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "relation_name": "", + "weight": 10.0, + "description": "noah a smith is an author of the dataset work", + "source_ids": [ + 204 + ] + }, + { + "src_entity_name": "matt gardner", + "tgt_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "relation_name": "", + "weight": 10.0, + "description": "matt gardner is an author of the dataset work", + "source_ids": [ + 204 + ] + }, + { + "src_entity_name": "pradeep dasigi", + "tgt_entity_name": "arxiv preprint arxiv 2105 03011", + "relation_name": "", + "weight": 9.0, + "description": "pradeep dasigi is an author of the work published in this venue", + "source_ids": [ + 204 + ] + }, + { + "src_entity_name": "kyle lo", + "tgt_entity_name": "arxiv preprint arxiv 2105 03011", + "relation_name": "", + "weight": 9.0, + "description": "kyle lo is an author of the work published in this venue", + "source_ids": [ + 204 + ] + }, + 
{ + "src_entity_name": "iz beltagy", + "tgt_entity_name": "arxiv preprint arxiv 2105 03011", + "relation_name": "", + "weight": 9.0, + "description": "iz beltagy is an author of the work published in this venue", + "source_ids": [ + 204 + ] + }, + { + "src_entity_name": "arman cohan", + "tgt_entity_name": "arxiv preprint arxiv 2105 03011", + "relation_name": "", + "weight": 9.0, + "description": "arman cohan is an author of the work published in this venue", + "source_ids": [ + 204 + ] + }, + { + "src_entity_name": "noah a smith", + "tgt_entity_name": "arxiv preprint arxiv 2105 03011", + "relation_name": "", + "weight": 9.0, + "description": "noah a smith is an author of the work published in this venue", + "source_ids": [ + 204 + ] + }, + { + "src_entity_name": "matt gardner", + "tgt_entity_name": "arxiv preprint arxiv 2105 03011", + "relation_name": "", + "weight": 9.0, + "description": "matt gardner is an author of the work published in this venue", + "source_ids": [ + 204 + ] + }, + { + "src_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "tgt_entity_name": "arxiv preprint arxiv 2105 03011", + "relation_name": "", + "weight": 10.0, + "description": "the dataset work is published as the arxiv preprint arxiv 2105 03011", + "source_ids": [ + 204 + ] + }, + { + "src_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "tgt_entity_name": "2021", + "relation_name": "", + "weight": 10.0, + "description": "the dataset work was published in the year 2021", + "source_ids": [ + 204 + ] + }, + { + "src_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "tgt_entity_name": "research papers", + "relation_name": "", + "weight": 10.0, + "description": "the dataset is anchored in research papers meaning it derives its content from them", + "source_ids": [ + 204 + ] + }, + { + "src_entity_name": "a dataset of information seeking 
questions and answers anchored in research papers", + "tgt_entity_name": "information seeking questions", + "relation_name": "", + "weight": 10.0, + "description": "the dataset consists of information seeking questions", + "source_ids": [ + 204 + ] + }, + { + "src_entity_name": "a dataset of information seeking questions and answers anchored in research papers", + "tgt_entity_name": "answers", + "relation_name": "", + "weight": 10.0, + "description": "the dataset consists of answers corresponding to the questions", + "source_ids": [ + 204 + ] + } + ], + "node_idx": 204 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_205.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_205.json new file mode 100644 index 0000000..ef7429b --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_205.json @@ -0,0 +1,237 @@ +{ + "entities": [ + { + "entity_name": "xavier daull", + "entity_type": "PERSON", + "description": "xavier daull is one of the authors of the 2023 survey on complex qa and language models hybrid architectures", + "source_ids": [ + 205 + ] + }, + { + "entity_name": "patrice bellot", + "entity_type": "PERSON", + "description": "patrice bellot is one of the authors of the 2023 survey on complex qa and language models hybrid architectures", + "source_ids": [ + 205 + ] + }, + { + "entity_name": "emmanuel bruno", + "entity_type": "PERSON", + "description": "emmanuel bruno is one of the authors of the 2023 survey on complex qa and language models hybrid architectures", + "source_ids": [ + 205 + ] + }, + { + "entity_name": "vincent martin", + "entity_type": "PERSON", + "description": "vincent martin is one of the authors of the 2023 survey on complex qa and language models hybrid architectures", + "source_ids": [ + 205 + ] + }, + { + "entity_name": "elisabeth murisasco", + "entity_type": "PERSON", + "description": "elisabeth murisasco is one of the authors of the 2023 survey on complex qa and language models 
hybrid architectures", + "source_ids": [ + 205 + ] + }, + { + "entity_name": "2023", + "entity_type": "DATE", + "description": "2023 is the year the survey was published and the year associated with the arxiv preprint", + "source_ids": [ + 205 + ] + }, + { + "entity_name": "arxiv preprint arxiv 2302 09051", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv preprint arxiv 2302 09051 is the specific identifier for the preprint where the survey was published", + "source_ids": [ + 205 + ] + }, + { + "entity_name": "complex qa and language models hybrid architectures survey", + "entity_type": "BOOK", + "description": "complex qa and language models hybrid architectures survey is the title of the work authored by the listed individuals", + "source_ids": [ + 205 + ] + }, + { + "entity_name": "arxiv", + "entity_type": "ORGANIZATION", + "description": "arxiv is the organization or platform hosting the preprint arxiv 2302 09051", + "source_ids": [ + 205 + ] + }, + { + "entity_name": "2302 09051", + "entity_type": "FILE_TYPE", + "description": "2302 09051 is the unique identifier code for the specific preprint document", + "source_ids": [ + 205 + ] + } + ], + "relations": [ + { + "src_entity_name": "xavier daull", + "tgt_entity_name": "patrice bellot", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ] + }, + { + "src_entity_name": "xavier daull", + "tgt_entity_name": "emmanuel bruno", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ] + }, + { + "src_entity_name": "xavier daull", + "tgt_entity_name": "vincent martin", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ] + }, + { + "src_entity_name": "xavier daull", + "tgt_entity_name": "elisabeth murisasco", + "relation_name": "", + "weight": 9.0, + 
"description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ] + }, + { + "src_entity_name": "patrice bellot", + "tgt_entity_name": "emmanuel bruno", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ] + }, + { + "src_entity_name": "patrice bellot", + "tgt_entity_name": "vincent martin", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ] + }, + { + "src_entity_name": "patrice bellot", + "tgt_entity_name": "elisabeth murisasco", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ] + }, + { + "src_entity_name": "emmanuel bruno", + "tgt_entity_name": "vincent martin", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ] + }, + { + "src_entity_name": "emmanuel bruno", + "tgt_entity_name": "elisabeth murisasco", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ] + }, + { + "src_entity_name": "vincent martin", + "tgt_entity_name": "elisabeth murisasco", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same survey document", + "source_ids": [ + 205 + ] + }, + { + "src_entity_name": "xavier daull", + "tgt_entity_name": "2023", + "relation_name": "", + "weight": 8.0, + "description": "xavier daull is an author of the work published in 2023", + "source_ids": [ + 205 + ] + }, + { + "src_entity_name": "xavier daull", + "tgt_entity_name": "arxiv preprint arxiv 2302 09051", + "relation_name": "", + "weight": 8.0, + "description": "xavier daull is an author of the work identified by this preprint number", + "source_ids": [ + 205 + ] + }, + { + "src_entity_name": "xavier daull", + 
"tgt_entity_name": "complex qa and language models hybrid architectures survey", + "relation_name": "", + "weight": 10.0, + "description": "xavier daull is the author of this specific survey title", + "source_ids": [ + 205 + ] + }, + { + "src_entity_name": "arxiv preprint arxiv 2302 09051", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "the preprint is hosted by the arxiv organization", + "source_ids": [ + 205 + ] + }, + { + "src_entity_name": "arxiv preprint arxiv 2302 09051", + "tgt_entity_name": "2302 09051", + "relation_name": "", + "weight": 10.0, + "description": "the preprint identifier contains the specific code 2302 09051", + "source_ids": [ + 205 + ] + } + ], + "node_idx": 205 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_206.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_206.json new file mode 100644 index 0000000..f9e4b6c --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_206.json @@ -0,0 +1,735 @@ +{ + "entities": [ + { + "entity_name": "darren edge", + "entity_type": "PERSON", + "description": "darren edge is listed as one of the authors of the 2024 arxiv preprint", + "source_ids": [ + 206 + ] + }, + { + "entity_name": "ha trinh", + "entity_type": "PERSON", + "description": "ha trinh is listed as one of the authors of the 2024 arxiv preprint", + "source_ids": [ + 206 + ] + }, + { + "entity_name": "newman cheng", + "entity_type": "PERSON", + "description": "newman cheng is listed as one of the authors of the 2024 arxiv preprint", + "source_ids": [ + 206 + ] + }, + { + "entity_name": "joshua bradley", + "entity_type": "PERSON", + "description": "joshua bradley is listed as one of the authors of the 2024 arxiv preprint", + "source_ids": [ + 206 + ] + }, + { + "entity_name": "alex chao", + "entity_type": "PERSON", + "description": "alex chao is listed as one of the authors of the 2024 arxiv preprint", + "source_ids": [ + 206 + ] + }, + { + 
"entity_name": "apurva mody", + "entity_type": "PERSON", + "description": "apurva mody is listed as one of the authors of the 2024 arxiv preprint", + "source_ids": [ + 206 + ] + }, + { + "entity_name": "steven truitt", + "entity_type": "PERSON", + "description": "steven truitt is listed as one of the authors of the 2024 arxiv preprint", + "source_ids": [ + 206 + ] + }, + { + "entity_name": "jonathan larson", + "entity_type": "PERSON", + "description": "jonathan larson is listed as one of the authors of the 2024 arxiv preprint", + "source_ids": [ + 206 + ] + }, + { + "entity_name": "2024", + "entity_type": "DATE", + "description": "2024 is the year the arxiv preprint was published", + "source_ids": [ + 206 + ] + }, + { + "entity_name": "from local to global a graph rag approach to query focused summarization", + "entity_type": "BOOK", + "description": "from local to global a graph rag approach to query focused summarization is the title of the arxiv preprint", + "source_ids": [ + 206 + ] + }, + { + "entity_name": "arxiv", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv is the venue where the preprint arxiv 2404 16130 was published", + "source_ids": [ + 206 + ] + }, + { + "entity_name": "arxiv 2404 16130", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv 2404 16130 is the specific identifier for the preprint document", + "source_ids": [ + 206 + ] + }, + { + "entity_name": "graph rag", + "entity_type": "TECHNOLOGY", + "description": "graph rag is a technology approach mentioned in the title of the paper as a method for query focused summarization", + "source_ids": [ + 206 + ] + }, + { + "entity_name": "query focused summarization", + "entity_type": "TASK_OR_PROBLEM", + "description": "query focused summarization is the specific task or problem addressed by the graph rag approach in the paper", + "source_ids": [ + 206 + ] + }, + { + "entity_name": "local", + "entity_type": "CONCEPT", + "description": "local refers to a scope or scale 
mentioned in the paper s title contrasting with global", + "source_ids": [ + 206 + ] + }, + { + "entity_name": "global", + "entity_type": "CONCEPT", + "description": "global refers to a scope or scale mentioned in the paper s title contrasting with local", + "source_ids": [ + 206 + ] + } + ], + "relations": [ + { + "src_entity_name": "darren edge", + "tgt_entity_name": "ha trinh", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "newman cheng", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "joshua bradley", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "alex chao", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "apurva mody", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "steven truitt", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "jonathan larson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "newman cheng", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + 
}, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "joshua bradley", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "alex chao", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "apurva mody", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "steven truitt", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "jonathan larson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "newman cheng", + "tgt_entity_name": "joshua bradley", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "newman cheng", + "tgt_entity_name": "alex chao", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "newman cheng", + "tgt_entity_name": "apurva mody", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "newman cheng", + "tgt_entity_name": "steven truitt", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "newman cheng", + "tgt_entity_name": "jonathan larson", + 
"relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "joshua bradley", + "tgt_entity_name": "alex chao", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "joshua bradley", + "tgt_entity_name": "apurva mody", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "joshua bradley", + "tgt_entity_name": "steven truitt", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "joshua bradley", + "tgt_entity_name": "jonathan larson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "alex chao", + "tgt_entity_name": "apurva mody", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "alex chao", + "tgt_entity_name": "steven truitt", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "alex chao", + "tgt_entity_name": "jonathan larson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "apurva mody", + "tgt_entity_name": "steven truitt", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "apurva mody", + "tgt_entity_name": "jonathan larson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors 
on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "steven truitt", + "tgt_entity_name": "jonathan larson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "darren edge is an author of a document published in 2024", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "ha trinh is an author of a document published in 2024", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "newman cheng", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "newman cheng is an author of a document published in 2024", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "joshua bradley", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "joshua bradley is an author of a document published in 2024", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "alex chao", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "alex chao is an author of a document published in 2024", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "apurva mody", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "apurva mody is an author of a document published in 2024", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "steven truitt", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "steven truitt is an author of a document published in 2024", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "jonathan larson", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "jonathan larson is an author of a document published in 
2024", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "from local to global a graph rag approach to query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "darren edge is an author of the document titled from local to global a graph rag approach to query focused summarization", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "from local to global a graph rag approach to query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "ha trinh is an author of the document titled from local to global a graph rag approach to query focused summarization", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "newman cheng", + "tgt_entity_name": "from local to global a graph rag approach to query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "newman cheng is an author of the document titled from local to global a graph rag approach to query focused summarization", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "joshua bradley", + "tgt_entity_name": "from local to global a graph rag approach to query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "joshua bradley is an author of the document titled from local to global a graph rag approach to query focused summarization", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "alex chao", + "tgt_entity_name": "from local to global a graph rag approach to query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "alex chao is an author of the document titled from local to global a graph rag approach to query focused summarization", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "apurva mody", + "tgt_entity_name": "from local to global a graph rag approach to query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": 
"apurva mody is an author of the document titled from local to global a graph rag approach to query focused summarization", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "steven truitt", + "tgt_entity_name": "from local to global a graph rag approach to query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "steven truitt is an author of the document titled from local to global a graph rag approach to query focused summarization", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "jonathan larson", + "tgt_entity_name": "from local to global a graph rag approach to query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "jonathan larson is an author of the document titled from local to global a graph rag approach to query focused summarization", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "darren edge", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "darren edge is an author of a document published in arxiv", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "ha trinh", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "ha trinh is an author of a document published in arxiv", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "newman cheng", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "newman cheng is an author of a document published in arxiv", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "joshua bradley", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "joshua bradley is an author of a document published in arxiv", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "alex chao", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "alex chao is an author of a document published in arxiv", + "source_ids": [ + 206 + ] + }, + { + 
"src_entity_name": "apurva mody", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "apurva mody is an author of a document published in arxiv", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "steven truitt", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "steven truitt is an author of a document published in arxiv", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "jonathan larson", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "jonathan larson is an author of a document published in arxiv", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "from local to global a graph rag approach to query focused summarization", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 10.0, + "description": "the document from local to global a graph rag approach to query focused summarization is published in arxiv", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "from local to global a graph rag approach to query focused summarization", + "tgt_entity_name": "arxiv 2404 16130", + "relation_name": "", + "weight": 10.0, + "description": "the document from local to global a graph rag approach to query focused summarization is identified by the preprint number arxiv 2404 16130", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "2024", + "tgt_entity_name": "arxiv 2404 16130", + "relation_name": "", + "weight": 9.0, + "description": "the preprint arxiv 2404 16130 was published in 2024", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "graph rag", + "tgt_entity_name": "query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "graph rag is the approach used to solve the task of query focused summarization", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "from local to global a graph rag approach to query focused summarization", + "tgt_entity_name": "graph rag", + 
"relation_name": "", + "weight": 10.0, + "description": "the paper title explicitly names graph rag as the core approach discussed", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "from local to global a graph rag approach to query focused summarization", + "tgt_entity_name": "query focused summarization", + "relation_name": "", + "weight": 10.0, + "description": "the paper title explicitly names query focused summarization as the target task", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "graph rag", + "tgt_entity_name": "local", + "relation_name": "", + "weight": 7.0, + "description": "the graph rag approach is described as a transition from local to global implying it handles local data", + "source_ids": [ + 206 + ] + }, + { + "src_entity_name": "graph rag", + "tgt_entity_name": "global", + "relation_name": "", + "weight": 7.0, + "description": "the graph rag approach is described as a transition from local to global implying it handles global data", + "source_ids": [ + 206 + ] + } + ], + "node_idx": 206 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_207.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_207.json new file mode 100644 index 0000000..5c8dc17 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_207.json @@ -0,0 +1,655 @@ +{ + "entities": [ + { + "entity_name": "yunfan gao", + "entity_type": "PERSON", + "description": "yunfan gao is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ] + }, + { + "entity_name": "yun xiong", + "entity_type": "PERSON", + "description": "yun xiong is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ] + }, + { + "entity_name": "xinyu gao", + "entity_type": "PERSON", + "description": "xinyu gao is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ] + }, + { + "entity_name": 
"kangxiang jia", + "entity_type": "PERSON", + "description": "kangxiang jia is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ] + }, + { + "entity_name": "jinliu pan", + "entity_type": "PERSON", + "description": "jinliu pan is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ] + }, + { + "entity_name": "yuxi bi", + "entity_type": "PERSON", + "description": "yuxi bi is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ] + }, + { + "entity_name": "yi dai", + "entity_type": "PERSON", + "description": "yi dai is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ] + }, + { + "entity_name": "jiawei sun", + "entity_type": "PERSON", + "description": "jiawei sun is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ] + }, + { + "entity_name": "haofen wang", + "entity_type": "PERSON", + "description": "haofen wang is one of the authors of the 2023 survey on retrieval augmented generation", + "source_ids": [ + 207 + ] + }, + { + "entity_name": "2023", + "entity_type": "DATE", + "description": "2023 is the year the survey was published and the year associated with the arxiv preprint", + "source_ids": [ + 207 + ] + }, + { + "entity_name": "retrieval augmented generation for large language models a survey", + "entity_type": "BOOK", + "description": "retrieval augmented generation for large language models a survey is the title of the document authored by the listed individuals", + "source_ids": [ + 207 + ] + }, + { + "entity_name": "arxiv preprint arxiv 2312 10997", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv preprint arxiv 2312 10997 is the specific identifier and venue where the survey was published", + "source_ids": [ + 207 + ] + }, + { + "entity_name": "retrieval augmented generation", + "entity_type": 
"TECHNOLOGY", + "description": "retrieval augmented generation is the specific technology discussed in the survey", + "source_ids": [ + 207 + ] + }, + { + "entity_name": "large language models", + "entity_type": "TECHNOLOGY", + "description": "large language models are the subject of the survey and the technology being augmented", + "source_ids": [ + 207 + ] + }, + { + "entity_name": "arxiv", + "entity_type": "ORGANIZATION", + "description": "arxiv is the organization or platform hosting the preprint", + "source_ids": [ + 207 + ] + }, + { + "entity_name": "2312 10997", + "entity_type": "FILE_TYPE", + "description": "2312 10997 is the unique identifier code for the preprint document", + "source_ids": [ + 207 + ] + } + ], + "relations": [ + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "yunfan gao is an author of the survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yun xiong", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "yun xiong is an author of the survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "xinyu gao", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "xinyu gao is an author of the survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "kangxiang jia", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "kangxiang jia is an author of the survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "jinliu pan", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "jinliu pan is an 
author of the survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yuxi bi", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "yuxi bi is an author of the survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yi dai", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "yi dai is an author of the survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "jiawei sun", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "jiawei sun is an author of the survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "haofen wang", + "tgt_entity_name": "retrieval augmented generation for large language models a survey", + "relation_name": "", + "weight": 10.0, + "description": "haofen wang is an author of the survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "retrieval augmented generation for large language models a survey", + "tgt_entity_name": "2023", + "relation_name": "", + "weight": 9.0, + "description": "the survey was published in the year 2023", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "retrieval augmented generation for large language models a survey", + "tgt_entity_name": "arxiv preprint arxiv 2312 10997", + "relation_name": "", + "weight": 10.0, + "description": "the survey is identified as the arxiv preprint with the number 2312 10997", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "yun xiong", + "relation_name": "", + "weight": 8.0, + "description": "yunfan gao and yun xiong are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "xinyu gao", + "relation_name": "", + "weight": 
8.0, + "description": "yunfan gao and xinyu gao are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "kangxiang jia", + "relation_name": "", + "weight": 8.0, + "description": "yunfan gao and kangxiang jia are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "jinliu pan", + "relation_name": "", + "weight": 8.0, + "description": "yunfan gao and jinliu pan are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "yuxi bi", + "relation_name": "", + "weight": 8.0, + "description": "yunfan gao and yuxi bi are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "yi dai", + "relation_name": "", + "weight": 8.0, + "description": "yunfan gao and yi dai are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "jiawei sun", + "relation_name": "", + "weight": 8.0, + "description": "yunfan gao and jiawei sun are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yunfan gao", + "tgt_entity_name": "haofen wang", + "relation_name": "", + "weight": 8.0, + "description": "yunfan gao and haofen wang are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yun xiong", + "tgt_entity_name": "xinyu gao", + "relation_name": "", + "weight": 8.0, + "description": "yun xiong and xinyu gao are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yun xiong", + "tgt_entity_name": "kangxiang jia", + "relation_name": "", + "weight": 8.0, + "description": "yun xiong and kangxiang jia are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yun xiong", + "tgt_entity_name": "jinliu pan", 
+ "relation_name": "", + "weight": 8.0, + "description": "yun xiong and jinliu pan are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yun xiong", + "tgt_entity_name": "yuxi bi", + "relation_name": "", + "weight": 8.0, + "description": "yun xiong and yuxi bi are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yun xiong", + "tgt_entity_name": "yi dai", + "relation_name": "", + "weight": 8.0, + "description": "yun xiong and yi dai are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yun xiong", + "tgt_entity_name": "jiawei sun", + "relation_name": "", + "weight": 8.0, + "description": "yun xiong and jiawei sun are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yun xiong", + "tgt_entity_name": "haofen wang", + "relation_name": "", + "weight": 8.0, + "description": "yun xiong and haofen wang are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "xinyu gao", + "tgt_entity_name": "kangxiang jia", + "relation_name": "", + "weight": 8.0, + "description": "xinyu gao and kangxiang jia are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "xinyu gao", + "tgt_entity_name": "jinliu pan", + "relation_name": "", + "weight": 8.0, + "description": "xinyu gao and jinliu pan are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "xinyu gao", + "tgt_entity_name": "yuxi bi", + "relation_name": "", + "weight": 8.0, + "description": "xinyu gao and yuxi bi are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "xinyu gao", + "tgt_entity_name": "yi dai", + "relation_name": "", + "weight": 8.0, + "description": "xinyu gao and yi dai are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "xinyu gao", + "tgt_entity_name": "jiawei 
sun", + "relation_name": "", + "weight": 8.0, + "description": "xinyu gao and jiawei sun are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "xinyu gao", + "tgt_entity_name": "haofen wang", + "relation_name": "", + "weight": 8.0, + "description": "xinyu gao and haofen wang are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "kangxiang jia", + "tgt_entity_name": "jinliu pan", + "relation_name": "", + "weight": 8.0, + "description": "kangxiang jia and jinliu pan are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "kangxiang jia", + "tgt_entity_name": "yuxi bi", + "relation_name": "", + "weight": 8.0, + "description": "kangxiang jia and yuxi bi are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "kangxiang jia", + "tgt_entity_name": "yi dai", + "relation_name": "", + "weight": 8.0, + "description": "kangxiang jia and yi dai are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "kangxiang jia", + "tgt_entity_name": "jiawei sun", + "relation_name": "", + "weight": 8.0, + "description": "kangxiang jia and jiawei sun are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "kangxiang jia", + "tgt_entity_name": "haofen wang", + "relation_name": "", + "weight": 8.0, + "description": "kangxiang jia and haofen wang are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "jinliu pan", + "tgt_entity_name": "yuxi bi", + "relation_name": "", + "weight": 8.0, + "description": "jinliu pan and yuxi bi are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "jinliu pan", + "tgt_entity_name": "yi dai", + "relation_name": "", + "weight": 8.0, + "description": "jinliu pan and yi dai are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + 
"src_entity_name": "jinliu pan", + "tgt_entity_name": "jiawei sun", + "relation_name": "", + "weight": 8.0, + "description": "jinliu pan and jiawei sun are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "jinliu pan", + "tgt_entity_name": "haofen wang", + "relation_name": "", + "weight": 8.0, + "description": "jinliu pan and haofen wang are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yuxi bi", + "tgt_entity_name": "yi dai", + "relation_name": "", + "weight": 8.0, + "description": "yuxi bi and yi dai are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yuxi bi", + "tgt_entity_name": "jiawei sun", + "relation_name": "", + "weight": 8.0, + "description": "yuxi bi and jiawei sun are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yuxi bi", + "tgt_entity_name": "haofen wang", + "relation_name": "", + "weight": 8.0, + "description": "yuxi bi and haofen wang are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yi dai", + "tgt_entity_name": "jiawei sun", + "relation_name": "", + "weight": 8.0, + "description": "yi dai and jiawei sun are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "yi dai", + "tgt_entity_name": "haofen wang", + "relation_name": "", + "weight": 8.0, + "description": "yi dai and haofen wang are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "jiawei sun", + "tgt_entity_name": "haofen wang", + "relation_name": "", + "weight": 8.0, + "description": "jiawei sun and haofen wang are co authors of the same survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "large language models", + "relation_name": "", + "weight": 10.0, + "description": "retrieval augmented generation is applied to large language 
models as described in the survey", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "retrieval augmented generation for large language models a survey", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 9.0, + "description": "the survey is about the technology of retrieval augmented generation", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "retrieval augmented generation for large language models a survey", + "tgt_entity_name": "large language models", + "relation_name": "", + "weight": 9.0, + "description": "the survey covers the topic of large language models", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "arxiv preprint arxiv 2312 10997", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 10.0, + "description": "the preprint is hosted by arxiv", + "source_ids": [ + 207 + ] + }, + { + "src_entity_name": "arxiv preprint arxiv 2312 10997", + "tgt_entity_name": "2312 10997", + "relation_name": "", + "weight": 10.0, + "description": "2312 10997 is the specific identifier for the arxiv preprint", + "source_ids": [ + 207 + ] + } + ], + "node_idx": 207 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_208.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_208.json new file mode 100644 index 0000000..fc6d5f9 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_208.json @@ -0,0 +1,353 @@ +{ + "entities": [ + { + "entity_name": "zirui guo", + "entity_type": "PERSON", + "description": "zirui guo is listed as an author of the paper titled lightrag", + "source_ids": [ + 208 + ] + }, + { + "entity_name": "lianghao xia", + "entity_type": "PERSON", + "description": "lianghao xia is listed as an author of the paper titled lightrag", + "source_ids": [ + 208 + ] + }, + { + "entity_name": "yanhua yu", + "entity_type": "PERSON", + "description": "yanhua yu is listed as an author of the paper titled lightrag", + "source_ids": [ + 
208 + ] + }, + { + "entity_name": "tu ao", + "entity_type": "PERSON", + "description": "tu ao is listed as an author of the paper titled lightrag", + "source_ids": [ + 208 + ] + }, + { + "entity_name": "chao huang", + "entity_type": "PERSON", + "description": "chao huang is listed as an author of the paper titled lightrag", + "source_ids": [ + 208 + ] + }, + { + "entity_name": "lightrag", + "entity_type": "PRODUCT", + "description": "lightrag is a retrieval augmented generation system described as simple and fast", + "source_ids": [ + 208 + ] + }, + { + "entity_name": "arxiv e prints", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv e prints is the publication venue where the paper was released in 2024", + "source_ids": [ + 208 + ] + }, + { + "entity_name": "2024", + "entity_type": "DATE", + "description": "2024 is the year the paper was published and the date associated with the arxiv identifier", + "source_ids": [ + 208 + ] + }, + { + "entity_name": "arxiv2410", + "entity_type": "FILE_TYPE", + "description": "arxiv2410 is the specific identifier for the paper on arxiv", + "source_ids": [ + 208 + ] + }, + { + "entity_name": "retrieval augmented generation", + "entity_type": "TECHNOLOGY", + "description": "retrieval augmented generation is the technology category that lightrag belongs to as described in the text", + "source_ids": [ + 208 + ] + }, + { + "entity_name": "simple", + "entity_type": "CONCEPT", + "description": "simple is an attribute used to describe the lightrag system", + "source_ids": [ + 208 + ] + }, + { + "entity_name": "fast", + "entity_type": "CONCEPT", + "description": "fast is an attribute used to describe the lightrag system", + "source_ids": [ + 208 + ] + } + ], + "relations": [ + { + "src_entity_name": "zirui guo", + "tgt_entity_name": "lightrag", + "relation_name": "", + "weight": 10.0, + "description": "zirui guo is an author of the lightrag paper", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "lianghao xia", + 
"tgt_entity_name": "lightrag", + "relation_name": "", + "weight": 10.0, + "description": "lianghao xia is an author of the lightrag paper", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "yanhua yu", + "tgt_entity_name": "lightrag", + "relation_name": "", + "weight": 10.0, + "description": "yanhua yu is an author of the lightrag paper", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "tu ao", + "tgt_entity_name": "lightrag", + "relation_name": "", + "weight": 10.0, + "description": "tu ao is an author of the lightrag paper", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "chao huang", + "tgt_entity_name": "lightrag", + "relation_name": "", + "weight": 10.0, + "description": "chao huang is an author of the lightrag paper", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "lightrag", + "tgt_entity_name": "arxiv e prints", + "relation_name": "", + "weight": 9.0, + "description": "lightrag was published in arxiv e prints", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "lightrag", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "lightrag was published in the year 2024", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "lightrag", + "tgt_entity_name": "arxiv2410", + "relation_name": "", + "weight": 9.0, + "description": "lightrag is identified by the arxiv identifier arxiv2410", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "zirui guo", + "tgt_entity_name": "lianghao xia", + "relation_name": "", + "weight": 8.0, + "description": "zirui guo and lianghao xia are co authors on the same paper", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "zirui guo", + "tgt_entity_name": "tu ao", + "relation_name": "", + "weight": 8.0, + "description": "zirui guo and tu ao are co authors on the same paper", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "zirui guo", + "tgt_entity_name": "chao huang", + "relation_name": "", + "weight": 8.0, + "description": 
"zirui guo and chao huang are co authors on the same paper", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "lianghao xia", + "tgt_entity_name": "yanhua yu", + "relation_name": "", + "weight": 8.0, + "description": "lianghao xia and yanhua yu are co authors on the same paper", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "lianghao xia", + "tgt_entity_name": "tu ao", + "relation_name": "", + "weight": 8.0, + "description": "lianghao xia and tu ao are co authors on the same paper", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "lianghao xia", + "tgt_entity_name": "chao huang", + "relation_name": "", + "weight": 8.0, + "description": "lianghao xia and chao huang are co authors on the same paper", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "yanhua yu", + "tgt_entity_name": "tu ao", + "relation_name": "", + "weight": 8.0, + "description": "yanhua yu and tu ao are co authors on the same paper", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "yanhua yu", + "tgt_entity_name": "chao huang", + "relation_name": "", + "weight": 8.0, + "description": "yanhua yu and chao huang are co authors on the same paper", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "tu ao", + "tgt_entity_name": "chao huang", + "relation_name": "", + "weight": 8.0, + "description": "tu ao and chao huang are co authors on the same paper", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "lightrag", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 10.0, + "description": "lightrag is a type of retrieval augmented generation system", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "lightrag", + "tgt_entity_name": "simple", + "relation_name": "", + "weight": 8.0, + "description": "lightrag is described as being simple", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "lightrag", + "tgt_entity_name": "fast", + "relation_name": "", + "weight": 8.0, + "description": 
"lightrag is described as being fast", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "zirui guo", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 7.0, + "description": "zirui guo is an author of a paper about retrieval augmented generation", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "lianghao xia", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 7.0, + "description": "lianghao xia is an author of a paper about retrieval augmented generation", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "yanhua yu", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 7.0, + "description": "yanhua yu is an author of a paper about retrieval augmented generation", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "tu ao", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 7.0, + "description": "tu ao is an author of a paper about retrieval augmented generation", + "source_ids": [ + 208 + ] + }, + { + "src_entity_name": "chao huang", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 7.0, + "description": "chao huang is an author of a paper about retrieval augmented generation", + "source_ids": [ + 208 + ] + } + ], + "node_idx": 208 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_209.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_209.json new file mode 100644 index 0000000..32872aa --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_209.json @@ -0,0 +1,285 @@ +{ + "entities": [ + { + "entity_name": "bernal jim nez guti rrez", + "entity_type": "PERSON", + "description": "bernal jim nez guti rrez is one of the authors of the paper titled hipporag", + "source_ids": [ + 209 + ] + }, + { + "entity_name": "yiheng shu", + "entity_type": "PERSON", + "description": "yiheng shu 
is one of the authors of the paper titled hipporag", + "source_ids": [ + 209 + ] + }, + { + "entity_name": "yu gu", + "entity_type": "PERSON", + "description": "yu gu is one of the authors of the paper titled hipporag", + "source_ids": [ + 209 + ] + }, + { + "entity_name": "michihiro yasunaga", + "entity_type": "PERSON", + "description": "michihiro yasunaga is one of the authors of the paper titled hipporag", + "source_ids": [ + 209 + ] + }, + { + "entity_name": "yu su", + "entity_type": "PERSON", + "description": "yu su is one of the authors of the paper titled hipporag", + "source_ids": [ + 209 + ] + }, + { + "entity_name": "hipporag", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "hipporag is a neurobiologically inspired long term memory system designed for large language models", + "source_ids": [ + 209 + ] + }, + { + "entity_name": "arxiv", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv is the venue where the preprint of the paper was published", + "source_ids": [ + 209 + ] + }, + { + "entity_name": "2024", + "entity_type": "DATE", + "description": "2024 is the year the paper was published", + "source_ids": [ + 209 + ] + }, + { + "entity_name": "arxiv 2405 14831", + "entity_type": "FILE_TYPE", + "description": "arxiv 2405 14831 is the specific identifier for the preprint document", + "source_ids": [ + 209 + ] + }, + { + "entity_name": "large language models", + "entity_type": "PRODUCT", + "description": "large language models are the target systems for which hipporag is designed as a memory solution", + "source_ids": [ + 209 + ] + }, + { + "entity_name": "neurobiologically inspired long term memory", + "entity_type": "TASK_OR_PROBLEM", + "description": "neurobiologically inspired long term memory is the specific problem domain or concept that hipporag addresses", + "source_ids": [ + 209 + ] + } + ], + "relations": [ + { + "src_entity_name": "bernal jim nez guti rrez", + "tgt_entity_name": "hipporag", + "relation_name": "", + 
"weight": 9.0, + "description": "bernal jim nez guti rrez is an author of the paper describing hipporag", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "yiheng shu", + "tgt_entity_name": "hipporag", + "relation_name": "", + "weight": 9.0, + "description": "yiheng shu is an author of the paper describing hipporag", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "yu gu", + "tgt_entity_name": "hipporag", + "relation_name": "", + "weight": 9.0, + "description": "yu gu is an author of the paper describing hipporag", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "michihiro yasunaga", + "tgt_entity_name": "hipporag", + "relation_name": "", + "weight": 9.0, + "description": "michihiro yasunaga is an author of the paper describing hipporag", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "yu su", + "tgt_entity_name": "hipporag", + "relation_name": "", + "weight": 9.0, + "description": "yu su is an author of the paper describing hipporag", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "hipporag", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 10.0, + "description": "the paper describing hipporag was published as a preprint on arxiv", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "hipporag", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 10.0, + "description": "the paper describing hipporag was published in the year 2024", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "bernal jim nez guti rrez", + "tgt_entity_name": "yiheng shu", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "bernal jim nez guti rrez", + "tgt_entity_name": "yu gu", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "bernal jim nez guti rrez", + "tgt_entity_name": "michihiro 
yasunaga", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "bernal jim nez guti rrez", + "tgt_entity_name": "yu su", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "yiheng shu", + "tgt_entity_name": "yu gu", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "yiheng shu", + "tgt_entity_name": "michihiro yasunaga", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "yiheng shu", + "tgt_entity_name": "yu su", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "yu gu", + "tgt_entity_name": "michihiro yasunaga", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "yu gu", + "tgt_entity_name": "yu su", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "michihiro yasunaga", + "tgt_entity_name": "yu su", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "hipporag", + "tgt_entity_name": "large language models", + "relation_name": "", + "weight": 10.0, + "description": "hipporag is explicitly designed to provide long term memory capabilities for large language models", + "source_ids": [ + 209 + ] + }, + { + "src_entity_name": "hipporag", + "tgt_entity_name": "neurobiologically inspired long term memory", + 
"relation_name": "", + "weight": 10.0, + "description": "hipporag is defined as a system for neurobiologically inspired long term memory", + "source_ids": [ + 209 + ] + } + ], + "node_idx": 209 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_21.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_21.json new file mode 100644 index 0000000..517288c --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_21.json @@ -0,0 +1,205 @@ +{ + "entities": [ + { + "entity_name": "kg", + "entity_type": "CONCEPT", + "description": "kg refers to a knowledge graph which is a data structure used for multi hop reasoning", + "source_ids": [ + 21 + ] + }, + { + "entity_name": "llm", + "entity_type": "PRODUCT", + "description": "llm is an example of a distinct entity name mentioned in the context of entity ambiguity", + "source_ids": [ + 21 + ] + }, + { + "entity_name": "large language model", + "entity_type": "PRODUCT", + "description": "large language model is an example of a distinct entity name mentioned in the context of entity ambiguity", + "source_ids": [ + 21 + ] + }, + { + "entity_name": "gradient based entity resolution method", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the gradient based entity resolution method is a novel approach proposed to address entity ambiguity by analyzing similarity distributions", + "source_ids": [ + 21 + ] + }, + { + "entity_name": "multi hop reasoning", + "entity_type": "TASK_OR_PROBLEM", + "description": "multi hop reasoning is a task that relies on a high quality knowledge graph", + "source_ids": [ + 21 + ] + }, + { + "entity_name": "entity ambiguity", + "entity_type": "TASK_OR_PROBLEM", + "description": "entity ambiguity is a problem where distinct entities share similar names compromising the knowledge graph", + "source_ids": [ + 21 + ] + }, + { + "entity_name": "similarity distribution", + "entity_type": "CONCEPT", + "description": "similarity distribution is the 
data pattern analyzed by the proposed method to identify sharp drops in scores", + "source_ids": [ + 21 + ] + }, + { + "entity_name": "candidate entities", + "entity_type": "CONCEPT", + "description": "candidate entities are the potential matches analyzed to distinguish and merge coreferent entities", + "source_ids": [ + 21 + ] + }, + { + "entity_name": "coreferent entities", + "entity_type": "CONCEPT", + "description": "coreferent entities are distinct entities that refer to the same real world object and need to be merged", + "source_ids": [ + 21 + ] + }, + { + "entity_name": "graph connectivity", + "entity_type": "CONCEPT", + "description": "graph connectivity is a property of the knowledge graph that is ensured by the proposed method", + "source_ids": [ + 21 + ] + }, + { + "entity_name": "reasoning capabilities", + "entity_type": "CONCEPT", + "description": "reasoning capabilities are the skills of the system that are enhanced by the proposed method", + "source_ids": [ + 21 + ] + } + ], + "relations": [ + { + "src_entity_name": "kg", + "tgt_entity_name": "gradient based entity resolution method", + "relation_name": "", + "weight": 9.0, + "description": "the gradient based entity resolution method is proposed to ensure the high quality of the kg by resolving entity ambiguity", + "source_ids": [ + 21 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "large language model", + "relation_name": "", + "weight": 8.0, + "description": "llm and large language model are cited as examples of distinct entities that cause ambiguity in the kg", + "source_ids": [ + 21 + ] + }, + { + "src_entity_name": "multi hop reasoning", + "tgt_entity_name": "kg", + "relation_name": "", + "weight": 10.0, + "description": "multi hop reasoning relies on a high quality kg for its execution", + "source_ids": [ + 21 + ] + }, + { + "src_entity_name": "kg", + "tgt_entity_name": "entity ambiguity", + "relation_name": "", + "weight": 9.0, + "description": "entity ambiguity compromises 
the quality of the kg", + "source_ids": [ + 21 + ] + }, + { + "src_entity_name": "gradient based entity resolution method", + "tgt_entity_name": "similarity distribution", + "relation_name": "", + "weight": 9.0, + "description": "the method analyzes the similarity distribution of candidate entities to function", + "source_ids": [ + 21 + ] + }, + { + "src_entity_name": "gradient based entity resolution method", + "tgt_entity_name": "candidate entities", + "relation_name": "", + "weight": 9.0, + "description": "the method analyzes candidate entities to identify sharp drops in similarity scores", + "source_ids": [ + 21 + ] + }, + { + "src_entity_name": "gradient based entity resolution method", + "tgt_entity_name": "coreferent entities", + "relation_name": "", + "weight": 10.0, + "description": "the method distinguishes and merges coreferent entities", + "source_ids": [ + 21 + ] + }, + { + "src_entity_name": "gradient based entity resolution method", + "tgt_entity_name": "graph connectivity", + "relation_name": "", + "weight": 8.0, + "description": "the method ensures graph connectivity by resolving entity ambiguity", + "source_ids": [ + 21 + ] + }, + { + "src_entity_name": "gradient based entity resolution method", + "tgt_entity_name": "reasoning capabilities", + "relation_name": "", + "weight": 8.0, + "description": "the method enhances reasoning capabilities by improving the kg", + "source_ids": [ + 21 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "entity ambiguity", + "relation_name": "", + "weight": 7.0, + "description": "llm is an example of a name that contributes to entity ambiguity", + "source_ids": [ + 21 + ] + }, + { + "src_entity_name": "large language model", + "tgt_entity_name": "entity ambiguity", + "relation_name": "", + "weight": 7.0, + "description": "large language model is an example of a name that contributes to entity ambiguity", + "source_ids": [ + 21 + ] + } + ], + "node_idx": 21 +} \ No newline at end of file diff --git 
a/e2e_test_output/kg_extractor_res/kg_extractor_res_210.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_210.json new file mode 100644 index 0000000..64d74e8 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_210.json @@ -0,0 +1,105 @@ +{ + "entities": [ + { + "entity_name": "taher h haveliwala", + "entity_type": "PERSON", + "description": "taher h haveliwala is the author of the paper titled topic sensitive pagerank", + "source_ids": [ + 210 + ] + }, + { + "entity_name": "2002", + "entity_type": "DATE", + "description": "2002 is the year the paper topic sensitive pagerank was published", + "source_ids": [ + 210 + ] + }, + { + "entity_name": "topic sensitive pagerank", + "entity_type": "TECHNOLOGY", + "description": "topic sensitive pagerank is the title of a paper presented at a conference", + "source_ids": [ + 210 + ] + }, + { + "entity_name": "11th international conference on world wide web", + "entity_type": "EVENT", + "description": "the 11th international conference on world wide web is the venue where the paper was presented", + "source_ids": [ + 210 + ] + }, + { + "entity_name": "world wide web", + "entity_type": "TECHNOLOGY", + "description": "world wide web is the technology platform associated with the conference where the paper was presented", + "source_ids": [ + 210 + ] + }, + { + "entity_name": "517 526", + "entity_type": "MEASUREMENT", + "description": "517 526 represents the page range of the paper in the conference proceedings", + "source_ids": [ + 210 + ] + } + ], + "relations": [ + { + "src_entity_name": "taher h haveliwala", + "tgt_entity_name": "topic sensitive pagerank", + "relation_name": "", + "weight": 10.0, + "description": "taher h haveliwala is the author of the paper topic sensitive pagerank", + "source_ids": [ + 210 + ] + }, + { + "src_entity_name": "topic sensitive pagerank", + "tgt_entity_name": "11th international conference on world wide web", + "relation_name": "", + "weight": 9.0, + "description": "the 
paper topic sensitive pagerank was presented at the 11th international conference on world wide web", + "source_ids": [ + 210 + ] + }, + { + "src_entity_name": "taher h haveliwala", + "tgt_entity_name": "2002", + "relation_name": "", + "weight": 8.0, + "description": "taher h haveliwala published the paper in the year 2002", + "source_ids": [ + 210 + ] + }, + { + "src_entity_name": "11th international conference on world wide web", + "tgt_entity_name": "world wide web", + "relation_name": "", + "weight": 9.0, + "description": "the conference is named after and focused on the world wide web technology", + "source_ids": [ + 210 + ] + }, + { + "src_entity_name": "topic sensitive pagerank", + "tgt_entity_name": "517 526", + "relation_name": "", + "weight": 8.0, + "description": "the paper topic sensitive pagerank spans pages 517 to 526 in the proceedings", + "source_ids": [ + 210 + ] + } + ], + "node_idx": 210 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_211.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_211.json new file mode 100644 index 0000000..d1037ab --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_211.json @@ -0,0 +1,575 @@ +{ + "entities": [ + { + "entity_name": "xiaoxin he", + "entity_type": "PERSON", + "description": "xiaoxin he is listed as one of the authors of the paper", + "source_ids": [ + 211 + ] + }, + { + "entity_name": "yijun tian", + "entity_type": "PERSON", + "description": "yijun tian is listed as one of the authors of the paper", + "source_ids": [ + 211 + ] + }, + { + "entity_name": "yifei sun", + "entity_type": "PERSON", + "description": "yifei sun is listed as one of the authors of the paper", + "source_ids": [ + 211 + ] + }, + { + "entity_name": "nitesh v chawla", + "entity_type": "PERSON", + "description": "nitesh v chawla is listed as one of the authors of the paper", + "source_ids": [ + 211 + ] + }, + { + "entity_name": "thomas laurent", + "entity_type": 
"PERSON", + "description": "thomas laurent is listed as one of the authors of the paper", + "source_ids": [ + 211 + ] + }, + { + "entity_name": "yann lecun", + "entity_type": "PERSON", + "description": "yann lecun is listed as one of the authors of the paper", + "source_ids": [ + 211 + ] + }, + { + "entity_name": "xavier bresson", + "entity_type": "PERSON", + "description": "xavier bresson is listed as one of the authors of the paper", + "source_ids": [ + 211 + ] + }, + { + "entity_name": "bryan hooi", + "entity_type": "PERSON", + "description": "bryan hooi is listed as one of the authors of the paper", + "source_ids": [ + 211 + ] + }, + { + "entity_name": "g retriever", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "g retriever is a retrieval augmented generation model for textual graph understanding and question answering", + "source_ids": [ + 211 + ] + }, + { + "entity_name": "2024", + "entity_type": "DATE", + "description": "2024 is the year the paper was published and the year associated with the arxiv preprint", + "source_ids": [ + 211 + ] + }, + { + "entity_name": "arxiv 2402 07630", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv 2402 07630 is the identifier for the preprint publication", + "source_ids": [ + 211 + ] + }, + { + "entity_name": "arxiv", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv is the platform where the preprint is hosted", + "source_ids": [ + 211 + ] + }, + { + "entity_name": "retrieval augmented generation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "retrieval augmented generation is the technique used by g retriever for textual graph understanding", + "source_ids": [ + 211 + ] + }, + { + "entity_name": "textual graph understanding", + "entity_type": "TASK_OR_PROBLEM", + "description": "textual graph understanding is a specific task addressed by the g retriever model", + "source_ids": [ + 211 + ] + }, + { + "entity_name": "question answering", + "entity_type": "TASK_OR_PROBLEM", + 
"description": "question answering is a specific task addressed by the g retriever model", + "source_ids": [ + 211 + ] + }, + { + "entity_name": "arxiv preprint", + "entity_type": "FILE_TYPE", + "description": "arxiv preprint is the type of document in which the work was published", + "source_ids": [ + 211 + ] + } + ], + "relations": [ + { + "src_entity_name": "xiaoxin he", + "tgt_entity_name": "yijun tian", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "xiaoxin he", + "tgt_entity_name": "yifei sun", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "xiaoxin he", + "tgt_entity_name": "nitesh v chawla", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "xiaoxin he", + "tgt_entity_name": "thomas laurent", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "xiaoxin he", + "tgt_entity_name": "yann lecun", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "xiaoxin he", + "tgt_entity_name": "xavier bresson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "xiaoxin he", + "tgt_entity_name": "bryan hooi", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "yijun tian", + "tgt_entity_name": "yifei sun", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + 
{ + "src_entity_name": "yijun tian", + "tgt_entity_name": "nitesh v chawla", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "yijun tian", + "tgt_entity_name": "thomas laurent", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "yijun tian", + "tgt_entity_name": "yann lecun", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "yijun tian", + "tgt_entity_name": "xavier bresson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "yijun tian", + "tgt_entity_name": "bryan hooi", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "yifei sun", + "tgt_entity_name": "nitesh v chawla", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "yifei sun", + "tgt_entity_name": "thomas laurent", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "yifei sun", + "tgt_entity_name": "yann lecun", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "yifei sun", + "tgt_entity_name": "xavier bresson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "yifei sun", + "tgt_entity_name": "bryan hooi", + "relation_name": "", + "weight": 9.0, + 
"description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "nitesh v chawla", + "tgt_entity_name": "thomas laurent", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "nitesh v chawla", + "tgt_entity_name": "yann lecun", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "nitesh v chawla", + "tgt_entity_name": "xavier bresson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "nitesh v chawla", + "tgt_entity_name": "bryan hooi", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "thomas laurent", + "tgt_entity_name": "yann lecun", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "thomas laurent", + "tgt_entity_name": "xavier bresson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "thomas laurent", + "tgt_entity_name": "bryan hooi", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "yann lecun", + "tgt_entity_name": "xavier bresson", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "yann lecun", + "tgt_entity_name": "bryan hooi", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + 
"src_entity_name": "xavier bresson", + "tgt_entity_name": "bryan hooi", + "relation_name": "", + "weight": 9.0, + "description": "listed as co authors on the same paper", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "g retriever", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 8.0, + "description": "g retriever was published in the year 2024", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "g retriever", + "tgt_entity_name": "arxiv 2402 07630", + "relation_name": "", + "weight": 9.0, + "description": "g retriever is identified by the preprint number arxiv 2402 07630", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "g retriever", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "g retriever is published on the arxiv platform", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "xiaoxin he", + "tgt_entity_name": "g retriever", + "relation_name": "", + "weight": 10.0, + "description": "xiaoxin he is an author of the paper describing g retriever", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "yijun tian", + "tgt_entity_name": "g retriever", + "relation_name": "", + "weight": 10.0, + "description": "yijun tian is an author of the paper describing g retriever", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "yifei sun", + "tgt_entity_name": "g retriever", + "relation_name": "", + "weight": 10.0, + "description": "yifei sun is an author of the paper describing g retriever", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "nitesh v chawla", + "tgt_entity_name": "g retriever", + "relation_name": "", + "weight": 10.0, + "description": "nitesh v chawla is an author of the paper describing g retriever", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "thomas laurent", + "tgt_entity_name": "g retriever", + "relation_name": "", + "weight": 10.0, + "description": "thomas laurent is an author of the paper describing g retriever", + 
"source_ids": [ + 211 + ] + }, + { + "src_entity_name": "yann lecun", + "tgt_entity_name": "g retriever", + "relation_name": "", + "weight": 10.0, + "description": "yann lecun is an author of the paper describing g retriever", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "xavier bresson", + "tgt_entity_name": "g retriever", + "relation_name": "", + "weight": 10.0, + "description": "xavier bresson is an author of the paper describing g retriever", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "bryan hooi", + "tgt_entity_name": "g retriever", + "relation_name": "", + "weight": 10.0, + "description": "bryan hooi is an author of the paper describing g retriever", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "g retriever", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 10.0, + "description": "g retriever utilizes the retrieval augmented generation method", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "g retriever", + "tgt_entity_name": "textual graph understanding", + "relation_name": "", + "weight": 10.0, + "description": "g retriever is designed to solve the problem of textual graph understanding", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "g retriever", + "tgt_entity_name": "question answering", + "relation_name": "", + "weight": 10.0, + "description": "g retriever is designed to solve the problem of question answering", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "arxiv 2402 07630", + "tgt_entity_name": "arxiv preprint", + "relation_name": "", + "weight": 9.0, + "description": "arxiv 2402 07630 is an instance of an arxiv preprint", + "source_ids": [ + 211 + ] + }, + { + "src_entity_name": "g retriever", + "tgt_entity_name": "arxiv preprint", + "relation_name": "", + "weight": 9.0, + "description": "g retriever is published as an arxiv preprint", + "source_ids": [ + 211 + ] + } + ], + "node_idx": 211 +} \ No newline at end of file diff --git 
a/e2e_test_output/kg_extractor_res/kg_extractor_res_212.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_212.json new file mode 100644 index 0000000..c24ec86 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_212.json @@ -0,0 +1,141 @@ +{ + "entities": [ + { + "entity_name": "yucheng hu", + "entity_type": "PERSON", + "description": "yucheng hu is one of the authors of the 2024 survey on retrieval augmented language models", + "source_ids": [ + 212 + ] + }, + { + "entity_name": "yuxing lu", + "entity_type": "PERSON", + "description": "yuxing lu is one of the authors of the 2024 survey on retrieval augmented language models", + "source_ids": [ + 212 + ] + }, + { + "entity_name": "rag and rau a survey on retrieval augmented language model in natural language processing", + "entity_type": "BOOK", + "description": "rag and rau is the title of a survey paper published in 2024", + "source_ids": [ + 212 + ] + }, + { + "entity_name": "arxiv", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv is the preprint server where the survey paper was published", + "source_ids": [ + 212 + ] + }, + { + "entity_name": "2024", + "entity_type": "DATE", + "description": "2024 is the year the survey paper was published", + "source_ids": [ + 212 + ] + }, + { + "entity_name": "natural language processing", + "entity_type": "RESEARCH_FIELD", + "description": "natural language processing is the field of study addressed by the survey paper", + "source_ids": [ + 212 + ] + }, + { + "entity_name": "arxiv 2404 19543", + "entity_type": "PRODUCT", + "description": "arxiv 2404 19543 is the specific identifier for the preprint paper mentioned in the text", + "source_ids": [ + 212 + ] + }, + { + "entity_name": "retrieval augmented language model", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "retrieval augmented language model is the specific technology subject of the survey", + "source_ids": [ + 212 + ] + } + ], + "relations": [ + { + 
"src_entity_name": "yucheng hu", + "tgt_entity_name": "rag and rau a survey on retrieval augmented language model in natural language processing", + "relation_name": "", + "weight": 10.0, + "description": "yucheng hu is an author of the survey paper", + "source_ids": [ + 212 + ] + }, + { + "src_entity_name": "yuxing lu", + "tgt_entity_name": "rag and rau a survey on retrieval augmented language model in natural language processing", + "relation_name": "", + "weight": 10.0, + "description": "yuxing lu is an author of the survey paper", + "source_ids": [ + 212 + ] + }, + { + "src_entity_name": "rag and rau a survey on retrieval augmented language model in natural language processing", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "the survey paper was published on the arxiv preprint server", + "source_ids": [ + 212 + ] + }, + { + "src_entity_name": "rag and rau a survey on retrieval augmented language model in natural language processing", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "the survey paper was published in the year 2024", + "source_ids": [ + 212 + ] + }, + { + "src_entity_name": "rag and rau a survey on retrieval augmented language model in natural language processing", + "tgt_entity_name": "natural language processing", + "relation_name": "", + "weight": 8.0, + "description": "the survey paper focuses on the research field of natural language processing", + "source_ids": [ + 212 + ] + }, + { + "src_entity_name": "rag and rau a survey on retrieval augmented language model in natural language processing", + "tgt_entity_name": "arxiv 2404 19543", + "relation_name": "", + "weight": 10.0, + "description": "the survey paper is identified by the preprint number arxiv 2404 19543", + "source_ids": [ + 212 + ] + }, + { + "src_entity_name": "rag and rau a survey on retrieval augmented language model in natural language processing", + "tgt_entity_name": "retrieval augmented language 
model", + "relation_name": "", + "weight": 9.0, + "description": "the survey paper is about the retrieval augmented language model technology", + "source_ids": [ + 212 + ] + } + ], + "node_idx": 212 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_213.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_213.json new file mode 100644 index 0000000..db0ba35 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_213.json @@ -0,0 +1,187 @@ +{ + "entities": [ + { + "entity_name": "soyeong jeong", + "entity_type": "PERSON", + "description": "soyeong jeong is an author of the 2024 arxiv preprint titled adaptive rag", + "source_ids": [ + 213 + ] + }, + { + "entity_name": "jinheon baek", + "entity_type": "PERSON", + "description": "jinheon baek is an author of the 2024 arxiv preprint titled adaptive rag", + "source_ids": [ + 213 + ] + }, + { + "entity_name": "adaptive rag", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "adaptive rag is a model described as learning to adapt retrieval augmented large language models through question complexity", + "source_ids": [ + 213 + ] + }, + { + "entity_name": "retrieval augmented large language models", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "retrieval augmented large language models are the subject of adaptation in the adaptive rag study", + "source_ids": [ + 213 + ] + }, + { + "entity_name": "arxiv", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv is the venue where the preprint arxiv 2403 14403 was published", + "source_ids": [ + 213 + ] + }, + { + "entity_name": "2024", + "entity_type": "DATE", + "description": "2024 is the year the preprint was published and the year associated with the authors work", + "source_ids": [ + 213 + ] + }, + { + "entity_name": "arxiv 2403 14403", + "entity_type": "PUBLICATION_VENUE", + "description": "arxiv 2403 14403 is the specific identifier for the preprint document", + "source_ids": [ + 
213 + ] + }, + { + "entity_name": "et al", + "entity_type": "PERSON", + "description": "et al refers to additional authors of the paper not explicitly named in the text", + "source_ids": [ + 213 + ] + }, + { + "entity_name": "question complexity", + "entity_type": "TASK_OR_PROBLEM", + "description": "question complexity is the factor through which adaptive rag learns to adapt models", + "source_ids": [ + 213 + ] + }, + { + "entity_name": "learning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "learning is the process by which adaptive rag adapts to question complexity", + "source_ids": [ + 213 + ] + } + ], + "relations": [ + { + "src_entity_name": "soyeong jeong", + "tgt_entity_name": "adaptive rag", + "relation_name": "", + "weight": 9.0, + "description": "soyeong jeong is an author of the work on adaptive rag", + "source_ids": [ + 213 + ] + }, + { + "src_entity_name": "jinheon baek", + "tgt_entity_name": "adaptive rag", + "relation_name": "", + "weight": 9.0, + "description": "jinheon baek is an author of the work on adaptive rag", + "source_ids": [ + 213 + ] + }, + { + "src_entity_name": "adaptive rag", + "tgt_entity_name": "retrieval augmented large language models", + "relation_name": "", + "weight": 10.0, + "description": "adaptive rag is designed to adapt retrieval augmented large language models", + "source_ids": [ + 213 + ] + }, + { + "src_entity_name": "soyeong jeong", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "soyeong jeong s work was published on arxiv", + "source_ids": [ + 213 + ] + }, + { + "src_entity_name": "jinheon baek", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 8.0, + "description": "jinheon baek s work was published on arxiv", + "source_ids": [ + 213 + ] + }, + { + "src_entity_name": "adaptive rag", + "tgt_entity_name": "arxiv", + "relation_name": "", + "weight": 9.0, + "description": "the adaptive rag preprint is hosted on arxiv", + "source_ids": [ + 213 + ] + }, + 
{ + "src_entity_name": "soyeong jeong", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 9.0, + "description": "soyeong jeong is listed alongside et al as authors of the paper", + "source_ids": [ + 213 + ] + }, + { + "src_entity_name": "jinheon baek", + "tgt_entity_name": "et al", + "relation_name": "", + "weight": 9.0, + "description": "jinheon baek is listed alongside et al as authors of the paper", + "source_ids": [ + 213 + ] + }, + { + "src_entity_name": "adaptive rag", + "tgt_entity_name": "question complexity", + "relation_name": "", + "weight": 10.0, + "description": "adaptive rag adapts specifically through the lens of question complexity", + "source_ids": [ + 213 + ] + }, + { + "src_entity_name": "adaptive rag", + "tgt_entity_name": "learning", + "relation_name": "", + "weight": 8.0, + "description": "adaptive rag utilizes learning to adapt its retrieval mechanisms", + "source_ids": [ + 213 + ] + } + ], + "node_idx": 213 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_214.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_214.json new file mode 100644 index 0000000..298bd8f --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_214.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "13", + "entity_type": "NUMBER", + "description": "13 is a number mentioned in the text though its specific context or role is not defined", + "source_ids": [ + 214 + ] + } + ], + "relations": [], + "node_idx": 214 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_215.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_215.json new file mode 100644 index 0000000..9dea28c --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_215.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "table: node 215...", + "entity_type": "TABLE", + "description": "A table with no available description.", + "source_ids": [ + 215 + ] + 
} + ], + "relations": [], + "node_idx": 215 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_216.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_216.json new file mode 100644 index 0000000..be743f5 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_216.json @@ -0,0 +1,537 @@ +{ + "entities": [ + { + "entity_name": "timo schick", + "entity_type": "PERSON", + "description": "timo schick is listed as one of the authors of the document", + "source_ids": [ + 216 + ] + }, + { + "entity_name": "jane dwivedi yu", + "entity_type": "PERSON", + "description": "jane dwivedi yu is listed as one of the authors of the document", + "source_ids": [ + 216 + ] + }, + { + "entity_name": "roberto dess", + "entity_type": "PERSON", + "description": "roberto dess is listed as one of the authors of the document", + "source_ids": [ + 216 + ] + }, + { + "entity_name": "roberta raileanu", + "entity_type": "PERSON", + "description": "roberta raileanu is listed as one of the authors of the document", + "source_ids": [ + 216 + ] + }, + { + "entity_name": "maria lomeli", + "entity_type": "PERSON", + "description": "maria lomeli is listed as one of the authors of the document", + "source_ids": [ + 216 + ] + }, + { + "entity_name": "eric hambro", + "entity_type": "PERSON", + "description": "eric hambro is listed as one of the authors of the document", + "source_ids": [ + 216 + ] + }, + { + "entity_name": "luke zettlemoyer", + "entity_type": "PERSON", + "description": "luke zettlemoyer is listed as one of the authors of the document", + "source_ids": [ + 216 + ] + }, + { + "entity_name": "nicola cancedda", + "entity_type": "PERSON", + "description": "nicola cancedda is listed as one of the authors of the document", + "source_ids": [ + 216 + ] + }, + { + "entity_name": "thomas scialom", + "entity_type": "PERSON", + "description": "thomas scialom is listed as one of the authors of the document", + "source_ids": [ + 216 + ] + }, + 
{ + "entity_name": "2024", + "entity_type": "DATE", + "description": "2024 is the year associated with the publication or work by the listed authors", + "source_ids": [ + 216 + ] + } + ], + "relations": [ + { + "src_entity_name": "timo schick", + "tgt_entity_name": "jane dwivedi yu", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "timo schick", + "tgt_entity_name": "roberto dess", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "timo schick", + "tgt_entity_name": "roberta raileanu", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "timo schick", + "tgt_entity_name": "maria lomeli", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "timo schick", + "tgt_entity_name": "eric hambro", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "timo schick", + "tgt_entity_name": "luke zettlemoyer", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "timo schick", + "tgt_entity_name": "nicola cancedda", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "timo schick", + "tgt_entity_name": "thomas scialom", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "jane dwivedi yu", + "tgt_entity_name": "roberto dess", + 
"relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "jane dwivedi yu", + "tgt_entity_name": "roberta raileanu", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "jane dwivedi yu", + "tgt_entity_name": "maria lomeli", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "jane dwivedi yu", + "tgt_entity_name": "eric hambro", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "jane dwivedi yu", + "tgt_entity_name": "luke zettlemoyer", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "jane dwivedi yu", + "tgt_entity_name": "nicola cancedda", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "jane dwivedi yu", + "tgt_entity_name": "thomas scialom", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "roberto dess", + "tgt_entity_name": "roberta raileanu", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "roberto dess", + "tgt_entity_name": "maria lomeli", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "roberto dess", + "tgt_entity_name": "eric hambro", + "relation_name": "", + "weight": 8.0, + 
"description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "roberto dess", + "tgt_entity_name": "luke zettlemoyer", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "roberto dess", + "tgt_entity_name": "nicola cancedda", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "roberto dess", + "tgt_entity_name": "thomas scialom", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "roberta raileanu", + "tgt_entity_name": "maria lomeli", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "roberta raileanu", + "tgt_entity_name": "eric hambro", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "roberta raileanu", + "tgt_entity_name": "luke zettlemoyer", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "roberta raileanu", + "tgt_entity_name": "nicola cancedda", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "roberta raileanu", + "tgt_entity_name": "thomas scialom", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "maria lomeli", + "tgt_entity_name": "eric hambro", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the 
same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "maria lomeli", + "tgt_entity_name": "luke zettlemoyer", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "maria lomeli", + "tgt_entity_name": "nicola cancedda", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "maria lomeli", + "tgt_entity_name": "thomas scialom", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "eric hambro", + "tgt_entity_name": "luke zettlemoyer", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "eric hambro", + "tgt_entity_name": "nicola cancedda", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "eric hambro", + "tgt_entity_name": "thomas scialom", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "luke zettlemoyer", + "tgt_entity_name": "nicola cancedda", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "luke zettlemoyer", + "tgt_entity_name": "thomas scialom", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "nicola cancedda", + "tgt_entity_name": "thomas scialom", + "relation_name": "", + "weight": 8.0, + "description": "listed as co authors on the same document", + "source_ids": [ + 216 + ] + }, 
+ { + "src_entity_name": "timo schick", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "timo schick is an author of the work published in 2024", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "jane dwivedi yu", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "jane dwivedi yu is an author of the work published in 2024", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "roberto dess", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "roberto dess is an author of the work published in 2024", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "roberta raileanu", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "roberta raileanu is an author of the work published in 2024", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "maria lomeli", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "maria lomeli is an author of the work published in 2024", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "eric hambro", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "eric hambro is an author of the work published in 2024", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "luke zettlemoyer", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "luke zettlemoyer is an author of the work published in 2024", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "nicola cancedda", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "nicola cancedda is an author of the work published in 2024", + "source_ids": [ + 216 + ] + }, + { + "src_entity_name": "thomas scialom", + "tgt_entity_name": "2024", + "relation_name": "", + "weight": 9.0, + "description": "thomas scialom is an author of the work published in 2024", + "source_ids": [ + 216 + ] + } + 
], + "node_idx": 216 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_217.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_217.json new file mode 100644 index 0000000..ce64aa5 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_217.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "table: node 217...", + "entity_type": "TABLE", + "description": "A table with no available description.", + "source_ids": [ + 217 + ] + } + ], + "relations": [], + "node_idx": 217 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_218.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_218.json new file mode 100644 index 0000000..4885642 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_218.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "table: node 218...", + "entity_type": "TABLE", + "description": "A table with no available description.", + "source_ids": [ + 218 + ] + } + ], + "relations": [], + "node_idx": 218 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_219.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_219.json new file mode 100644 index 0000000..4319f0f --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_219.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "14", + "entity_type": "MEASUREMENT", + "description": "14 is a numerical value mentioned in the text potentially representing a measurement or count", + "source_ids": [ + 219 + ] + } + ], + "relations": [], + "node_idx": 219 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_22.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_22.json new file mode 100644 index 0000000..56c42a1 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_22.json @@ -0,0 +1,207 @@ +{ + "entities": [ + { + "entity_name": "bookindex", + 
"entity_type": "PRODUCT", + "description": "bookindex is a system or component that serves as the foundation for the described agent based retrieval approach", + "source_ids": [ + 22 + ] + }, + { + "entity_name": "information foraging theory", + "entity_type": "SCIENTIFIC_THEORY", + "description": "information foraging theory is the theoretical framework grounding the retrieval process described in the text", + "source_ids": [ + 22 + ] + }, + { + "entity_name": "selector", + "entity_type": "SOFTWARE", + "description": "selector is a component used to narrow down the search space via information scents", + "source_ids": [ + 22 + ] + }, + { + "entity_name": "reasoner", + "entity_type": "SOFTWARE", + "description": "reasoner is a component used to locate highly relevant evidence", + "source_ids": [ + 22 + ] + }, + { + "entity_name": "user queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "user queries are the input items that the agent classifies based on intent and complexity", + "source_ids": [ + 22 + ] + }, + { + "entity_name": "retrieval workflows", + "entity_type": "TASK_OR_PROBLEM", + "description": "retrieval workflows are the static processes being addressed and dynamically generated by the agent", + "source_ids": [ + 22 + ] + }, + { + "entity_name": "search space", + "entity_type": "TASK_OR_PROBLEM", + "description": "the search space is the area narrowed down by the selector component", + "source_ids": [ + 22 + ] + }, + { + "entity_name": "information scents", + "entity_type": "CONCEPT", + "description": "information scents are the signals used by the selector to narrow down the search space", + "source_ids": [ + 22 + ] + }, + { + "entity_name": "evidence", + "entity_type": "CONCEPT", + "description": "evidence refers to the highly relevant information located by the reasoner", + "source_ids": [ + 22 + ] + }, + { + "entity_name": "agent", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 22 + ] + } + ], + "relations": [ + { + 
"src_entity_name": "bookindex", + "tgt_entity_name": "selector", + "relation_name": "", + "weight": 8.0, + "description": "the system builds upon bookindex to implement an agent that uses selector for retrieval workflows", + "source_ids": [ + 22 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 8.0, + "description": "the system builds upon bookindex to implement an agent that uses reasoner for retrieval workflows", + "source_ids": [ + 22 + ] + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "selector", + "relation_name": "", + "weight": 9.0, + "description": "the retrieval process using selector is grounded in information foraging theory", + "source_ids": [ + 22 + ] + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 9.0, + "description": "the retrieval process using reasoner is grounded in information foraging theory", + "source_ids": [ + 22 + ] + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 9.0, + "description": "selector and reasoner work together within the agent based retrieval process to narrow search space and locate evidence", + "source_ids": [ + 22 + ] + }, + { + "src_entity_name": "user queries", + "tgt_entity_name": "retrieval workflows", + "relation_name": "", + "weight": 9.0, + "description": "user queries are classified to dynamically generate tailored retrieval workflows", + "source_ids": [ + 22 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "user queries", + "relation_name": "", + "weight": 10.0, + "description": "the agent classifies user queries based on their intent and complexity", + "source_ids": [ + 22 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "retrieval workflows", + "relation_name": "", + "weight": 10.0, + "description": "the agent dynamically generates tailored 
retrieval workflows", + "source_ids": [ + 22 + ] + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "search space", + "relation_name": "", + "weight": 9.0, + "description": "the selector narrows down the search space", + "source_ids": [ + 22 + ] + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "information scents", + "relation_name": "", + "weight": 9.0, + "description": "the selector uses information scents to narrow down the search space", + "source_ids": [ + 22 + ] + }, + { + "src_entity_name": "reasoner", + "tgt_entity_name": "evidence", + "relation_name": "", + "weight": 9.0, + "description": "the reasoner locates highly relevant evidence", + "source_ids": [ + 22 + ] + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "retrieval workflows", + "relation_name": "", + "weight": 8.0, + "description": "the retrieval process mimics foraging as described by information foraging theory", + "source_ids": [ + 22 + ] + } + ], + "node_idx": 22 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_220.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_220.json new file mode 100644 index 0000000..d1487f9 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_220.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "a experimental details", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of the main paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section provides the specific configuration, setup, and parameters used to conduct the experiments described in the study.", + "source_ids": [ + 220 + ] + } + ], + "relations": [], + "node_idx": 220 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_221.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_221.json new file mode 100644 index 0000000..f4274b4 --- 
/dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_221.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "a.1 evaluation metrics", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Experimental Details' within the paper 'BookRAG', this section defines the specific quantitative measures used to assess the performance of the retrieval-augmented generation system.", + "source_ids": [ + 221 + ] + }, + { + "entity_name": "accuracy", + "entity_type": "EVALUATION_METRIC", + "description": "A primary metric discussed in section A.1, defined as the proportion of cases where the set of named entities in the model's response is a subset of those in the ground truth.", + "source_ids": [ + 221 + ] + } + ], + "relations": [ + { + "src_entity_name": "accuracy", + "tgt_entity_name": "a.1 evaluation metrics", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Accuracy' is a specific evaluation metric detailed as a topic within section A.1.", + "source_ids": [ + 221 + ] + } + ], + "node_idx": 221 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_222.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_222.json new file mode 100644 index 0000000..d84e405 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_222.json @@ -0,0 +1,89 @@ +{ + "entities": [ + { + "entity_name": "main experiments", + "entity_type": "EVENT", + "description": "main experiments are the primary experiments for which metrics are defined and calculated in the text", + "source_ids": [ + 222 + ] + }, + { + "entity_name": "metrics", + "entity_type": "EVALUATION_METRIC", + "description": "metrics are the specific measures defined and calculated in the text for the main experiments", + "source_ids": [ + 222 + ] + }, + { + "entity_name": "definitions", + "entity_type": "CONCEPT", + "description": "definitions are the detailed descriptions provided for the metrics in the text", + 
"source_ids": [ + 222 + ] + }, + { + "entity_name": "calculation procedures", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "calculation procedures are the step by step methods described for computing the metrics", + "source_ids": [ + 222 + ] + } + ], + "relations": [ + { + "src_entity_name": "main experiments", + "tgt_entity_name": "main experiments", + "relation_name": "", + "weight": 1.0, + "description": "the text refers to the metrics used in the main experiments implying a self referential context for the definitions", + "source_ids": [ + 222 + ] + }, + { + "src_entity_name": "metrics", + "tgt_entity_name": "main experiments", + "relation_name": "", + "weight": 10.0, + "description": "metrics are explicitly stated to be used in the main experiments", + "source_ids": [ + 222 + ] + }, + { + "src_entity_name": "definitions", + "tgt_entity_name": "metrics", + "relation_name": "", + "weight": 9.0, + "description": "definitions are provided for the metrics", + "source_ids": [ + 222 + ] + }, + { + "src_entity_name": "calculation procedures", + "tgt_entity_name": "metrics", + "relation_name": "", + "weight": 9.0, + "description": "calculation procedures are provided for the metrics", + "source_ids": [ + 222 + ] + }, + { + "src_entity_name": "definitions", + "tgt_entity_name": "calculation procedures", + "relation_name": "", + "weight": 8.0, + "description": "both definitions and calculation procedures are provided together for the metrics in the text", + "source_ids": [ + 222 + ] + } + ], + "node_idx": 222 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_223.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_223.json new file mode 100644 index 0000000..851d8da --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_223.json @@ -0,0 +1,133 @@ +{ + "entities": [ + { + "entity_name": "standard rag models", + "entity_type": "TECHNOLOGY", + "description": "standard rag models are described as 
systems that generate free form natural language responses", + "source_ids": [ + 223 + ] + }, + { + "entity_name": "natural language responses", + "entity_type": "PRODUCT", + "description": "natural language responses are the output generated by standard rag models often containing extraneous conversational text", + "source_ids": [ + 223 + ] + }, + { + "entity_name": "ground truth labels", + "entity_type": "PRODUCT", + "description": "ground truth labels are concise reference answers e g option a or 12 5 used for comparison against model outputs", + "source_ids": [ + 223 + ] + }, + { + "entity_name": "a 1 1 answer extraction and normalization", + "entity_type": "SECTION_TITLE", + "description": "a 1 1 answer extraction and normalization is the title of the section discussing the process of extracting and normalizing answers", + "source_ids": [ + 223 + ] + }, + { + "entity_name": "option a", + "entity_type": "PRODUCT", + "description": "option a is an example of a concise ground truth label mentioned in the text", + "source_ids": [ + 223 + ] + }, + { + "entity_name": "12 5", + "entity_type": "MEASUREMENT", + "description": "12 5 is an example of a concise ground truth label mentioned in the text", + "source_ids": [ + 223 + ] + }, + { + "entity_name": "the answer is", + "entity_type": "PRODUCT", + "description": "the answer is is an example of extraneous conversational text that may appear in raw model outputs", + "source_ids": [ + 223 + ] + } + ], + "relations": [ + { + "src_entity_name": "standard rag models", + "tgt_entity_name": "natural language responses", + "relation_name": "", + "weight": 10.0, + "description": "standard rag models generate natural language responses as their output", + "source_ids": [ + 223 + ] + }, + { + "src_entity_name": "natural language responses", + "tgt_entity_name": "ground truth labels", + "relation_name": "", + "weight": 8.0, + "description": "natural language responses are compared against ground truth labels a process that can 
lead to false negatives if not normalized", + "source_ids": [ + 223 + ] + }, + { + "src_entity_name": "a 1 1 answer extraction and normalization", + "tgt_entity_name": "standard rag models", + "relation_name": "", + "weight": 9.0, + "description": "the section a 1 1 answer extraction and normalization describes the behavior of standard rag models", + "source_ids": [ + 223 + ] + }, + { + "src_entity_name": "a 1 1 answer extraction and normalization", + "tgt_entity_name": "ground truth labels", + "relation_name": "", + "weight": 9.0, + "description": "the section a 1 1 answer extraction and normalization discusses the comparison with ground truth labels", + "source_ids": [ + 223 + ] + }, + { + "src_entity_name": "ground truth labels", + "tgt_entity_name": "option a", + "relation_name": "", + "weight": 10.0, + "description": "option a is cited as an example of a ground truth label", + "source_ids": [ + 223 + ] + }, + { + "src_entity_name": "ground truth labels", + "tgt_entity_name": "12 5", + "relation_name": "", + "weight": 10.0, + "description": "12 5 is cited as an example of a ground truth label", + "source_ids": [ + 223 + ] + }, + { + "src_entity_name": "natural language responses", + "tgt_entity_name": "the answer is", + "relation_name": "", + "weight": 10.0, + "description": "the answer is is cited as an example of the extraneous conversational text found in natural language responses", + "source_ids": [ + 223 + ] + } + ], + "node_idx": 223 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_224.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_224.json new file mode 100644 index 0000000..4f77ed2 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_224.json @@ -0,0 +1,285 @@ +{ + "entities": [ + { + "entity_name": "llm based extraction step", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "llm based extraction step is a method used to align model output with the ground truth format 
before calculation", + "source_ids": [ + 224 + ] + }, + { + "entity_name": "rag system", + "entity_type": "SYSTEM", + "description": "rag system is the system that generates the raw response denoted as y raw", + "source_ids": [ + 224 + ] + }, + { + "entity_name": "llmextract", + "entity_type": "SOFTWARE", + "description": "llmextract is a component or function that extracts key information from the raw response", + "source_ids": [ + 224 + ] + }, + { + "entity_name": "y raw", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "y raw denotes the raw response generated by the rag system", + "source_ids": [ + 224 + ] + }, + { + "entity_name": "y gold", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "y gold denotes the ground truth", + "source_ids": [ + 224 + ] + }, + { + "entity_name": "y hat", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "y hat denotes the extracted answer", + "source_ids": [ + 224 + ] + }, + { + "entity_name": "n", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "n is a standard normalization function applied to y hat and y gold", + "source_ids": [ + 224 + ] + }, + { + "entity_name": "equation 16", + "entity_type": "EQUATION_OR_FORMULA", + "description": "equation 16 defines the relationship between the extracted answer the raw response and the instruction", + "source_ids": [ + 224 + ] + }, + { + "entity_name": "official evaluation protocols", + "entity_type": "TASK_OR_PROBLEM", + "description": "official evaluation protocols are the standards followed to ensure the extraction step aligns with the ground truth format", + "source_ids": [ + 224 + ] + }, + { + "entity_name": "ground truth", + "entity_type": "CONCEPT", + "description": "ground truth refers to the correct or expected answer used as a benchmark for evaluation", + "source_ids": [ + 224 + ] + }, + { + "entity_name": "key information", + "entity_type": "CONCEPT", + "description": "key information refers to the essential data such as key entities for 
span extraction that llmextract retrieves", + "source_ids": [ + 224 + ] + }, + { + "entity_name": "key entity", + "entity_type": "CONCEPT", + "description": "key entity is an example of the key information extracted for span extraction", + "source_ids": [ + 224 + ] + }, + { + "entity_name": "span extraction", + "entity_type": "TASK_OR_PROBLEM", + "description": "span extraction is a specific task mentioned as an example of where key entities are extracted", + "source_ids": [ + 224 + ] + }, + { + "entity_name": "lowercasing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "lowercasing is a standard normalization technique applied to the text", + "source_ids": [ + 224 + ] + }, + { + "entity_name": "removing punctuation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "removing punctuation is a standard normalization technique applied to the text", + "source_ids": [ + 224 + ] + }, + { + "entity_name": "instruction", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "instruction is a parameter provided to the llmextract function to guide the extraction process", + "source_ids": [ + 224 + ] + } + ], + "relations": [ + { + "src_entity_name": "llm based extraction step", + "tgt_entity_name": "rag system", + "relation_name": "", + "weight": 9.0, + "description": "the llm based extraction step is employed to process the output from the rag system", + "source_ids": [ + 224 + ] + }, + { + "src_entity_name": "llmextract", + "tgt_entity_name": "y raw", + "relation_name": "", + "weight": 10.0, + "description": "llmextract extracts key information from y raw", + "source_ids": [ + 224 + ] + }, + { + "src_entity_name": "llmextract", + "tgt_entity_name": "y hat", + "relation_name": "", + "weight": 10.0, + "description": "llmextract is used to define the extracted answer y hat", + "source_ids": [ + 224 + ] + }, + { + "src_entity_name": "n", + "tgt_entity_name": "y hat", + "relation_name": "", + "weight": 9.0, + "description": "n is applied to normalize y 
hat", + "source_ids": [ + 224 + ] + }, + { + "src_entity_name": "n", + "tgt_entity_name": "y gold", + "relation_name": "", + "weight": 9.0, + "description": "n is applied to normalize y gold", + "source_ids": [ + 224 + ] + }, + { + "src_entity_name": "y raw", + "tgt_entity_name": "y gold", + "relation_name": "", + "weight": 8.0, + "description": "y raw and y gold are compared after normalization to calculate the evaluation metric", + "source_ids": [ + 224 + ] + }, + { + "src_entity_name": "equation 16", + "tgt_entity_name": "llmextract", + "relation_name": "", + "weight": 9.0, + "description": "equation 16 utilizes llmextract to define the extracted answer", + "source_ids": [ + 224 + ] + }, + { + "src_entity_name": "official evaluation protocols", + "tgt_entity_name": "llm based extraction step", + "relation_name": "", + "weight": 9.0, + "description": "the llm based extraction step is employed following official evaluation protocols", + "source_ids": [ + 224 + ] + }, + { + "src_entity_name": "llm based extraction step", + "tgt_entity_name": "ground truth", + "relation_name": "", + "weight": 9.0, + "description": "the llm based extraction step aligns the model output with the ground truth format", + "source_ids": [ + 224 + ] + }, + { + "src_entity_name": "llmextract", + "tgt_entity_name": "key information", + "relation_name": "", + "weight": 10.0, + "description": "llmextract is responsible for extracting key information from the raw response", + "source_ids": [ + 224 + ] + }, + { + "src_entity_name": "llmextract", + "tgt_entity_name": "key entity", + "relation_name": "", + "weight": 8.0, + "description": "key entity is a specific type of key information extracted by llmextract", + "source_ids": [ + 224 + ] + }, + { + "src_entity_name": "llmextract", + "tgt_entity_name": "span extraction", + "relation_name": "", + "weight": 8.0, + "description": "span extraction is the context in which llmextract extracts key entities", + "source_ids": [ + 224 + ] + }, + { + 
"src_entity_name": "n", + "tgt_entity_name": "lowercasing", + "relation_name": "", + "weight": 9.0, + "description": "lowercasing is an example of the standard normalization n applied to the data", + "source_ids": [ + 224 + ] + }, + { + "src_entity_name": "n", + "tgt_entity_name": "removing punctuation", + "relation_name": "", + "weight": 9.0, + "description": "removing punctuation is an example of the standard normalization n applied to the data", + "source_ids": [ + 224 + ] + }, + { + "src_entity_name": "llmextract", + "tgt_entity_name": "instruction", + "relation_name": "", + "weight": 10.0, + "description": "llmextract uses the instruction parameter to perform the extraction", + "source_ids": [ + 224 + ] + } + ], + "node_idx": 224 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_225.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_225.json new file mode 100644 index 0000000..d9c3fd4 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_225.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (16)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the predicted output y_hat as a function of raw input and instruction. 
LaTeX: ˆ 𝑦 = LLMextract ( 𝑦 𝑟𝑎𝑤 , Instruction ) (16)", + "source_ids": [ + 225 + ] + } + ], + "relations": [], + "node_idx": 225 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_226.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_226.json new file mode 100644 index 0000000..349485e --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_226.json @@ -0,0 +1,105 @@ +{ + "entities": [ + { + "entity_name": "a.1.2 qa performance metrics", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Experimental Details' within the BookRAG paper, this section defines the specific metrics used to evaluate Question Answering performance, detailing the calculation of Accuracy based on substring inclusion between ground truth and model responses.", + "source_ids": [ + 226 + ] + }, + { + "entity_name": "qa performance metrics", + "entity_type": "EVALUATION_METRIC", + "description": "Refers to the set of quantitative measures defined in section A.1.2 for assessing the quality of answers generated by the model.", + "source_ids": [ + 226 + ] + }, + { + "entity_name": "accuracy", + "entity_type": "EVALUATION_METRIC", + "description": "A specific metric defined in section A.1.2 that calculates correctness based on whether the normalized ground truth is a substring of the normalized raw response.", + "source_ids": [ + 226 + ] + }, + { + "entity_name": "ground truth (y_gold)", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "The reference answer or expected output used as the baseline for calculating accuracy in section A.1.2.", + "source_ids": [ + 226 + ] + }, + { + "entity_name": "model response (y_raw)", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "The raw output generated by the model, which is compared against the ground truth in section A.1.2.", + "source_ids": [ + 226 + ] + }, + { + "entity_name": "substring inclusion relation", + "entity_type": "METHOD_OR_TECHNIQUE", + 
"description": "The logical operation (denoted by ⊆) used in section A.1.2 to determine if one text sequence is contained within another for the purpose of evaluation.", + "source_ids": [ + 226 + ] + } + ], + "relations": [ + { + "src_entity_name": "qa performance metrics", + "tgt_entity_name": "a.1.2 qa performance metrics", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'QA Performance Metrics' is the primary topic covered in section A.1.2.", + "source_ids": [ + 226 + ] + }, + { + "src_entity_name": "accuracy", + "tgt_entity_name": "a.1.2 qa performance metrics", + "relation_name": "", + "weight": 10.0, + "description": "The metric 'Accuracy' is explicitly defined and detailed within section A.1.2.", + "source_ids": [ + 226 + ] + }, + { + "src_entity_name": "ground truth (y_gold)", + "tgt_entity_name": "a.1.2 qa performance metrics", + "relation_name": "", + "weight": 9.5, + "description": "The variable 'Ground Truth' is a fundamental component used in the definitions provided in section A.1.2.", + "source_ids": [ + 226 + ] + }, + { + "src_entity_name": "model response (y_raw)", + "tgt_entity_name": "a.1.2 qa performance metrics", + "relation_name": "", + "weight": 9.5, + "description": "The variable 'Model Response' is a fundamental component used in the definitions provided in section A.1.2.", + "source_ids": [ + 226 + ] + }, + { + "src_entity_name": "substring inclusion relation", + "tgt_entity_name": "a.1.2 qa performance metrics", + "relation_name": "", + "weight": 9.0, + "description": "The technique 'Substring Inclusion Relation' is the core logic applied in section A.1.2 to compute the metrics.", + "source_ids": [ + 226 + ] + } + ], + "node_idx": 226 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_227.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_227.json new file mode 100644 index 0000000..fff2ceb --- /dev/null +++ 
b/e2e_test_output/kg_extractor_res/kg_extractor_res_227.json @@ -0,0 +1,107 @@ +{ + "entities": [ + { + "entity_name": "accuracy inclusion based", + "entity_type": "EVALUATION_METRIC", + "description": "accuracy inclusion based is a soft match metric used to evaluate model predictions by checking if the normalized gold answer is included in the generated response", + "source_ids": [ + 227 + ] + }, + { + "entity_name": "llm", + "entity_type": "TECHNOLOGY", + "description": "llm refers to large language models the type of technology whose generation nature is described as uncontrollable in the text", + "source_ids": [ + 227 + ] + }, + { + "entity_name": "prior works", + "entity_type": "PUBLICATION_VENUE", + "description": "prior works refer to previous research studies cited in the text as a basis for the methodology", + "source_ids": [ + 227 + ] + }, + { + "entity_name": "3", + "entity_type": "PUBLICATION_VENUE", + "description": "3 is a citation number referring to a specific prior work mentioned in the text", + "source_ids": [ + 227 + ] + }, + { + "entity_name": "34", + "entity_type": "PUBLICATION_VENUE", + "description": "34 is a citation number referring to a specific prior work mentioned in the text", + "source_ids": [ + 227 + ] + }, + { + "entity_name": "46", + "entity_type": "PUBLICATION_VENUE", + "description": "46 is a citation number referring to a specific prior work mentioned in the text", + "source_ids": [ + 227 + ] + }, + { + "entity_name": "soft match metric", + "entity_type": "EVALUATION_METRIC", + "description": "soft match metric is a category of evaluation methods described as being used in the text", + "source_ids": [ + 227 + ] + }, + { + "entity_name": "normalized gold answer", + "entity_type": "DATASET_OR_CORPUS", + "description": "normalized gold answer is the reference data used to determine if a prediction is correct", + "source_ids": [ + 227 + ] + }, + { + "entity_name": "model s generated response", + "entity_type": "PRODUCT", + 
"description": "model s generated response is the output produced by the model being evaluated", + "source_ids": [ + 227 + ] + }, + { + "entity_name": "strict exact match", + "entity_type": "EVALUATION_METRIC", + "description": "strict exact match is a comparison method explicitly contrasted with the soft match metric in the text", + "source_ids": [ + 227 + ] + } + ], + "relations": [ + { + "src_entity_name": "accuracy inclusion based", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "accuracy inclusion based is utilized to account for the uncontrollable nature of llm generation", + "source_ids": [ + 227 + ] + }, + { + "src_entity_name": "accuracy inclusion based", + "tgt_entity_name": "prior works", + "relation_name": "", + "weight": 9.0, + "description": "accuracy inclusion based is utilized following prior works cited as 3 34 46", + "source_ids": [ + 227 + ] + } + ], + "node_idx": 227 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_228.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_228.json new file mode 100644 index 0000000..9ee5a17 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_228.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (17)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the Accuracy metric as the average of an indicator function comparing neighborhood sets. 
LaTeX: Accuracy = 1 𝑁 𝑁 ∑︁ 𝑖 = 1 I (N( 𝑦 𝑔𝑜𝑙𝑑,𝑖 ) ⊆ N( 𝑦 𝑟𝑎𝑤,𝑖 )) (17)", + "source_ids": [ + 228 + ] + } + ], + "relations": [], + "node_idx": 228 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_229.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_229.json new file mode 100644 index 0000000..2dc3ec7 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_229.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "exact match", + "entity_type": "EVALUATION_METRIC", + "description": "exact match is a strict metric that measures whether the normalized extracted answer is character for character identical to the ground truth", + "source_ids": [ + 229 + ] + }, + { + "entity_name": "accuracy", + "entity_type": "EVALUATION_METRIC", + "description": "accuracy is mentioned as a metric that is contrasted with exact match implying it is less strict", + "source_ids": [ + 229 + ] + } + ], + "relations": [ + { + "src_entity_name": "exact match", + "tgt_entity_name": "accuracy", + "relation_name": "", + "weight": 9.0, + "description": "exact match is described as being stricter than accuracy", + "source_ids": [ + 229 + ] + } + ], + "node_idx": 229 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_23.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_23.json new file mode 100644 index 0000000..ba307da --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_23.json @@ -0,0 +1,183 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a system or product being evaluated for its effectiveness and efficiency in retrieval and qa tasks", + "source_ids": [ + 23 + ] + }, + { + "entity_name": "kg", + "entity_type": "PRODUCT", + "description": "kg refers to a high quality knowledge graph identified as a key feature contributing to the system s performance", + "source_ids": [ + 23 + ] + }, + { + 
"entity_name": "agent based retrieval mechanism", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the agent based retrieval mechanism is a key feature of the system validated for its critical contributions", + "source_ids": [ + 23 + ] + }, + { + "entity_name": "retrieval recall", + "entity_type": "EVALUATION_METRIC", + "description": "retrieval recall is a metric used to measure the performance of the system", + "source_ids": [ + 23 + ] + }, + { + "entity_name": "qa accuracy", + "entity_type": "EVALUATION_METRIC", + "description": "qa accuracy is a metric used to measure the performance of the system", + "source_ids": [ + 23 + ] + }, + { + "entity_name": "three widely adopted datasets", + "entity_type": "DATASET_OR_CORPUS", + "description": "three widely adopted datasets are the data sources used to conduct extensive experiments and validate the system", + "source_ids": [ + 23 + ] + }, + { + "entity_name": "state of the art baselines", + "entity_type": "PRODUCT", + "description": "state of the art baselines are the existing systems against which bookrag is compared in the experiments", + "source_ids": [ + 23 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "kg", + "relation_name": "", + "weight": 9.0, + "description": "bookrag utilizes a high quality kg as a key feature contributing to its performance", + "source_ids": [ + 23 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "agent based retrieval mechanism", + "relation_name": "", + "weight": 9.0, + "description": "bookrag employs an agent based retrieval mechanism as a key feature contributing to its performance", + "source_ids": [ + 23 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 10.0, + "description": "bookrag achieves superior performance in retrieval recall as demonstrated by experimental results", + "source_ids": [ + 23 + ] + }, + { + "src_entity_name": "bookrag", + 
"tgt_entity_name": "qa accuracy", + "relation_name": "", + "weight": 10.0, + "description": "bookrag achieves superior performance in qa accuracy as demonstrated by experimental results", + "source_ids": [ + 23 + ] + }, + { + "src_entity_name": "kg", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 7.0, + "description": "the high quality kg is a feature that contributes to the performance in retrieval recall", + "source_ids": [ + 23 + ] + }, + { + "src_entity_name": "agent based retrieval mechanism", + "tgt_entity_name": "qa accuracy", + "relation_name": "", + "weight": 7.0, + "description": "the agent based retrieval mechanism is a feature that contributes to the performance in qa accuracy", + "source_ids": [ + 23 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "three widely adopted datasets", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is extensively experimented upon using three widely adopted datasets to validate its effectiveness", + "source_ids": [ + 23 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "state of the art baselines", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is compared against several state of the art baselines to demonstrate its superior performance", + "source_ids": [ + 23 + ] + }, + { + "src_entity_name": "three widely adopted datasets", + "tgt_entity_name": "retrieval recall", + "relation_name": "", + "weight": 8.0, + "description": "the three widely adopted datasets are used to measure the retrieval recall performance of the system", + "source_ids": [ + 23 + ] + }, + { + "src_entity_name": "three widely adopted datasets", + "tgt_entity_name": "qa accuracy", + "relation_name": "", + "weight": 8.0, + "description": "the three widely adopted datasets are used to measure the qa accuracy performance of the system", + "source_ids": [ + 23 + ] + }, + { + "src_entity_name": "state of the art baselines", + "tgt_entity_name": "retrieval 
recall", + "relation_name": "", + "weight": 7.0, + "description": "state of the art baselines are evaluated on retrieval recall to compare against bookrag", + "source_ids": [ + 23 + ] + }, + { + "src_entity_name": "state of the art baselines", + "tgt_entity_name": "qa accuracy", + "relation_name": "", + "weight": 7.0, + "description": "state of the art baselines are evaluated on qa accuracy to compare against bookrag", + "source_ids": [ + 23 + ] + } + ], + "node_idx": 23 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_230.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_230.json new file mode 100644 index 0000000..3836b28 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_230.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (18)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the Error Metric (EM) as the average of indicator functions comparing predicted and ground truth labels. 
LaTeX: EM = 1 𝑁 𝑁 ∑︁ 𝑖 = 1 I (N( ˆ 𝑦 𝑖 ) = N( 𝑦 𝑔𝑜𝑙𝑑,𝑖 )) (18)", + "source_ids": [ + 230 + ] + } + ], + "relations": [], + "node_idx": 230 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_231.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_231.json new file mode 100644 index 0000000..9cc1b9e --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_231.json @@ -0,0 +1,95 @@ +{ + "entities": [ + { + "entity_name": "f1 score", + "entity_type": "EVALUATION_METRIC", + "description": "f1 score is an evaluation metric used to measure the performance of text span answers by comparing extracted answers to ground truth", + "source_ids": [ + 231 + ] + }, + { + "entity_name": "token level f1 score", + "entity_type": "EVALUATION_METRIC", + "description": "token level f1 score is a specific type of f1 score used for questions requiring text span answers", + "source_ids": [ + 231 + ] + }, + { + "entity_name": "p", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "p represents precision calculated as the intersection of extracted and ground truth tokens divided by the extracted tokens", + "source_ids": [ + 231 + ] + }, + { + "entity_name": "r", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "r represents recall calculated as the intersection of extracted and ground truth tokens divided by the ground truth tokens", + "source_ids": [ + 231 + ] + }, + { + "entity_name": "f1", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "f1 is the harmonic mean of precision p and recall r calculated using the formula 2 p r p r", + "source_ids": [ + 231 + ] + }, + { + "entity_name": "equation 19", + "entity_type": "EQUATION_OR_FORMULA", + "description": "equation 19 defines the calculation for the f1 score based on precision and recall", + "source_ids": [ + 231 + ] + } + ], + "relations": [ + { + "src_entity_name": "f1 score", + "tgt_entity_name": "token level f1 score", + "relation_name": "", + 
"weight": 9.0, + "description": "the token level f1 score is a specific application of the f1 score for text span answers", + "source_ids": [ + 231 + ] + }, + { + "src_entity_name": "f1 score", + "tgt_entity_name": "equation 19", + "relation_name": "", + "weight": 10.0, + "description": "equation 19 provides the mathematical formula for calculating the f1 score", + "source_ids": [ + 231 + ] + }, + { + "src_entity_name": "p", + "tgt_entity_name": "f1", + "relation_name": "", + "weight": 10.0, + "description": "p precision is a component used in the calculation of the f1 score", + "source_ids": [ + 231 + ] + }, + { + "src_entity_name": "r", + "tgt_entity_name": "f1", + "relation_name": "", + "weight": 10.0, + "description": "r recall is a component used in the calculation of the f1 score", + "source_ids": [ + 231 + ] + } + ], + "node_idx": 231 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_232.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_232.json new file mode 100644 index 0000000..2b1d256 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_232.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (19)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining Precision, Recall, and F1 score metrics using set intersections. 
LaTeX: 𝑃 = | 𝑇 ˆ 𝑦 ∩ 𝑇 𝑔𝑜𝑙𝑑 | | 𝑇 ˆ 𝑦 | , 𝑅 = | 𝑇 ˆ 𝑦 ∩ 𝑇 𝑔𝑜𝑙𝑑 | | 𝑇 𝑔𝑜𝑙𝑑 | , F1 = 2 · 𝑃 · 𝑅 𝑃 + 𝑅 (19)", + "source_ids": [ + 232 + ] + } + ], + "relations": [], + "node_idx": 232 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_233.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_233.json new file mode 100644 index 0000000..0a2e027 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_233.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "15", + "entity_type": "MEASUREMENT", + "description": "15 is a numerical value mentioned in the text potentially representing a measurement or count", + "source_ids": [ + 233 + ] + } + ], + "relations": [], + "node_idx": 233 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_234.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_234.json new file mode 100644 index 0000000..b65b11a --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_234.json @@ -0,0 +1,123 @@ +{ + "entities": [ + { + "entity_name": "a.1.3 retrieval recall", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Experimental Details' within the 'BookRAG' paper, this section defines the Retrieval Recall metric used to evaluate retrieval quality based on parsed PDF block granularity (paragraphs, tables, images).", + "source_ids": [ + 234 + ] + }, + { + "entity_name": "retrieval quality", + "entity_type": "EVALUATION_METRIC", + "description": "The specific aspect of system performance being measured in this section, assessed via the granularity of retrieved blocks.", + "source_ids": [ + 234 + ] + }, + { + "entity_name": "pdf blocks", + "entity_type": "DATASET_OR_CORPUS", + "description": "The fundamental units of data (paragraphs, tables, images) from which ground-truth and retrieved sets are constructed for evaluation.", + "source_ids": [ + 234 + ] + }, + { + "entity_name": "query q", + "entity_type": 
"PARAMETER_OR_VARIABLE", + "description": "The input query variable used to define the set of required ground-truth blocks.", + "source_ids": [ + 234 + ] + }, + { + "entity_name": "b_gold", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "The set of manually labeled ground-truth blocks required to answer a given query.", + "source_ids": [ + 234 + ] + }, + { + "entity_name": "b_ret", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "The set of unique blocks retrieved by the system for a given query.", + "source_ids": [ + 234 + ] + }, + { + "entity_name": "recall_ret", + "entity_type": "EVALUATION_METRIC", + "description": "The specific mathematical formula defined in this section to calculate retrieval recall, handling parsing errors.", + "source_ids": [ + 234 + ] + } + ], + "relations": [ + { + "src_entity_name": "retrieval quality", + "tgt_entity_name": "a.1.3 retrieval recall", + "relation_name": "", + "weight": 9.5, + "description": "Retrieval Quality is the primary concept evaluated within section A.1.3.", + "source_ids": [ + 234 + ] + }, + { + "src_entity_name": "pdf blocks", + "tgt_entity_name": "a.1.3 retrieval recall", + "relation_name": "", + "weight": 9.0, + "description": "PDF Blocks serve as the granular units of analysis for the evaluation described in section A.1.3.", + "source_ids": [ + 234 + ] + }, + { + "src_entity_name": "query q", + "tgt_entity_name": "a.1.3 retrieval recall", + "relation_name": "", + "weight": 8.5, + "description": "The variable 'Query q' is a key parameter defined in the context of section A.1.3.", + "source_ids": [ + 234 + ] + }, + { + "src_entity_name": "b_gold", + "tgt_entity_name": "a.1.3 retrieval recall", + "relation_name": "", + "weight": 9.0, + "description": "The variable 'B_gold' represents the ground truth set utilized in the definition provided in section A.1.3.", + "source_ids": [ + 234 + ] + }, + { + "src_entity_name": "b_ret", + "tgt_entity_name": "a.1.3 retrieval recall", + "relation_name": 
"", + "weight": 9.0, + "description": "The variable 'B_ret' represents the retrieved set utilized in the definition provided in section A.1.3.", + "source_ids": [ + 234 + ] + }, + { + "src_entity_name": "recall_ret", + "tgt_entity_name": "a.1.3 retrieval recall", + "relation_name": "", + "weight": 10.0, + "description": "The metric 'Recall_ret' is the central formula and subject explicitly defined in section A.1.3.", + "source_ids": [ + 234 + ] + } + ], + "node_idx": 234 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_235.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_235.json new file mode 100644 index 0000000..ffe5c09 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_235.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (20)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the recall metric r_et as a conditional value based on parsing errors and set intersections. 
LaTeX: Recall 𝑟𝑒𝑡 = ( 0 if parsing error occurs on B 𝑔𝑜𝑙𝑑 | B 𝑟𝑒𝑡 ∩B 𝑔𝑜𝑙𝑑 | | B 𝑔𝑜𝑙𝑑 | otherwise (20)", + "source_ids": [ + 235 + ] + } + ], + "relations": [], + "node_idx": 235 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_236.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_236.json new file mode 100644 index 0000000..6ef9409 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_236.json @@ -0,0 +1,87 @@ +{ + "entities": [ + { + "entity_name": "pdf", + "entity_type": "FILE_TYPE", + "description": "pdf is a file format mentioned in the context of parsing failures", + "source_ids": [ + 236 + ] + }, + { + "entity_name": "ground truth block", + "entity_type": "TASK_OR_PROBLEM", + "description": "a ground truth block is a specific unit of data that may be lost during parsing", + "source_ids": [ + 236 + ] + }, + { + "entity_name": "candidate pool", + "entity_type": "DATASET_OR_CORPUS", + "description": "the candidate pool is a collection of items from which blocks are retrieved", + "source_ids": [ + 236 + ] + }, + { + "entity_name": "recall", + "entity_type": "EVALUATION_METRIC", + "description": "recall is an evaluation metric used to measure the contribution of retrieved blocks", + "source_ids": [ + 236 + ] + }, + { + "entity_name": "0", + "entity_type": "NUMBER", + "description": "0 is the specific numerical value representing the recall contribution when a block is lost", + "source_ids": [ + 236 + ] + } + ], + "relations": [ + { + "src_entity_name": "ground truth block", + "tgt_entity_name": "pdf", + "relation_name": "", + "weight": 9.0, + "description": "a ground truth block can be lost due to pdf parsing failures", + "source_ids": [ + 236 + ] + }, + { + "src_entity_name": "ground truth block", + "tgt_entity_name": "candidate pool", + "relation_name": "", + "weight": 9.0, + "description": "a ground truth block is considered unretrievable if it does not exist in the candidate pool", + 
"source_ids": [ + 236 + ] + }, + { + "src_entity_name": "ground truth block", + "tgt_entity_name": "recall", + "relation_name": "", + "weight": 10.0, + "description": "the loss of a ground truth block results in a recall contribution of 0", + "source_ids": [ + 236 + ] + }, + { + "src_entity_name": "recall", + "tgt_entity_name": "0", + "relation_name": "", + "weight": 10.0, + "description": "the recall contribution is explicitly stated as 0 when a ground truth block is lost", + "source_ids": [ + 236 + ] + } + ], + "node_idx": 236 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_237.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_237.json new file mode 100644 index 0000000..eb27b17 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_237.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "a.2 implementation details", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Experimental Details' and following 'Evaluation Metrics', this section provides the specific technical configurations, software environments, and parameter settings used to realize the BookRAG system.", + "source_ids": [ + 237 + ] + } + ], + "relations": [], + "node_idx": 237 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_238.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_238.json new file mode 100644 index 0000000..0b7eeec --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_238.json @@ -0,0 +1,861 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a system implemented in python for robust document layout parsing", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "python", + "entity_type": "PROGRAMMING_LANGUAGE", + "description": "python is the programming language used to implement bookrag", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "mineru", + 
"entity_type": "SOFTWARE", + "description": "mineru is a tool utilized for robust document layout parsing", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "qwen family", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "the qwen family is a set of state of the art backbone models used to power bookrag and baseline methods", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "llm", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "llm refers to large language models a type of model within the qwen family used in the experiments", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "vlm", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "vlm stands for vision language model a type of model within the qwen family used in the experiments", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "embedding models", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "embedding models are a type of model within the qwen family used for text and multi modal embedding", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "qwen3 8b", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "qwen3 8b is the default llm used in the experiments", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "qwen2 5vl 30b", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "qwen2 5vl 30b is the vision language model vlm used in the experiments", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "qwen3 embedding 0 6b", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "qwen3 embedding 0 6b is the model used for text embedding", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "gme qwen2 vl 2b instruct", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "gme qwen2 vl 2b instruct is the model used for multi modal embedding", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "qwen3 reranker 4b", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "qwen3 reranker 4b is the model used for 
reranking", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "linux", + "entity_type": "SOFTWARE", + "description": "linux is the operating system on which the experiments were conducted", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "intel xeon 2 0ghz cpu", + "entity_type": "HARDWARE", + "description": "intel xeon 2 0ghz cpu is the processor used in the high performance server", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "nvidia geforce rtx a5000", + "entity_type": "HARDWARE", + "description": "nvidia geforce rtx a5000 is the gpu model used in the high performance server", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "1024gb", + "entity_type": "MEASUREMENT", + "description": "1024gb refers to the amount of memory in the server", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "24 gb", + "entity_type": "MEASUREMENT", + "description": "24 gb refers to the vram capacity of each gpu", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "500 tokens", + "entity_type": "MEASUREMENT", + "description": "500 tokens is the standardized chunk size used for document chunking", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "10", + "entity_type": "MEASUREMENT", + "description": "10 is the retrieval top k value set for consistent candidate pool sizes", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "10b parameter scale", + "entity_type": "MEASUREMENT", + "description": "10b parameter scale is the size range of models primarily selected to balance efficiency and effectiveness", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "30b version", + "entity_type": "MEASUREMENT", + "description": "the 30b version refers to the specific size of the vlm adopted due to performance deficits in the 8b counterpart", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "8b counterpart", + "entity_type": "MEASUREMENT", + "description": "the 8b counterpart refers to the smaller version of the vlm that exhibited significant performance 
deficits", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "github repository", + "entity_type": "LOCATION", + "description": "the github repository is the location where source code and implementation configurations are publicly available", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "https github com sam234990 bookrag", + "entity_type": "LOCATION", + "description": "https github com sam234990 bookrag is the specific url of the repository", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "sam234990", + "entity_type": "PERSON", + "description": "sam234990 is the username associated with the github repository where the source code is hosted", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "baseline methods", + "entity_type": "TASK_OR_PROBLEM", + "description": "baseline methods refer to the existing methods used for fair comparison against bookrag", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "ground truth images", + "entity_type": "IMAGE", + "description": "ground truth images are the correct reference images provided to the models during evaluation", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "document chunking", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "document chunking is a technique used to split documents into smaller parts for processing", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "retrieval ranking", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "retrieval ranking is a technique used to order retrieved candidates based on relevance", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "sequential processing mode", + "entity_type": "TASK_OR_PROBLEM", + "description": "sequential processing mode is the execution mode used to ensure fair comparison of efficiency", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "candidate pool", + "entity_type": "TASK_OR_PROBLEM", + "description": "the candidate pool refers to the set of items retrieved for ranking standardized 
across baselines", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "efficiency", + "entity_type": "CONCEPT", + "description": "efficiency is a key metric balanced against effectiveness in model selection and execution", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "effectiveness", + "entity_type": "CONCEPT", + "description": "effectiveness is a key metric balanced against efficiency in model selection and execution", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "performance deficits", + "entity_type": "CONCEPT", + "description": "performance deficits describe the failure of the 8b vlm counterpart to answer correctly", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "reproducibility", + "entity_type": "CONCEPT", + "description": "reproducibility is the goal achieved by making source code and configurations publicly available", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "fair comparison", + "entity_type": "CONCEPT", + "description": "fair comparison is the objective driving the use of unified models and standardized parameters", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "robust document layout parsing", + "entity_type": "TASK_OR_PROBLEM", + "description": "robust document layout parsing is the specific task that mineru is utilized for", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "text embedding", + "entity_type": "TASK_OR_PROBLEM", + "description": "text embedding is the task performed by the qwen3 embedding 0 6b model", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "multi modal embedding", + "entity_type": "TASK_OR_PROBLEM", + "description": "multi modal embedding is the task performed by the gme qwen2 vl 2b instruct model", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "reranking", + "entity_type": "TASK_OR_PROBLEM", + "description": "reranking is the task performed by the qwen3 reranker 4b model", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "high performance server", + 
"entity_type": "LOCATION", + "description": "the high performance server is the physical location where all experiments were conducted", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "source code", + "entity_type": "PRODUCT", + "description": "source code refers to the implementation files of bookrag made available for download", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "implementation configurations", + "entity_type": "PRODUCT", + "description": "implementation configurations refer to the detailed settings used to run the experiments", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "reference 52", + "entity_type": "PUBLICATION_VENUE", + "description": "reference 52 is the citation for the mineru tool", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "reference 4", + "entity_type": "PUBLICATION_VENUE", + "description": "reference 4 is the citation for the qwen2 5vl 30b model", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "reference 60", + "entity_type": "PUBLICATION_VENUE", + "description": "reference 60 is the citation for the qwen3 8b model", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "reference 63", + "entity_type": "PUBLICATION_VENUE", + "description": "reference 63 is the citation for the gme qwen2 vl 2b instruct model", + "source_ids": [ + 238 + ] + }, + { + "entity_name": "reference 64", + "entity_type": "PUBLICATION_VENUE", + "description": "reference 64 is the citation for the qwen3 embedding 0 6b and qwen3 reranker 4b models", + "source_ids": [ + 238 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "python", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is implemented in python", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "mineru", + "relation_name": "", + "weight": 9.0, + "description": "bookrag utilizes mineru for robust document layout parsing", + "source_ids": [ + 238 + ] + }, + { + 
"src_entity_name": "bookrag", + "tgt_entity_name": "qwen family", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is powered by models from the qwen family", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "qwen family", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "the qwen family includes llms used in the experiments", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "qwen family", + "tgt_entity_name": "vlm", + "relation_name": "", + "weight": 9.0, + "description": "the qwen family includes vlms used in the experiments", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "qwen family", + "tgt_entity_name": "embedding models", + "relation_name": "", + "weight": 9.0, + "description": "the qwen family includes embedding models used in the experiments", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "qwen family", + "tgt_entity_name": "qwen3 8b", + "relation_name": "", + "weight": 10.0, + "description": "qwen3 8b is a specific model from the qwen family used as the default llm", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "qwen family", + "tgt_entity_name": "qwen2 5vl 30b", + "relation_name": "", + "weight": 10.0, + "description": "qwen2 5vl 30b is a specific model from the qwen family used as the vlm", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "qwen family", + "tgt_entity_name": "qwen3 embedding 0 6b", + "relation_name": "", + "weight": 10.0, + "description": "qwen3 embedding 0 6b is a specific model from the qwen family used for text embedding", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "qwen family", + "tgt_entity_name": "gme qwen2 vl 2b instruct", + "relation_name": "", + "weight": 10.0, + "description": "gme qwen2 vl 2b instruct is a specific model from the qwen family used for multi modal embedding", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "qwen family", + "tgt_entity_name": "qwen3 reranker 4b", + 
"relation_name": "", + "weight": 10.0, + "description": "qwen3 reranker 4b is a specific model from the qwen family used for reranking", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "linux", + "relation_name": "", + "weight": 8.0, + "description": "experiments for bookrag were conducted on a linux operating system", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "linux", + "tgt_entity_name": "intel xeon 2 0ghz cpu", + "relation_name": "", + "weight": 9.0, + "description": "the linux operating system runs on a server equipped with an intel xeon 2 0ghz cpu", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "linux", + "tgt_entity_name": "nvidia geforce rtx a5000", + "relation_name": "", + "weight": 9.0, + "description": "the linux operating system runs on a server equipped with nvidia geforce rtx a5000 gpus", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "intel xeon 2 0ghz cpu", + "tgt_entity_name": "1024gb", + "relation_name": "", + "weight": 8.0, + "description": "the server with the intel xeon 2 0ghz cpu has 1024gb of memory", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "nvidia geforce rtx a5000", + "tgt_entity_name": "24 gb", + "relation_name": "", + "weight": 9.0, + "description": "each nvidia geforce rtx a5000 gpu has 24 gb of vram", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "500 tokens", + "relation_name": "", + "weight": 8.0, + "description": "bookrag standardizes the chunk size at 500 tokens for document chunking", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "10", + "relation_name": "", + "weight": 8.0, + "description": "bookrag sets the retrieval top k to 10 for consistent candidate pool sizes", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "10b parameter scale", + "relation_name": "", + "weight": 9.0, + "description": "bookrag 
primarily selects models under the 10b parameter scale", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "30b version", + "relation_name": "", + "weight": 9.0, + "description": "bookrag adopts the 30b version of the vlm due to performance issues with the 8b counterpart", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "8b counterpart", + "relation_name": "", + "weight": 8.0, + "description": "the 8b counterpart of the vlm exhibited significant performance deficits leading to the adoption of the 30b version", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "https github com sam234990 bookrag", + "relation_name": "", + "weight": 10.0, + "description": "the source code and configurations for bookrag are available at the specified github url", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "sam234990", + "tgt_entity_name": "https github com sam234990 bookrag", + "relation_name": "", + "weight": 10.0, + "description": "sam234990 is the owner of the github repository url", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "baseline methods", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is compared against baseline methods to ensure a fair comparison", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "qwen3 8b", + "tgt_entity_name": "ground truth images", + "relation_name": "", + "weight": 7.0, + "description": "the 8b counterpart related to qwen3 8b context failed to answer correctly even with ground truth images", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "document chunking", + "relation_name": "", + "weight": 8.0, + "description": "bookrag involves document chunking as part of its processing pipeline", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrieval ranking", + 
"relation_name": "", + "weight": 8.0, + "description": "bookrag involves retrieval ranking as part of its processing pipeline", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "sequential processing mode", + "relation_name": "", + "weight": 9.0, + "description": "bookrag methods were executed in sequential processing mode to ensure fair efficiency comparison", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "candidate pool", + "relation_name": "", + "weight": 8.0, + "description": "bookrag standardizes the candidate pool size across baselines", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "efficiency", + "relation_name": "", + "weight": 9.0, + "description": "bookrag balances efficiency and effectiveness in model selection", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "effectiveness", + "relation_name": "", + "weight": 9.0, + "description": "bookrag balances efficiency and effectiveness in model selection", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "8b counterpart", + "tgt_entity_name": "performance deficits", + "relation_name": "", + "weight": 10.0, + "description": "the 8b counterpart exhibited performance deficits", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "reproducibility", + "relation_name": "", + "weight": 9.0, + "description": "bookrag aims for reproducibility by making code and configs public", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "fair comparison", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is designed to enable a fair comparison with other methods", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "mineru", + "tgt_entity_name": "robust document layout parsing", + "relation_name": "", + "weight": 10.0, + "description": "mineru is 
utilized for robust document layout parsing", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "qwen3 embedding 0 6b", + "tgt_entity_name": "text embedding", + "relation_name": "", + "weight": 10.0, + "description": "qwen3 embedding 0 6b is used for text embedding", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "gme qwen2 vl 2b instruct", + "tgt_entity_name": "multi modal embedding", + "relation_name": "", + "weight": 10.0, + "description": "gme qwen2 vl 2b instruct is used for multi modal embedding", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "qwen3 reranker 4b", + "tgt_entity_name": "reranking", + "relation_name": "", + "weight": 10.0, + "description": "qwen3 reranker 4b is used for reranking", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "linux", + "tgt_entity_name": "high performance server", + "relation_name": "", + "weight": 9.0, + "description": "the linux operating system runs on the high performance server", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "source code", + "relation_name": "", + "weight": 10.0, + "description": "the source code for bookrag is available at the repository", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "implementation configurations", + "relation_name": "", + "weight": 10.0, + "description": "the implementation configurations for bookrag are available at the repository", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "mineru", + "tgt_entity_name": "reference 52", + "relation_name": "", + "weight": 10.0, + "description": "mineru is cited in reference 52", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "qwen2 5vl 30b", + "tgt_entity_name": "reference 4", + "relation_name": "", + "weight": 10.0, + "description": "qwen2 5vl 30b is cited in reference 4", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "qwen3 8b", + "tgt_entity_name": "reference 60", + 
"relation_name": "", + "weight": 10.0, + "description": "qwen3 8b is cited in reference 60", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "gme qwen2 vl 2b instruct", + "tgt_entity_name": "reference 63", + "relation_name": "", + "weight": 10.0, + "description": "gme qwen2 vl 2b instruct is cited in reference 63", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "qwen3 embedding 0 6b", + "tgt_entity_name": "reference 64", + "relation_name": "", + "weight": 10.0, + "description": "qwen3 embedding 0 6b is cited in reference 64", + "source_ids": [ + 238 + ] + }, + { + "src_entity_name": "qwen3 reranker 4b", + "tgt_entity_name": "reference 64", + "relation_name": "", + "weight": 10.0, + "description": "qwen3 reranker 4b is cited in reference 64", + "source_ids": [ + 238 + ] + } + ], + "node_idx": 238 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_239.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_239.json new file mode 100644 index 0000000..0d7dacc --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_239.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "a.3 prompts", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Experimental Details' within the BookRAG paper, this section details the specific text prompts engineered and utilized to guide the Retrieval-Augmented Generation (RAG) system in processing complex documents.", + "source_ids": [ + 239 + ] + }, + { + "entity_name": "prompts", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Refers to the structured input instructions provided to the language model to elicit specific behaviors or outputs, as defined in section A.3.", + "source_ids": [ + 239 + ] + } + ], + "relations": [ + { + "src_entity_name": "prompts", + "tgt_entity_name": "a.3 prompts", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Prompts' is the primary topic of section A.3.", + 
"source_ids": [ + 239 + ] + } + ], + "node_idx": 239 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_24.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_24.json new file mode 100644 index 0000000..75aade4 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_24.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "our contributions", + "entity_type": "TASK_OR_PROBLEM", + "description": "our contributions refers to the summary of work or achievements presented in the text", + "source_ids": [ + 24 + ] + } + ], + "relations": [], + "node_idx": 24 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_240.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_240.json new file mode 100644 index 0000000..a3ddbee --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_240.json @@ -0,0 +1,217 @@ +{ + "entities": [ + { + "entity_name": "agent based query classification", + "entity_type": "TASK_OR_PROBLEM", + "description": "agent based query classification is a task for which prompts are designed as illustrated in figure 10", + "source_ids": [ + 240 + ] + }, + { + "entity_name": "question decomposition", + "entity_type": "TASK_OR_PROBLEM", + "description": "question decomposition is a task for which prompts are designed as illustrated in figure 11", + "source_ids": [ + 240 + ] + }, + { + "entity_name": "filter operator generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "filter operator generation is a task for which prompts are designed as illustrated in figure 12", + "source_ids": [ + 240 + ] + }, + { + "entity_name": "entity resolution judgment", + "entity_type": "TASK_OR_PROBLEM", + "description": "entity resolution judgment is a task for which a prompt is employed during the graph construction phase as illustrated in figure 13", + "source_ids": [ + 240 + ] + }, + { + "entity_name": "graph construction phase", + 
"entity_type": "TASK_OR_PROBLEM", + "description": "graph construction phase is the specific phase during which entity resolution judgment is performed", + "source_ids": [ + 240 + ] + }, + { + "entity_name": "prompts", + "entity_type": "PRODUCT", + "description": "prompts are the specific designed items mentioned in the text for various tasks", + "source_ids": [ + 240 + ] + }, + { + "entity_name": "figure 10", + "entity_type": "IMAGE", + "description": "figure 10 is a visual element illustrating prompts for agent based query classification", + "source_ids": [ + 240 + ] + }, + { + "entity_name": "figure 11", + "entity_type": "IMAGE", + "description": "figure 11 is a visual element illustrating prompts for question decomposition", + "source_ids": [ + 240 + ] + }, + { + "entity_name": "figure 12", + "entity_type": "IMAGE", + "description": "figure 12 is a visual element illustrating prompts for filter operator generation", + "source_ids": [ + 240 + ] + }, + { + "entity_name": "figure 13", + "entity_type": "IMAGE", + "description": "figure 13 is a visual element illustrating the prompt for entity resolution judgment", + "source_ids": [ + 240 + ] + } + ], + "relations": [ + { + "src_entity_name": "agent based query classification", + "tgt_entity_name": "figure 10", + "relation_name": "", + "weight": 10.0, + "description": "figure 10 illustrates the prompts designed for agent based query classification", + "source_ids": [ + 240 + ] + }, + { + "src_entity_name": "question decomposition", + "tgt_entity_name": "figure 11", + "relation_name": "", + "weight": 10.0, + "description": "figure 11 illustrates the prompts designed for question decomposition", + "source_ids": [ + 240 + ] + }, + { + "src_entity_name": "filter operator generation", + "tgt_entity_name": "figure 12", + "relation_name": "", + "weight": 10.0, + "description": "figure 12 illustrates the prompts designed for filter operator generation", + "source_ids": [ + 240 + ] + }, + { + "src_entity_name": "entity 
resolution judgment", + "tgt_entity_name": "figure 13", + "relation_name": "", + "weight": 10.0, + "description": "figure 13 illustrates the prompt employed for entity resolution judgment", + "source_ids": [ + 240 + ] + }, + { + "src_entity_name": "entity resolution judgment", + "tgt_entity_name": "graph construction phase", + "relation_name": "", + "weight": 9.0, + "description": "entity resolution judgment is performed during the graph construction phase", + "source_ids": [ + 240 + ] + }, + { + "src_entity_name": "prompts", + "tgt_entity_name": "agent based query classification", + "relation_name": "", + "weight": 10.0, + "description": "prompts are designed specifically for agent based query classification", + "source_ids": [ + 240 + ] + }, + { + "src_entity_name": "prompts", + "tgt_entity_name": "question decomposition", + "relation_name": "", + "weight": 10.0, + "description": "prompts are designed specifically for question decomposition", + "source_ids": [ + 240 + ] + }, + { + "src_entity_name": "prompts", + "tgt_entity_name": "filter operator generation", + "relation_name": "", + "weight": 10.0, + "description": "prompts are designed specifically for filter operator generation", + "source_ids": [ + 240 + ] + }, + { + "src_entity_name": "prompts", + "tgt_entity_name": "entity resolution judgment", + "relation_name": "", + "weight": 10.0, + "description": "a prompt is employed for entity resolution judgment", + "source_ids": [ + 240 + ] + }, + { + "src_entity_name": "figure 10", + "tgt_entity_name": "agent based query classification", + "relation_name": "", + "weight": 10.0, + "description": "figure 10 presents the prompts for agent based query classification", + "source_ids": [ + 240 + ] + }, + { + "src_entity_name": "figure 11", + "tgt_entity_name": "question decomposition", + "relation_name": "", + "weight": 10.0, + "description": "figure 11 presents the prompts for question decomposition", + "source_ids": [ + 240 + ] + }, + { + "src_entity_name": "figure 
12", + "tgt_entity_name": "filter operator generation", + "relation_name": "", + "weight": 10.0, + "description": "figure 12 presents the prompts for filter operator generation", + "source_ids": [ + 240 + ] + }, + { + "src_entity_name": "figure 13", + "tgt_entity_name": "entity resolution judgment", + "relation_name": "", + "weight": 10.0, + "description": "figure 13 illustrates the prompt for entity resolution judgment", + "source_ids": [ + 240 + ] + } + ], + "node_idx": 240 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_241.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_241.json new file mode 100644 index 0000000..991f280 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_241.json @@ -0,0 +1,105 @@ +{ + "entities": [ + { + "entity_name": "expert query analyzer", + "entity_type": "PERSON", + "description": "an expert query analyzer is a role described as someone tasked with classifying user questions into specific categories", + "source_ids": [ + 241 + ] + }, + { + "entity_name": "simple", + "entity_type": "TASK_OR_PROBLEM", + "description": "simple is one of the three categories used to classify user questions", + "source_ids": [ + 241 + ] + }, + { + "entity_name": "complex", + "entity_type": "TASK_OR_PROBLEM", + "description": "complex is one of the three categories used to classify user questions", + "source_ids": [ + 241 + ] + }, + { + "entity_name": "global", + "entity_type": "TASK_OR_PROBLEM", + "description": "global is one of the three categories used to classify user questions", + "source_ids": [ + 241 + ] + }, + { + "entity_name": "user", + "entity_type": "PERSON", + "description": "the user is the entity whose questions are being classified by the expert query analyzer", + "source_ids": [ + 241 + ] + }, + { + "entity_name": "json object", + "entity_type": "FILE_TYPE", + "description": "the json object is the required format for the response from the expert query analyzer", + 
"source_ids": [ + 241 + ] + } + ], + "relations": [ + { + "src_entity_name": "expert query analyzer", + "tgt_entity_name": "simple", + "relation_name": "", + "weight": 9.0, + "description": "the expert query analyzer classifies questions into the simple category", + "source_ids": [ + 241 + ] + }, + { + "src_entity_name": "expert query analyzer", + "tgt_entity_name": "complex", + "relation_name": "", + "weight": 9.0, + "description": "the expert query analyzer classifies questions into the complex category", + "source_ids": [ + 241 + ] + }, + { + "src_entity_name": "expert query analyzer", + "tgt_entity_name": "global", + "relation_name": "", + "weight": 9.0, + "description": "the expert query analyzer classifies questions into the global category", + "source_ids": [ + 241 + ] + }, + { + "src_entity_name": "expert query analyzer", + "tgt_entity_name": "user", + "relation_name": "", + "weight": 8.0, + "description": "the expert query analyzer processes questions submitted by the user", + "source_ids": [ + 241 + ] + }, + { + "src_entity_name": "expert query analyzer", + "tgt_entity_name": "json object", + "relation_name": "", + "weight": 9.0, + "description": "the expert query analyzer must respond using the specified json object format", + "source_ids": [ + 241 + ] + } + ], + "node_idx": 241 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_242.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_242.json new file mode 100644 index 0000000..099e6ec --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_242.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "category definitions", + "entity_type": "SECTION_TITLE", + "description": "category definitions is the title of the section containing definitions for entity types", + "source_ids": [ + 242 + ] + } + ], + "relations": [], + "node_idx": 242 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_243.json 
b/e2e_test_output/kg_extractor_res/kg_extractor_res_243.json new file mode 100644 index 0000000..db3f957 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_243.json @@ -0,0 +1,199 @@ +{ + "entities": [ + { + "entity_name": "single hop", + "entity_type": "TASK_OR_PROBLEM", + "description": "single hop is a task type where a question can be answered by retrieving information from a single location", + "source_ids": [ + 243 + ] + }, + { + "entity_name": "question", + "entity_type": "TASK_OR_PROBLEM", + "description": "question is the item that needs to be answered in the single hop task", + "source_ids": [ + 243 + ] + }, + { + "entity_name": "information", + "entity_type": "CONCEPT", + "description": "information is the data retrieved to answer the question", + "source_ids": [ + 243 + ] + }, + { + "entity_name": "document", + "entity_type": "CONCEPT", + "description": "document is the source material containing the information", + "source_ids": [ + 243 + ] + }, + { + "entity_name": "paragraph", + "entity_type": "SECTION_TITLE", + "description": "paragraph is an example of a contiguous location within a document", + "source_ids": [ + 243 + ] + }, + { + "entity_name": "table", + "entity_type": "SECTION_TITLE", + "description": "table is an example of a contiguous location within a document", + "source_ids": [ + 243 + ] + }, + { + "entity_name": "figure", + "entity_type": "SECTION_TITLE", + "description": "figure is an example of a contiguous location within a document", + "source_ids": [ + 243 + ] + }, + { + "entity_name": "single", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 243 + ] + }, + { + "entity_name": "contiguous location", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 243 + ] + } + ], + "relations": [ + { + "src_entity_name": "single hop", + "tgt_entity_name": "single", + "relation_name": "", + "weight": 9.0, + "description": "the single hop task requires retrieving information from a single 
location", + "source_ids": [ + 243 + ] + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "contiguous location", + "relation_name": "", + "weight": 8.0, + "description": "the single hop task involves information found in a contiguous location", + "source_ids": [ + 243 + ] + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 7.0, + "description": "the single hop task is defined within the context of a document", + "source_ids": [ + 243 + ] + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "question", + "relation_name": "", + "weight": 10.0, + "description": "the single hop task is defined by the ability to answer a question", + "source_ids": [ + 243 + ] + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "information", + "relation_name": "", + "weight": 10.0, + "description": "the single hop task involves retrieving information", + "source_ids": [ + 243 + ] + }, + { + "src_entity_name": "information", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 9.0, + "description": "information is retrieved from the document", + "source_ids": [ + 243 + ] + }, + { + "src_entity_name": "paragraph", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 8.0, + "description": "a paragraph is a part of a document", + "source_ids": [ + 243 + ] + }, + { + "src_entity_name": "table", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 8.0, + "description": "a table is a part of a document", + "source_ids": [ + 243 + ] + }, + { + "src_entity_name": "figure", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 8.0, + "description": "a figure is a part of a document", + "source_ids": [ + 243 + ] + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "paragraph", + "relation_name": "", + "weight": 9.0, + "description": "a single hop question can be answered by retrieving information from a paragraph", + "source_ids": 
[ + 243 + ] + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "table", + "relation_name": "", + "weight": 9.0, + "description": "a single hop question can be answered by retrieving information from a table", + "source_ids": [ + 243 + ] + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "figure", + "relation_name": "", + "weight": 9.0, + "description": "a single hop question can be answered by retrieving information from a figure", + "source_ids": [ + 243 + ] + } + ], + "node_idx": 243 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_244.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_244.json new file mode 100644 index 0000000..50ac9aa --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_244.json @@ -0,0 +1,5 @@ +{ + "entities": [], + "relations": [], + "node_idx": 244 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_245.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_245.json new file mode 100644 index 0000000..aa39832 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_245.json @@ -0,0 +1,25 @@ +{ + "entities": [ + { + "entity_name": "figure 2", + "entity_type": "IMAGE", + "description": "figure 2 is an image referenced in the text with its title being the subject of a question", + "source_ids": [ + 245 + ] + } + ], + "relations": [ + { + "src_entity_name": "figure 2", + "tgt_entity_name": "figure 2", + "relation_name": "", + "weight": 5.0, + "description": "the text asks for the title of figure 2 indicating a self referential query about the entity s attribute", + "source_ids": [ + 245 + ] + } + ], + "node_idx": 245 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_246.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_246.json new file mode 100644 index 0000000..4e38c25 --- /dev/null +++ 
b/e2e_test_output/kg_extractor_res/kg_extractor_res_246.json @@ -0,0 +1,79 @@ +{ + "entities": [ + { + "entity_name": "5", + "entity_type": "PERCENTAGE", + "description": "5 represents a specific portion of the latino population mentioned in the context of economic upward mobility", + "source_ids": [ + 246 + ] + }, + { + "entity_name": "latinos", + "entity_type": "NATIONALITY", + "description": "latinos are the demographic group whose views on economic upward mobility for their children are being queried", + "source_ids": [ + 246 + ] + }, + { + "entity_name": "economic upward mobility", + "entity_type": "TASK_OR_PROBLEM", + "description": "economic upward mobility is the specific issue regarding the children of latinos that is the subject of the inquiry", + "source_ids": [ + 246 + ] + }, + { + "entity_name": "children", + "entity_type": "PERSON", + "description": "children are the offspring of the latinos whose economic upward mobility is being discussed", + "source_ids": [ + 246 + ] + } + ], + "relations": [ + { + "src_entity_name": "5", + "tgt_entity_name": "latinos", + "relation_name": "", + "weight": 9.0, + "description": "the percentage 5 specifically refers to a subset of the latino population", + "source_ids": [ + 246 + ] + }, + { + "src_entity_name": "latinos", + "tgt_entity_name": "economic upward mobility", + "relation_name": "", + "weight": 10.0, + "description": "latinos are the group whose perspective on economic upward mobility for their children is being examined", + "source_ids": [ + 246 + ] + }, + { + "src_entity_name": "latinos", + "tgt_entity_name": "children", + "relation_name": "", + "weight": 10.0, + "description": "the children belong to the latino demographic group mentioned in the text", + "source_ids": [ + 246 + ] + }, + { + "src_entity_name": "economic upward mobility", + "tgt_entity_name": "children", + "relation_name": "", + "weight": 9.0, + "description": "economic upward mobility is the specific attribute or outcome being considered 
for the children", + "source_ids": [ + 246 + ] + } + ], + "node_idx": 246 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_247.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_247.json new file mode 100644 index 0000000..798e176 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_247.json @@ -0,0 +1,25 @@ +{ + "entities": [ + { + "entity_name": "multi hop", + "entity_type": "TASK_OR_PROBLEM", + "description": "multi hop is a task that requires decomposition into multiple simple sub questions", + "source_ids": [ + 247 + ] + } + ], + "relations": [ + { + "src_entity_name": "multi hop", + "tgt_entity_name": "multi hop", + "relation_name": "", + "weight": 10.0, + "description": "the entity multi hop is described as requiring decomposition into sub questions indicating its nature as a task", + "source_ids": [ + 247 + ] + } + ], + "node_idx": 247 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_248.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_248.json new file mode 100644 index 0000000..c01c8d2 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_248.json @@ -0,0 +1,5 @@ +{ + "entities": [], + "relations": [], + "node_idx": 248 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_249.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_249.json new file mode 100644 index 0000000..e6c976d --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_249.json @@ -0,0 +1,25 @@ +{ + "entities": [ + { + "entity_name": "personality vector", + "entity_type": "TASK_OR_PROBLEM", + "description": "the personality vector is a concept mentioned in a question regarding its color indicating it is a complex retrieval task", + "source_ids": [ + 249 + ] + } + ], + "relations": [ + { + "src_entity_name": "personality vector", + "tgt_entity_name": "personality vector", + "relation_name": "", + 
"weight": 5.0, + "description": "the entity is the subject of a question asking for its color implying a self referential query about its attributes", + "source_ids": [ + 249 + ] + } + ], + "node_idx": 249 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_25.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_25.json new file mode 100644 index 0000000..fea3f8b --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_25.json @@ -0,0 +1,145 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "bookrag is a novel method introduced in the text that constructs a document native bookindex", + "source_ids": [ + 25 + ] + }, + { + "entity_name": "bookindex", + "entity_type": "PRODUCT", + "description": "bookindex is a document native index constructed by the bookrag method", + "source_ids": [ + 25 + ] + }, + { + "entity_name": "hierarchical tree", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "a hierarchical tree of document layout blocks is integrated by bookrag to construct the bookindex", + "source_ids": [ + 25 + ] + }, + { + "entity_name": "kg", + "entity_type": "SOFTWARE", + "description": "kg refers to a knowledge graph storing fine grained entity relations used in the bookrag method", + "source_ids": [ + 25 + ] + }, + { + "entity_name": "document layout blocks", + "entity_type": "MATERIAL", + "description": "document layout blocks are the structural components of a document that are organized into a hierarchical tree", + "source_ids": [ + 25 + ] + }, + { + "entity_name": "entity relations", + "entity_type": "CONCEPT", + "description": "entity relations are the fine grained connections between entities stored within the knowledge graph", + "source_ids": [ + 25 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 10.0, + "description": "bookrag constructs 
the bookindex by integrating other components", + "source_ids": [ + 25 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "hierarchical tree", + "relation_name": "", + "weight": 9.0, + "description": "bookrag integrates a hierarchical tree of document layout blocks", + "source_ids": [ + 25 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "kg", + "relation_name": "", + "weight": 9.0, + "description": "bookrag integrates a kg storing fine grained entity relations", + "source_ids": [ + 25 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "hierarchical tree", + "relation_name": "", + "weight": 8.0, + "description": "the bookindex is constructed using a hierarchical tree of document layout blocks", + "source_ids": [ + 25 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "kg", + "relation_name": "", + "weight": 8.0, + "description": "the bookindex is constructed using a kg storing fine grained entity relations", + "source_ids": [ + 25 + ] + }, + { + "src_entity_name": "hierarchical tree", + "tgt_entity_name": "document layout blocks", + "relation_name": "", + "weight": 10.0, + "description": "the hierarchical tree is composed of document layout blocks", + "source_ids": [ + 25 + ] + }, + { + "src_entity_name": "kg", + "tgt_entity_name": "entity relations", + "relation_name": "", + "weight": 10.0, + "description": "the kg stores fine grained entity relations", + "source_ids": [ + 25 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "document layout blocks", + "relation_name": "", + "weight": 8.0, + "description": "bookrag integrates document layout blocks via a hierarchical tree", + "source_ids": [ + 25 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "entity relations", + "relation_name": "", + "weight": 8.0, + "description": "bookrag utilizes a kg that stores entity relations", + "source_ids": [ + 25 + ] + } + ], + "node_idx": 25 +} \ No newline at end of file diff --git 
a/e2e_test_output/kg_extractor_res/kg_extractor_res_250.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_250.json new file mode 100644 index 0000000..f9c8a72 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_250.json @@ -0,0 +1,153 @@ +{ + "entities": [ + { + "entity_name": "global", + "entity_type": "TASK_OR_PROBLEM", + "description": "global refers to a type of question requiring an aggregation operation over a set of items identified by a structural filter", + "source_ids": [ + 250 + ] + }, + { + "entity_name": "counting", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "counting is an example of an aggregation operation mentioned in the text", + "source_ids": [ + 250 + ] + }, + { + "entity_name": "listing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "listing is an example of an aggregation operation mentioned in the text", + "source_ids": [ + 250 + ] + }, + { + "entity_name": "summarizing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "summarizing is an example of an aggregation operation mentioned in the text", + "source_ids": [ + 250 + ] + }, + { + "entity_name": "structural filter", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "structural filter is a clear filter used to identify items in the set for the global question", + "source_ids": [ + 250 + ] + }, + { + "entity_name": "aggregation operation", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 250 + ] + }, + { + "entity_name": "items", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 250 + ] + } + ], + "relations": [ + { + "src_entity_name": "global", + "tgt_entity_name": "aggregation operation", + "relation_name": "", + "weight": 8.0, + "description": "the global task requires an aggregation operation such as counting listing or summarizing", + "source_ids": [ + 250 + ] + }, + { + "src_entity_name": "global", + "tgt_entity_name": "counting", + "relation_name": "", + "weight": 9.0, + 
"description": "the global task includes counting as a possible aggregation operation", + "source_ids": [ + 250 + ] + }, + { + "src_entity_name": "global", + "tgt_entity_name": "listing", + "relation_name": "", + "weight": 9.0, + "description": "the global task includes listing as a possible aggregation operation", + "source_ids": [ + 250 + ] + }, + { + "src_entity_name": "global", + "tgt_entity_name": "summarizing", + "relation_name": "", + "weight": 9.0, + "description": "the global task includes summarizing as a possible aggregation operation", + "source_ids": [ + 250 + ] + }, + { + "src_entity_name": "global", + "tgt_entity_name": "structural filter", + "relation_name": "", + "weight": 9.0, + "description": "the global task identifies items using a clear structural filter", + "source_ids": [ + 250 + ] + }, + { + "src_entity_name": "counting", + "tgt_entity_name": "aggregation operation", + "relation_name": "", + "weight": 10.0, + "description": "counting is explicitly described as an aggregation operation", + "source_ids": [ + 250 + ] + }, + { + "src_entity_name": "listing", + "tgt_entity_name": "aggregation operation", + "relation_name": "", + "weight": 10.0, + "description": "listing is explicitly described as an aggregation operation", + "source_ids": [ + 250 + ] + }, + { + "src_entity_name": "summarizing", + "tgt_entity_name": "aggregation operation", + "relation_name": "", + "weight": 10.0, + "description": "summarizing is explicitly described as an aggregation operation", + "source_ids": [ + 250 + ] + }, + { + "src_entity_name": "structural filter", + "tgt_entity_name": "items", + "relation_name": "", + "weight": 8.0, + "description": "the structural filter is used to identify the set of items", + "source_ids": [ + 250 + ] + } + ], + "node_idx": 250 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_251.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_251.json new file mode 100644 index 0000000..b2c0c50 
--- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_251.json @@ -0,0 +1,51 @@ +{ + "entities": [ + { + "entity_name": "example", + "entity_type": "TASK_OR_PROBLEM", + "description": "example is a task or problem asking how many tables are in the document", + "source_ids": [ + 251 + ] + }, + { + "entity_name": "global", + "entity_type": "CONCEPT", + "description": "global refers to a process that filters for all items of a specific type in this case table", + "source_ids": [ + 251 + ] + }, + { + "entity_name": "table", + "entity_type": "PRODUCT", + "description": "table is the specific item type being filtered for in the document", + "source_ids": [ + 251 + ] + } + ], + "relations": [ + { + "src_entity_name": "example", + "tgt_entity_name": "global", + "relation_name": "", + "weight": 9.0, + "description": "the example task is defined by the global process of filtering for tables", + "source_ids": [ + 251 + ] + }, + { + "src_entity_name": "global", + "tgt_entity_name": "table", + "relation_name": "", + "weight": 10.0, + "description": "the global process specifically targets and filters for items of type table", + "source_ids": [ + 251 + ] + } + ], + "node_idx": 251 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_252.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_252.json new file mode 100644 index 0000000..2eda525 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_252.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "user query", + "entity_type": "TASK_OR_PROBLEM", + "description": "user query is a task or problem mentioned in the text representing a request for information or action", + "source_ids": [ + 252 + ] + } + ], + "relations": [], + "node_idx": 252 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_253.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_253.json new file mode 100644 index 
0000000..d09bbea --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_253.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "figure 10", + "entity_type": "IMAGE", + "description": "figure 10 is an image in the text that displays a prompt for query classification", + "source_ids": [ + 253 + ] + }, + { + "entity_name": "query classification", + "entity_type": "TASK_OR_PROBLEM", + "description": "query classification is the task or problem for which the prompt in figure 10 is designed", + "source_ids": [ + 253 + ] + } + ], + "relations": [ + { + "src_entity_name": "figure 10", + "tgt_entity_name": "query classification", + "relation_name": "", + "weight": 10.0, + "description": "figure 10 contains the prompt specifically used for query classification", + "source_ids": [ + 253 + ] + } + ], + "node_idx": 253 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_254.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_254.json new file mode 100644 index 0000000..7ae42d0 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_254.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "16", + "entity_type": "MEASUREMENT", + "description": "16 is a numerical value mentioned in the text potentially representing a count date or measurement", + "source_ids": [ + 254 + ] + } + ], + "relations": [], + "node_idx": 254 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_255.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_255.json new file mode 100644 index 0000000..8654076 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_255.json @@ -0,0 +1,233 @@ +{ + "entities": [ + { + "entity_name": "user a2gbifl43u1lkj", + "entity_type": "PERSON", + "description": "user a2gbifl43u1lkj is a specific user referenced in the example query regarding personality vectors and receptiviti scores", + "source_ids": [ + 255 + ] + }, + { + 
"entity_name": "foreign born latinos", + "entity_type": "PERSON", + "description": "foreign born latinos are a demographic group mentioned in the example query regarding population surveys", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "latinos interviewed by cellphone", + "entity_type": "PERSON", + "description": "latinos interviewed by cellphone are a demographic group mentioned in the example query regarding population surveys", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "soft labeled personality embedding matrix", + "entity_type": "PRODUCT", + "description": "the soft labeled personality embedding matrix is a data structure containing personality vectors and their associated colors", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "receptiviti score", + "entity_type": "EVALUATION_METRIC", + "description": "the receptiviti score is a metric used to evaluate personality vectors in the context of the example query", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "population", + "entity_type": "MEASUREMENT", + "description": "population refers to the count of individuals in a specific demographic group within a survey", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "query decomposition expert", + "entity_type": "PROFESSION", + "description": "the query decomposition expert is the role assigned to the ai to break down complex questions into atomic sub questions", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "complex question", + "entity_type": "TASK_OR_PROBLEM", + "description": "a complex question is the input task that needs to be broken down into simple sub questions", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "simple atomic sub questions", + "entity_type": "TASK_OR_PROBLEM", + "description": "simple atomic sub questions are the output components of the decomposition process each being a direct information retrieval task", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "retrieval sub question", 
+ "entity_type": "TASK_OR_PROBLEM", + "description": "a retrieval sub question is a specific type of sub question that requires looking up a specific fact number or value in the document", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "synthesis question", + "entity_type": "TASK_OR_PROBLEM", + "description": "a synthesis question is a specific type of sub question that requires comparing calculating or combining answers from previous retrieval questions", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "json object", + "entity_type": "FILE_TYPE", + "description": "the json object is the required format for the response containing a single key sub questions with a list of objects", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "sub questions", + "entity_type": "SECTION_TITLE", + "description": "the sub questions key is the container within the json object that holds the list of decomposed questions", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "question", + "entity_type": "SECTION_TITLE", + "description": "the question key within each sub question object holds the string of the actual question", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "type", + "entity_type": "SECTION_TITLE", + "description": "the type key within each sub question object specifies whether the question is retrieval or synthesis", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "user query", + "entity_type": "TASK_OR_PROBLEM", + "description": "the user query is the final input provided in the real data section which is the word query itself", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "example 1", + "entity_type": "EVENT", + "description": "example 1 is a demonstration of correct decomposition with independent lookups provided in the text", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "example 2", + "entity_type": "EVENT", + "description": "example 2 is a demonstration of decomposition with retrieval and synthesis steps provided 
in the text", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "personality vector", + "entity_type": "PRODUCT", + "description": "a personality vector is a data element within the soft labeled personality embedding matrix", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "color", + "entity_type": "COLOR", + "description": "color is an attribute mapped to personality vectors in the soft labeled personality embedding matrix", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "survey", + "entity_type": "EVENT", + "description": "the survey is the context in which population data for latinos is collected in example 2", + "source_ids": [ + 255 + ] + }, + { + "entity_name": "report", + "entity_type": "BOOK", + "description": "the report is the document referenced in example 2 that contains population data", + "source_ids": [ + 255 + ] + } + ], + "relations": [ + { + "src_entity_name": "user a2gbifl43u1lkj", + "tgt_entity_name": "soft labeled personality embedding matrix", + "relation_name": "", + "weight": 9.0, + "description": "user a2gbifl43u1lkj is the subject for whom personality vectors are analyzed within the soft labeled personality embedding matrix", + "source_ids": [ + 255 + ] + }, + { + "src_entity_name": "user a2gbifl43u1lkj", + "tgt_entity_name": "receptiviti score", + "relation_name": "", + "weight": 9.0, + "description": "receptiviti scores are calculated for the personality vectors associated with user a2gbifl43u1lkj", + "source_ids": [ + 255 + ] + }, + { + "src_entity_name": "foreign born latinos", + "tgt_entity_name": "population", + "relation_name": "", + "weight": 8.0, + "description": "the population of foreign born latinos is a specific value sought in the survey example", + "source_ids": [ + 255 + ] + }, + { + "src_entity_name": "latinos interviewed by cellphone", + "tgt_entity_name": "population", + "relation_name": "", + "weight": 8.0, + "description": "the population of latinos interviewed by cellphone is a specific value sought 
in the survey example", + "source_ids": [ + 255 + ] + }, + { + "src_entity_name": "soft labeled personality embedding matrix", + "tgt_entity_name": "receptiviti score", + "relation_name": "", + "weight": 7.0, + "description": "the soft labeled personality embedding matrix contains personality vectors that are evaluated using receptiviti scores", + "source_ids": [ + 255 + ] + } + ], + "node_idx": 255 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_256.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_256.json new file mode 100644 index 0000000..071d0a5 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_256.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "figure 11", + "entity_type": "IMAGE", + "description": "figure 11 is an image in the text that displays a prompt for query decomposition", + "source_ids": [ + 256 + ] + }, + { + "entity_name": "query decomposition", + "entity_type": "TASK_OR_PROBLEM", + "description": "query decomposition is the task or problem for which the prompt in figure 11 is designed", + "source_ids": [ + 256 + ] + } + ], + "relations": [ + { + "src_entity_name": "figure 11", + "tgt_entity_name": "query decomposition", + "relation_name": "", + "weight": 10.0, + "description": "figure 11 contains the prompt specifically for the task of query decomposition", + "source_ids": [ + 256 + ] + } + ], + "node_idx": 256 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_257.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_257.json new file mode 100644 index 0000000..e84ed91 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_257.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "17", + "entity_type": "NUMBER", + "description": "17 is a number mentioned in the text though its specific context or role is not defined", + "source_ids": [ + 257 + ] + } + ], + "relations": [], + "node_idx": 257 
+} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_258.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_258.json new file mode 100644 index 0000000..86bcc44 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_258.json @@ -0,0 +1,521 @@ +{ + "entities": [ + { + "entity_name": "ai assistant", + "entity_type": "PERSON", + "description": "an ai assistant described as highly specialized with the function of analyzing a global query", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "global query", + "entity_type": "TASK_OR_PROBLEM", + "description": "a query that the ai assistant is designed to analyze to determine filtering steps and aggregation operations", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "json object", + "entity_type": "FILE_TYPE", + "description": "the specific output format required from the ai assistant containing filters and an operation", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "filters", + "entity_type": "TASK_OR_PROBLEM", + "description": "a list of filtering steps to be applied which can include sections images tables or pages", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "operation", + "entity_type": "TASK_OR_PROBLEM", + "description": "the final aggregation operation to be performed such as count list summarize or analyze", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "methodology", + "entity_type": "SECTION_TITLE", + "description": "a specific section title mentioned in an example query regarding data augmentation", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "paper", + "entity_type": "BOOK", + "description": "a document referenced in an example query regarding figures on specific pages", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "report", + "entity_type": "BOOK", + "description": "a document referenced in an example query regarding chapters", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "user", + 
"entity_type": "PERSON", + "description": "the user is the entity providing the query to the ai assistant", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "assistant", + "entity_type": "PERSON", + "description": "the assistant is the entity responding to the user with a json object", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "chapter", + "entity_type": "SECTION_TITLE", + "description": "a structural part of a document mentioned in the example about counting chapters", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "appendices", + "entity_type": "SECTION_TITLE", + "description": "a structural part of a document mentioned in the definition of section filters", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "references", + "entity_type": "SECTION_TITLE", + "description": "a structural part of a document mentioned in the definition of section filters", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "figures", + "entity_type": "IMAGE", + "description": "visual elements mentioned in the example query regarding counting figures", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "data augmentation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "a specific topic discussed in the methodology section in the example query", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "discussion", + "entity_type": "TASK_OR_PROBLEM", + "description": "the content regarding data augmentation that needs to be summarized", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "3 10", + "entity_type": "MEASUREMENT", + "description": "a specific page range mentioned as a filter value in the example query", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "3", + "entity_type": "MEASUREMENT", + "description": "the starting page number in the example range", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "10", + "entity_type": "MEASUREMENT", + "description": "the ending page number in the example range", + "source_ids": [ + 
258 + ] + }, + { + "entity_name": "count", + "entity_type": "TASK_OR_PROBLEM", + "description": "an aggregation operation used to count items", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "list", + "entity_type": "TASK_OR_PROBLEM", + "description": "an aggregation operation used to list items", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "summarize", + "entity_type": "TASK_OR_PROBLEM", + "description": "an aggregation operation used to summarize content", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "analyze", + "entity_type": "TASK_OR_PROBLEM", + "description": "an aggregation operation used to analyze content", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "section", + "entity_type": "SECTION_TITLE", + "description": "a filter type used for structural parts like chapters", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "image", + "entity_type": "IMAGE", + "description": "a filter type used for visual elements", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "table", + "entity_type": "TABLE", + "description": "a filter type used for tabular data", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "page", + "entity_type": "MEASUREMENT", + "description": "a filter type used for specific page numbers", + "source_ids": [ + 258 + ] + }, + { + "entity_name": "null", + "entity_type": "TASK_OR_PROBLEM", + "description": "a value indicating that no specific value is provided for image or table filters", + "source_ids": [ + 258 + ] + } + ], + "relations": [ + { + "src_entity_name": "ai assistant", + "tgt_entity_name": "global query", + "relation_name": "", + "weight": 10.0, + "description": "the ai assistant s function is to analyze the global query", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "ai assistant", + "tgt_entity_name": "json object", + "relation_name": "", + "weight": 10.0, + "description": "the ai assistant must return a single valid json object as its output", + "source_ids": [ + 258 + ] + }, 
+ { + "src_entity_name": "ai assistant", + "tgt_entity_name": "filters", + "relation_name": "", + "weight": 9.0, + "description": "the ai assistant must determine the list of filters to apply", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "ai assistant", + "tgt_entity_name": "operation", + "relation_name": "", + "weight": 9.0, + "description": "the ai assistant must determine the final aggregation operation", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "filters", + "tgt_entity_name": "section", + "relation_name": "", + "weight": 8.0, + "description": "filters can be of type section to target structural parts like chapters or appendices", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "filters", + "tgt_entity_name": "image", + "relation_name": "", + "weight": 8.0, + "description": "filters can be of type image to target visual elements", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "filters", + "tgt_entity_name": "table", + "relation_name": "", + "weight": 8.0, + "description": "filters can be of type table to target tabular data", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "filters", + "tgt_entity_name": "page", + "relation_name": "", + "weight": 8.0, + "description": "filters can be of type page to target specific page numbers", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "operation", + "tgt_entity_name": "count", + "relation_name": "", + "weight": 7.0, + "description": "count is one of the possible operations for aggregation", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "operation", + "tgt_entity_name": "list", + "relation_name": "", + "weight": 7.0, + "description": "list is one of the possible operations for aggregation", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "operation", + "tgt_entity_name": "summarize", + "relation_name": "", + "weight": 7.0, + "description": "summarize is one of the possible operations for aggregation", + "source_ids": [ + 258 + 
] + }, + { + "src_entity_name": "operation", + "tgt_entity_name": "analyze", + "relation_name": "", + "weight": 7.0, + "description": "analyze is one of the possible operations for aggregation", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "paper", + "tgt_entity_name": "figures", + "relation_name": "", + "weight": 6.0, + "description": "the example query asks to count figures in the paper", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "paper", + "tgt_entity_name": "page", + "relation_name": "", + "weight": 6.0, + "description": "the example query specifies a page range 3 to 10 for the paper", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "report", + "tgt_entity_name": "chapter", + "relation_name": "", + "weight": 6.0, + "description": "the example query asks to count chapters in the report", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "methodology", + "tgt_entity_name": "data augmentation", + "relation_name": "", + "weight": 9.0, + "description": "the example query asks to summarize the discussion about data augmentation in the methodology section", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "user", + "tgt_entity_name": "assistant", + "relation_name": "", + "weight": 10.0, + "description": "the user sends a query to the assistant", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "assistant", + "tgt_entity_name": "user", + "relation_name": "", + "weight": 10.0, + "description": "the assistant responds to the user", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "assistant", + "tgt_entity_name": "json object", + "relation_name": "", + "weight": 10.0, + "description": "the assistant must output a json object", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "assistant", + "tgt_entity_name": "filters", + "relation_name": "", + "weight": 9.0, + "description": "the assistant determines the filters to apply", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": 
"assistant", + "tgt_entity_name": "operation", + "relation_name": "", + "weight": 9.0, + "description": "the assistant determines the operation to perform", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "section", + "tgt_entity_name": "chapter", + "relation_name": "", + "weight": 9.0, + "description": "chapters are examples of sections", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "section", + "tgt_entity_name": "appendices", + "relation_name": "", + "weight": 9.0, + "description": "appendices are examples of sections", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "section", + "tgt_entity_name": "references", + "relation_name": "", + "weight": 9.0, + "description": "references are examples of sections", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "image", + "tgt_entity_name": "figures", + "relation_name": "", + "weight": 9.0, + "description": "figures are examples of images", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "page", + "tgt_entity_name": "3 10", + "relation_name": "", + "weight": 8.0, + "description": "3 10 is an example value for a page filter", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "page", + "tgt_entity_name": "3", + "relation_name": "", + "weight": 7.0, + "description": "3 is part of the page range", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "page", + "tgt_entity_name": "10", + "relation_name": "", + "weight": 7.0, + "description": "10 is part of the page range", + "source_ids": [ + 258 + ] + }, + { + "src_entity_name": "discussion", + "tgt_entity_name": "data augmentation", + "relation_name": "", + "weight": 8.0, + "description": "the discussion is about data augmentation", + "source_ids": [ + 258 + ] + } + ], + "node_idx": 258 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_259.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_259.json new file mode 100644 index 0000000..59635df --- /dev/null +++ 
b/e2e_test_output/kg_extractor_res/kg_extractor_res_259.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "figure 12", + "entity_type": "IMAGE", + "description": "figure 12 is an image in the text that displays the prompt for filter operator generation", + "source_ids": [ + 259 + ] + }, + { + "entity_name": "filter operator generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "filter operator generation is the specific task or problem for which the prompt in figure 12 is designed", + "source_ids": [ + 259 + ] + } + ], + "relations": [ + { + "src_entity_name": "figure 12", + "tgt_entity_name": "filter operator generation", + "relation_name": "", + "weight": 10.0, + "description": "figure 12 contains the prompt used for filter operator generation", + "source_ids": [ + 259 + ] + } + ], + "node_idx": 259 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_26.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_26.json new file mode 100644 index 0000000..6cb04e7 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_26.json @@ -0,0 +1,115 @@ +{ + "entities": [ + { + "entity_name": "agent based retrieval", + "entity_type": "TASK_OR_PROBLEM", + "description": "agent based retrieval is a proposed approach for dynamically classifying queries and configuring retrieval workflows", + "source_ids": [ + 26 + ] + }, + { + "entity_name": "information foraging theory", + "entity_type": "SCIENTIFIC_THEORY", + "description": "information foraging theory is the theoretical inspiration behind the proposed agent based retrieval approach", + "source_ids": [ + 26 + ] + }, + { + "entity_name": "queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "queries are the inputs that the agent based retrieval approach dynamically classifies", + "source_ids": [ + 26 + ] + }, + { + "entity_name": "retrieval workflows", + "entity_type": "TASK_OR_PROBLEM", + "description": "retrieval workflows are the 
processes configured by the approach to locate evidence", + "source_ids": [ + 26 + ] + }, + { + "entity_name": "documents", + "entity_type": "DATASET_OR_CORPUS", + "description": "documents are the source material within which highly relevant evidence is located", + "source_ids": [ + 26 + ] + }, + { + "entity_name": "evidence", + "entity_type": "TASK_OR_PROBLEM", + "description": "evidence refers to the highly relevant information sought within the documents", + "source_ids": [ + 26 + ] + } + ], + "relations": [ + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "information foraging theory", + "relation_name": "", + "weight": 10.0, + "description": "the agent based retrieval approach is explicitly inspired by information foraging theory", + "source_ids": [ + 26 + ] + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "queries", + "relation_name": "", + "weight": 9.0, + "description": "the agent based retrieval approach dynamically classifies queries", + "source_ids": [ + 26 + ] + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "retrieval workflows", + "relation_name": "", + "weight": 9.0, + "description": "the agent based retrieval approach configures optimal retrieval workflows", + "source_ids": [ + 26 + ] + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "documents", + "relation_name": "", + "weight": 8.0, + "description": "the approach operates within documents to locate evidence", + "source_ids": [ + 26 + ] + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "evidence", + "relation_name": "", + "weight": 10.0, + "description": "the goal of the approach is to locate highly relevant evidence within documents", + "source_ids": [ + 26 + ] + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "agent based retrieval", + "relation_name": "", + "weight": 10.0, + "description": "information foraging theory serves as the 
inspiration for the agent based retrieval approach", + "source_ids": [ + 26 + ] + } + ], + "node_idx": 26 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_260.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_260.json new file mode 100644 index 0000000..19c401c --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_260.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "18", + "entity_type": "NUMBER", + "description": "18 is a number mentioned in the text though its specific context or meaning is not provided", + "source_ids": [ + 260 + ] + } + ], + "relations": [], + "node_idx": 260 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_261.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_261.json new file mode 100644 index 0000000..f762b10 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_261.json @@ -0,0 +1,5 @@ +{ + "entities": [], + "relations": [], + "node_idx": 261 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_262.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_262.json new file mode 100644 index 0000000..4b2f904 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_262.json @@ -0,0 +1,217 @@ +{ + "entities": [ + { + "entity_name": "entity resolution adjudicator", + "entity_type": "PERSON", + "description": "entity resolution adjudicator is an expert role tasked with determining if a new entity refers to the same real world concept as candidate entities", + "source_ids": [ + 262 + ] + }, + { + "entity_name": "new entity", + "entity_type": "TASK_OR_PROBLEM", + "description": "new entity is a recently extracted entity from a text that needs to be matched against candidate entities", + "source_ids": [ + 262 + ] + }, + { + "entity_name": "candidate entities", + "entity_type": "TASK_OR_PROBLEM", + "description": "candidate entities are a list of 
semantically similar entities retrieved from an existing knowledge base for comparison", + "source_ids": [ + 262 + ] + }, + { + "entity_name": "knowledge graph", + "entity_type": "TASK_OR_PROBLEM", + "description": "knowledge graph is the existing database from which candidate entities are retrieved", + "source_ids": [ + 262 + ] + }, + { + "entity_name": "knowledge base", + "entity_type": "TASK_OR_PROBLEM", + "description": "knowledge base is the source of semantically similar candidate entities", + "source_ids": [ + 262 + ] + }, + { + "entity_name": "json object", + "entity_type": "FILE_TYPE", + "description": "json object is the required format for the output containing the id of the matching candidate and an explanation", + "source_ids": [ + 262 + ] + }, + { + "entity_name": "id", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "id is a unique identifier used to reference candidate entities in the output", + "source_ids": [ + 262 + ] + }, + { + "entity_name": "1", + "entity_type": "VALUE", + "description": "1 is a specific value indicating that no matching candidate was found for the new entity", + "source_ids": [ + 262 + ] + }, + { + "entity_name": "text", + "entity_type": "DATASET_OR_CORPUS", + "description": "text is the source material from which the new entity is recently extracted", + "source_ids": [ + 262 + ] + }, + { + "entity_name": "explanation", + "entity_type": "TASK_OR_PROBLEM", + "description": "explanation is a brief description required in the output to justify the decision", + "source_ids": [ + 262 + ] + } + ], + "relations": [ + { + "src_entity_name": "entity resolution adjudicator", + "tgt_entity_name": "new entity", + "relation_name": "", + "weight": 10.0, + "description": "the entity resolution adjudicator evaluates the new entity to find a match", + "source_ids": [ + 262 + ] + }, + { + "src_entity_name": "entity resolution adjudicator", + "tgt_entity_name": "candidate entities", + "relation_name": "", + "weight": 10.0, + 
"description": "the entity resolution adjudicator compares the new entity against the candidate entities", + "source_ids": [ + 262 + ] + }, + { + "src_entity_name": "new entity", + "tgt_entity_name": "candidate entities", + "relation_name": "", + "weight": 9.0, + "description": "the new entity is compared against the candidate entities to determine if they refer to the same concept", + "source_ids": [ + 262 + ] + }, + { + "src_entity_name": "candidate entities", + "tgt_entity_name": "knowledge graph", + "relation_name": "", + "weight": 8.0, + "description": "candidate entities are retrieved from the knowledge graph", + "source_ids": [ + 262 + ] + }, + { + "src_entity_name": "candidate entities", + "tgt_entity_name": "knowledge base", + "relation_name": "", + "weight": 8.0, + "description": "candidate entities are retrieved from the knowledge base", + "source_ids": [ + 262 + ] + }, + { + "src_entity_name": "entity resolution adjudicator", + "tgt_entity_name": "json object", + "relation_name": "", + "weight": 9.0, + "description": "the entity resolution adjudicator must output the result in a json object format", + "source_ids": [ + 262 + ] + }, + { + "src_entity_name": "entity resolution adjudicator", + "tgt_entity_name": "id", + "relation_name": "", + "weight": 9.0, + "description": "the entity resolution adjudicator outputs the id of the matching candidate", + "source_ids": [ + 262 + ] + }, + { + "src_entity_name": "entity resolution adjudicator", + "tgt_entity_name": "1", + "relation_name": "", + "weight": 9.0, + "description": "the entity resolution adjudicator outputs 1 if no match is found", + "source_ids": [ + 262 + ] + }, + { + "src_entity_name": "entity resolution adjudicator", + "tgt_entity_name": "explanation", + "relation_name": "", + "weight": 9.0, + "description": "the entity resolution adjudicator provides an explanation for the decision", + "source_ids": [ + 262 + ] + }, + { + "src_entity_name": "new entity", + "tgt_entity_name": "text", + 
"relation_name": "", + "weight": 10.0, + "description": "the new entity is extracted from the text", + "source_ids": [ + 262 + ] + }, + { + "src_entity_name": "candidate entities", + "tgt_entity_name": "id", + "relation_name": "", + "weight": 10.0, + "description": "each candidate entity has a unique id for reference", + "source_ids": [ + 262 + ] + }, + { + "src_entity_name": "json object", + "tgt_entity_name": "id", + "relation_name": "", + "weight": 8.0, + "description": "the json object contains the id of the matching candidate", + "source_ids": [ + 262 + ] + }, + { + "src_entity_name": "json object", + "tgt_entity_name": "explanation", + "relation_name": "", + "weight": 8.0, + "description": "the json object contains the explanation for the decision", + "source_ids": [ + 262 + ] + } + ], + "node_idx": 262 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_263.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_263.json new file mode 100644 index 0000000..c001e91 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_263.json @@ -0,0 +1,5 @@ +{ + "entities": [], + "relations": [], + "node_idx": 263 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_264.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_264.json new file mode 100644 index 0000000..a4bc714 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_264.json @@ -0,0 +1,5 @@ +{ + "entities": [], + "relations": [], + "node_idx": 264 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_265.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_265.json new file mode 100644 index 0000000..902e60f --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_265.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "new entity", + "entity_type": "TASK_OR_PROBLEM", + "description": "new entity is a task or problem described as 
needing analysis of its name type and description", + "source_ids": [ + 265 + ] + } + ], + "relations": [], + "node_idx": 265 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_266.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_266.json new file mode 100644 index 0000000..bbe7ed3 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_266.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "field by field adjudication", + "entity_type": "TASK_OR_PROBLEM", + "description": "field by field adjudication is a task described as a method to determine a match by evaluating each field with a specific focus", + "source_ids": [ + 266 + ] + } + ], + "relations": [], + "node_idx": 266 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_267.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_267.json new file mode 100644 index 0000000..ef90589 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_267.json @@ -0,0 +1,91 @@ +{ + "entities": [ + { + "entity_name": "entity name", + "entity_type": "TASK_OR_PROBLEM", + "description": "entity name is a placeholder term used to denote the name of an entity in the context of matching criteria", + "source_ids": [ + 267 + ] + }, + { + "entity_name": "llm", + "entity_type": "TECHNOLOGY", + "description": "llm is mentioned as an abbreviation for large language model in the context of entity name matching", + "source_ids": [ + 267 + ] + }, + { + "entity_name": "large language model", + "entity_type": "TECHNOLOGY", + "description": "large language model is the full form of the abbreviation llm used as an example of a direct abbreviation match", + "source_ids": [ + 267 + ] + }, + { + "entity_name": "event detection", + "entity_type": "TASK_OR_PROBLEM", + "description": "event detection is a task mentioned as a distinct concept that should not be matched with named entity recognition", + "source_ids": [ + 
267 + ] + }, + { + "entity_name": "named entity recognition", + "entity_type": "TASK_OR_PROBLEM", + "description": "named entity recognition is a task mentioned as a distinct concept that should not be matched with event detection", + "source_ids": [ + 267 + ] + }, + { + "entity_name": "high importance", + "entity_type": "CONCEPT", + "description": "high importance is a criterion mentioned for determining the similarity of entity names", + "source_ids": [ + 267 + ] + }, + { + "entity_name": "alias", + "entity_type": "CONCEPT", + "description": "alias is a concept mentioned as a valid form of similarity for entity names alongside direct abbreviations", + "source_ids": [ + 267 + ] + }, + { + "entity_name": "distinct concepts", + "entity_type": "CONCEPT", + "description": "distinct concepts refers to parallel concepts that are explicitly excluded from being considered a match", + "source_ids": [ + 267 + ] + } + ], + "relations": [ + { + "src_entity_name": "llm", + "tgt_entity_name": "large language model", + "relation_name": "", + "weight": 10.0, + "description": "llm is a direct abbreviation for large language model", + "source_ids": [ + 267 + ] + }, + { + "src_entity_name": "event detection", + "tgt_entity_name": "named entity recognition", + "relation_name": "", + "weight": 9.0, + "description": "event detection and named entity recognition are distinct parallel concepts and are explicitly stated as not a match", + "source_ids": [ + 267 + ] + } + ], + "node_idx": 267 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_268.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_268.json new file mode 100644 index 0000000..a72f5e0 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_268.json @@ -0,0 +1,25 @@ +{ + "entities": [ + { + "entity_name": "entity type", + "entity_type": "TASK_OR_PROBLEM", + "description": "entity type is a task or problem described as having medium importance in the context of type 
compatibility", + "source_ids": [ + 268 + ] + } + ], + "relations": [ + { + "src_entity_name": "entity type", + "tgt_entity_name": "entity type", + "relation_name": "", + "weight": 5.0, + "description": "the entity type is described as needing to be closely related and compatible with other types such as company and organization", + "source_ids": [ + 268 + ] + } + ], + "node_idx": 268 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_269.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_269.json new file mode 100644 index 0000000..391000b --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_269.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "description", + "entity_type": "CONCEPT", + "description": "description refers to the contextual importance of text segments which may differ as they are extracted from different parts of a document", + "source_ids": [ + 269 + ] + }, + { + "entity_name": "contextual importance", + "entity_type": "CONCEPT", + "description": "contextual importance is a property of descriptions that requires looking past surface level text similarity to determine if they describe the same underlying object or concept", + "source_ids": [ + 269 + ] + } + ], + "relations": [ + { + "src_entity_name": "description", + "tgt_entity_name": "contextual importance", + "relation_name": "", + "weight": 9.0, + "description": "descriptions possess contextual importance which dictates the need to analyze them for underlying identity rather than surface similarity", + "source_ids": [ + 269 + ] + } + ], + "node_idx": 269 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_27.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_27.json new file mode 100644 index 0000000..5bccbea --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_27.json @@ -0,0 +1,133 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + 
"entity_type": "PRODUCT", + "description": "bookrag is a product that significantly outperforms existing baselines in solving complex document qa tasks", + "source_ids": [ + 27 + ] + }, + { + "entity_name": "existing baselines", + "entity_type": "PRODUCT", + "description": "existing baselines are the current methods or systems that bookrag outperforms in the experiments", + "source_ids": [ + 27 + ] + }, + { + "entity_name": "complex document qa tasks", + "entity_type": "TASK_OR_PROBLEM", + "description": "complex document qa tasks are the specific problems that bookrag is designed to solve", + "source_ids": [ + 27 + ] + }, + { + "entity_name": "extensive experiments", + "entity_type": "EVENT", + "description": "extensive experiments are the tests conducted to evaluate the performance of bookrag", + "source_ids": [ + 27 + ] + }, + { + "entity_name": "multiple benchmarks", + "entity_type": "BENCHMARK", + "description": "multiple benchmarks are the evaluation standards used in the experiments to measure performance", + "source_ids": [ + 27 + ] + }, + { + "entity_name": "state of the art performance", + "entity_type": "EVALUATION_METRIC", + "description": "state of the art performance is the high level of achievement attained by bookrag in the tasks", + "source_ids": [ + 27 + ] + }, + { + "entity_name": "competitive efficiency", + "entity_type": "EVALUATION_METRIC", + "description": "competitive efficiency is a metric indicating that bookrag maintains good efficiency while performing well", + "source_ids": [ + 27 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "existing baselines", + "relation_name": "", + "weight": 9.0, + "description": "bookrag significantly outperforms existing baselines in experiments", + "source_ids": [ + 27 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "complex document qa tasks", + "relation_name": "", + "weight": 10.0, + "description": "bookrag attains state of the art performance in 
solving complex document qa tasks", + "source_ids": [ + 27 + ] + }, + { + "src_entity_name": "extensive experiments", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "extensive experiments were performed on bookrag to demonstrate its capabilities", + "source_ids": [ + 27 + ] + }, + { + "src_entity_name": "extensive experiments", + "tgt_entity_name": "multiple benchmarks", + "relation_name": "", + "weight": 8.0, + "description": "extensive experiments were conducted on multiple benchmarks to validate results", + "source_ids": [ + 27 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "state of the art performance", + "relation_name": "", + "weight": 10.0, + "description": "bookrag attained state of the art performance as a result of the experiments", + "source_ids": [ + 27 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "competitive efficiency", + "relation_name": "", + "weight": 8.0, + "description": "bookrag maintained competitive efficiency while solving tasks", + "source_ids": [ + 27 + ] + }, + { + "src_entity_name": "multiple benchmarks", + "tgt_entity_name": "state of the art performance", + "relation_name": "", + "weight": 7.0, + "description": "the performance on multiple benchmarks showed state of the art results", + "source_ids": [ + 27 + ] + } + ], + "node_idx": 27 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_270.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_270.json new file mode 100644 index 0000000..fedc16c --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_270.json @@ -0,0 +1,25 @@ +{ + "entities": [ + { + "entity_name": "be strict and conservative", + "entity_type": "TASK_OR_PROBLEM", + "description": "be strict and conservative is a guideline or instruction regarding the standard for matching emphasizing high standards to avoid corrupting the knowledge graph", + "source_ids": [ + 270 + ] + } + ], + 
"relations": [ + { + "src_entity_name": "be strict and conservative", + "tgt_entity_name": "be strict and conservative", + "relation_name": "", + "weight": 1.0, + "description": "the entity refers to itself as a guideline for maintaining high standards in matching", + "source_ids": [ + 270 + ] + } + ], + "node_idx": 270 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_271.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_271.json new file mode 100644 index 0000000..f60b579 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_271.json @@ -0,0 +1,5 @@ +{ + "entities": [], + "relations": [], + "node_idx": 271 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_272.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_272.json new file mode 100644 index 0000000..eee12d7 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_272.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "apple", + "entity_type": "PRODUCT", + "description": "apple is mentioned as an example of a fruit", + "source_ids": [ + 272 + ] + }, + { + "entity_name": "apple inc", + "entity_type": "ORGANIZATION", + "description": "apple inc is mentioned as an example of a company", + "source_ids": [ + 272 + ] + } + ], + "relations": [ + { + "src_entity_name": "apple", + "tgt_entity_name": "apple inc", + "relation_name": "", + "weight": 10.0, + "description": "both are mentioned in the text as examples to illustrate that they are not a match despite sharing the same name", + "source_ids": [ + 272 + ] + } + ], + "node_idx": 272 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_273.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_273.json new file mode 100644 index 0000000..4e125d7 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_273.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "when 
in doubt", + "entity_type": "TASK_OR_PROBLEM", + "description": "when in doubt is a condition mentioned in the text that triggers a specific output requirement", + "source_ids": [ + 273 + ] + }, + { + "entity_name": "1", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 273 + ] + } + ], + "relations": [ + { + "src_entity_name": "when in doubt", + "tgt_entity_name": "1", + "relation_name": "", + "weight": 10.0, + "description": "the text states that if the condition when in doubt is met the output must be 1", + "source_ids": [ + 273 + ] + } + ], + "node_idx": 273 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_274.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_274.json new file mode 100644 index 0000000..c99d545 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_274.json @@ -0,0 +1,25 @@ +{ + "entities": [ + { + "entity_name": "new entity", + "entity_type": "TASK_OR_PROBLEM", + "description": "new entity is a concept representing a unique entity in a knowledge graph that requires strong evidence to match with existing entities", + "source_ids": [ + 274 + ] + } + ], + "relations": [ + { + "src_entity_name": "new entity", + "tgt_entity_name": "new entity", + "relation_name": "", + "weight": 10.0, + "description": "the concept of new entity is defined by the assumption that it is unique until proven otherwise", + "source_ids": [ + 274 + ] + } + ], + "node_idx": 274 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_275.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_275.json new file mode 100644 index 0000000..0aa2dea --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_275.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "json", + "entity_type": "FILE_TYPE", + "description": "json is a file format mentioned as the required output format for the answer", + "source_ids": [ + 275 + ] + }, + { + 
"entity_name": "output", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 275 + ] + } + ], + "relations": [ + { + "src_entity_name": "json", + "tgt_entity_name": "output", + "relation_name": "", + "weight": 10.0, + "description": "the text specifies that the answer must be provided in a valid json format", + "source_ids": [ + 275 + ] + } + ], + "node_idx": 275 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_276.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_276.json new file mode 100644 index 0000000..93c863f --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_276.json @@ -0,0 +1,115 @@ +{ + "entities": [ + { + "entity_name": "select id", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "select id is a parameter representing an integer id for a candidate determined to be an exact match", + "source_ids": [ + 276 + ] + }, + { + "entity_name": "id", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "id is the identifier of the candidate determined to be an exact match", + "source_ids": [ + 276 + ] + }, + { + "entity_name": "exact match", + "entity_type": "TASK_OR_PROBLEM", + "description": "exact match refers to the condition where a candidate is determined to be identical to a reference", + "source_ids": [ + 276 + ] + }, + { + "entity_name": "1", + "entity_type": "MONEY", + "description": "1 is a specific integer value used to indicate that no exact match was found", + "source_ids": [ + 276 + ] + }, + { + "entity_name": "candidate", + "entity_type": "TASK_OR_PROBLEM", + "description": "candidate refers to an item being evaluated to determine if it is an exact match", + "source_ids": [ + 276 + ] + }, + { + "entity_name": "integer", + "entity_type": "MEASUREMENT", + "description": "integer is the data type specified for the select id value", + "source_ids": [ + 276 + ] + } + ], + "relations": [ + { + "src_entity_name": "select id", + "tgt_entity_name": "id", 
+ "relation_name": "", + "weight": 9.0, + "description": "select id is defined as the integer value of the id of the candidate", + "source_ids": [ + 276 + ] + }, + { + "src_entity_name": "select id", + "tgt_entity_name": "exact match", + "relation_name": "", + "weight": 8.0, + "description": "select id holds the value of the id if an exact match is found", + "source_ids": [ + 276 + ] + }, + { + "src_entity_name": "select id", + "tgt_entity_name": "1", + "relation_name": "", + "weight": 9.0, + "description": "select id is assigned the value 1 if no exact match is found", + "source_ids": [ + 276 + ] + }, + { + "src_entity_name": "select id", + "tgt_entity_name": "candidate", + "relation_name": "", + "weight": 9.0, + "description": "select id represents the id of the candidate being evaluated", + "source_ids": [ + 276 + ] + }, + { + "src_entity_name": "select id", + "tgt_entity_name": "integer", + "relation_name": "", + "weight": 10.0, + "description": "select id is defined as an integer type", + "source_ids": [ + 276 + ] + }, + { + "src_entity_name": "candidate", + "tgt_entity_name": "exact match", + "relation_name": "", + "weight": 8.0, + "description": "the candidate is the subject of the exact match determination", + "source_ids": [ + 276 + ] + } + ], + "node_idx": 276 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_277.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_277.json new file mode 100644 index 0000000..c3a36aa --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_277.json @@ -0,0 +1,25 @@ +{ + "entities": [ + { + "entity_name": "explanation", + "entity_type": "TASK_OR_PROBLEM", + "description": "explanation is a task or problem described as a brief one sentence string explaining reasoning for entity matching", + "source_ids": [ + 277 + ] + } + ], + "relations": [ + { + "src_entity_name": "explanation", + "tgt_entity_name": "explanation", + "relation_name": "", + "weight": 5.0, + 
"description": "the entity explanation is defined by its role in explaining reasoning for matches or differences between entities", + "source_ids": [ + 277 + ] + } + ], + "node_idx": 277 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_278.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_278.json new file mode 100644 index 0000000..6f21866 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_278.json @@ -0,0 +1,5 @@ +{ + "entities": [], + "relations": [], + "node_idx": 278 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_279.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_279.json new file mode 100644 index 0000000..d101bdc --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_279.json @@ -0,0 +1,5 @@ +{ + "entities": [], + "relations": [], + "node_idx": 279 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_28.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_28.json new file mode 100644 index 0000000..506a9ad --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_28.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "2", + "entity_type": "NUMBER", + "description": "2 is a numerical value appearing in the text though its specific context or meaning is not defined", + "source_ids": [ + 28 + ] + } + ], + "relations": [], + "node_idx": 28 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_280.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_280.json new file mode 100644 index 0000000..14f35b3 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_280.json @@ -0,0 +1,5 @@ +{ + "entities": [], + "relations": [], + "node_idx": 280 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_281.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_281.json 
new file mode 100644 index 0000000..78907a5 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_281.json @@ -0,0 +1,59 @@ +{ + "entities": [ + { + "entity_name": "select id", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "select id is a parameter or variable defined as an integer in the provided text structure", + "source_ids": [ + 281 + ] + }, + { + "entity_name": "explanation", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "explanation is a parameter or variable defined as a string in the provided text structure", + "source_ids": [ + 281 + ] + }, + { + "entity_name": "example 1", + "entity_type": "TASK_OR_PROBLEM", + "description": "example 1 is a task or problem scenario where a match was found", + "source_ids": [ + 281 + ] + }, + { + "entity_name": "example 2", + "entity_type": "TASK_OR_PROBLEM", + "description": "example 2 is a task or problem scenario where no match was found", + "source_ids": [ + 281 + ] + } + ], + "relations": [ + { + "src_entity_name": "example 1", + "tgt_entity_name": "select id", + "relation_name": "", + "weight": 5.0, + "description": "example 1 is associated with the context of the provided json structure containing select id", + "source_ids": [ + 281 + ] + }, + { + "src_entity_name": "example 2", + "tgt_entity_name": "explanation", + "relation_name": "", + "weight": 5.0, + "description": "example 2 is associated with the context of the provided json structure containing explanation", + "source_ids": [ + 281 + ] + } + ], + "node_idx": 281 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_282.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_282.json new file mode 100644 index 0000000..dce14c2 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_282.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "selection task", + "entity_type": "TASK_OR_PROBLEM", + "description": "the selection task is the activity described 
in the text that requires processing the provided data", + "source_ids": [ + 282 + ] + }, + { + "entity_name": "integer", + "entity_type": "MEASUREMENT", + "description": "an integer is the specific type of output requested for the selection task", + "source_ids": [ + 282 + ] + } + ], + "relations": [ + { + "src_entity_name": "selection task", + "tgt_entity_name": "integer", + "relation_name": "", + "weight": 9.0, + "description": "the selection task requires the output to be a single integer", + "source_ids": [ + 282 + ] + } + ], + "node_idx": 282 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_283.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_283.json new file mode 100644 index 0000000..60b7ac3 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_283.json @@ -0,0 +1,5 @@ +{ + "entities": [], + "relations": [], + "node_idx": 283 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_284.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_284.json new file mode 100644 index 0000000..c5f4728 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_284.json @@ -0,0 +1,79 @@ +{ + "entities": [ + { + "entity_name": "figure 13", + "entity_type": "IMAGE", + "description": "figure 13 is an image containing a prompt for entity resolution judgement", + "source_ids": [ + 284 + ] + }, + { + "entity_name": "entity resolution", + "entity_type": "TASK_OR_PROBLEM", + "description": "entity resolution is the task or problem addressed by the prompt in figure 13", + "source_ids": [ + 284 + ] + }, + { + "entity_name": "prompt", + "entity_type": "SOFTWARE", + "description": "the prompt is a software component used for entity resolution judgement", + "source_ids": [ + 284 + ] + }, + { + "entity_name": "examples", + "entity_type": "DATASET_OR_CORPUS", + "description": "examples are data instances that were omitted from the text due to space constraints", + 
"source_ids": [ + 284 + ] + } + ], + "relations": [ + { + "src_entity_name": "figure 13", + "tgt_entity_name": "entity resolution", + "relation_name": "", + "weight": 9.0, + "description": "figure 13 contains the prompt used for the entity resolution judgement task", + "source_ids": [ + 284 + ] + }, + { + "src_entity_name": "figure 13", + "tgt_entity_name": "prompt", + "relation_name": "", + "weight": 10.0, + "description": "figure 13 displays the prompt for entity resolution judgement", + "source_ids": [ + 284 + ] + }, + { + "src_entity_name": "prompt", + "tgt_entity_name": "entity resolution", + "relation_name": "", + "weight": 9.0, + "description": "the prompt is specifically designed for the entity resolution judgement task", + "source_ids": [ + 284 + ] + }, + { + "src_entity_name": "examples", + "tgt_entity_name": "figure 13", + "relation_name": "", + "weight": 8.0, + "description": "examples were omitted from figure 13 due to lack of space", + "source_ids": [ + 284 + ] + } + ], + "node_idx": 284 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_285.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_285.json new file mode 100644 index 0000000..2775794 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_285.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "19", + "entity_type": "NUMBER", + "description": "19 is a number mentioned in the text though its specific context or meaning is not provided", + "source_ids": [ + 285 + ] + } + ], + "relations": [], + "node_idx": 285 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_29.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_29.json new file mode 100644 index 0000000..b9e17fd --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_29.json @@ -0,0 +1,265 @@ +{ + "entities": [ + { + "entity_name": "section 2", + "entity_type": "SECTION_TITLE", + "description": "section 2 is 
the part of the text where related work is reviewed", + "source_ids": [ + 29 + ] + }, + { + "entity_name": "section 3", + "entity_type": "SECTION_TITLE", + "description": "section 3 introduces the problem formulation ift and rag workflow", + "source_ids": [ + 29 + ] + }, + { + "entity_name": "ift", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "ift is a method or technique introduced in section 3 alongside problem formulation and rag workflow", + "source_ids": [ + 29 + ] + }, + { + "entity_name": "rag", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "rag is a method or technique introduced in section 3 alongside problem formulation and ift", + "source_ids": [ + 29 + ] + }, + { + "entity_name": "section 4", + "entity_type": "SECTION_TITLE", + "description": "section 4 presents the structure of bookindex and its construction", + "source_ids": [ + 29 + ] + }, + { + "entity_name": "bookindex", + "entity_type": "PRODUCT", + "description": "bookindex is a structure presented in section 4 along with its construction details", + "source_ids": [ + 29 + ] + }, + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a system or product whose structured execution involves query classification and operators discussed in section 5", + "source_ids": [ + 29 + ] + }, + { + "entity_name": "section 6", + "entity_type": "SECTION_TITLE", + "description": "section 6 presents experimental results and detailed analysis", + "source_ids": [ + 29 + ] + }, + { + "entity_name": "section 7", + "entity_type": "SECTION_TITLE", + "description": "section 7 is where the paper concludes", + "source_ids": [ + 29 + ] + }, + { + "entity_name": "section 5", + "entity_type": "SECTION_TITLE", + "description": "section 5 is the part of the text where agent based retrieval is presented", + "source_ids": [ + 29 + ] + }, + { + "entity_name": "query classification", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "query classification is a component 
of the agent based retrieval elaborated in section 5", + "source_ids": [ + 29 + ] + }, + { + "entity_name": "operators", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "operators are used in the structured execution of bookrag as described in section 5", + "source_ids": [ + 29 + ] + }, + { + "entity_name": "structured execution", + "entity_type": "TASK_OR_PROBLEM", + "description": "structured execution refers to the process in bookrag that utilizes query classification and operators", + "source_ids": [ + 29 + ] + }, + { + "entity_name": "related work", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 29 + ] + }, + { + "entity_name": "experimental results", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 29 + ] + }, + { + "entity_name": "conclusion", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 29 + ] + } + ], + "relations": [ + { + "src_entity_name": "section 2", + "tgt_entity_name": "related work", + "relation_name": "", + "weight": 9.0, + "description": "section 2 is dedicated to reviewing related work", + "source_ids": [ + 29 + ] + }, + { + "src_entity_name": "section 3", + "tgt_entity_name": "ift", + "relation_name": "", + "weight": 10.0, + "description": "section 3 introduces the ift method", + "source_ids": [ + 29 + ] + }, + { + "src_entity_name": "section 3", + "tgt_entity_name": "rag", + "relation_name": "", + "weight": 10.0, + "description": "section 3 introduces the rag workflow", + "source_ids": [ + 29 + ] + }, + { + "src_entity_name": "section 4", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 10.0, + "description": "section 4 presents the structure and construction of bookindex", + "source_ids": [ + 29 + ] + }, + { + "src_entity_name": "section 5", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "section 5 elaborates on the execution of bookrag", + "source_ids": [ + 29 + ] + }, + { + "src_entity_name": "section 6", 
+ "tgt_entity_name": "experimental results", + "relation_name": "", + "weight": 10.0, + "description": "section 6 presents the experimental results and analysis", + "source_ids": [ + 29 + ] + }, + { + "src_entity_name": "section 7", + "tgt_entity_name": "conclusion", + "relation_name": "", + "weight": 10.0, + "description": "section 7 concludes the paper", + "source_ids": [ + 29 + ] + }, + { + "src_entity_name": "section 5", + "tgt_entity_name": "query classification", + "relation_name": "", + "weight": 10.0, + "description": "section 5 elaborates on query classification as part of agent based retrieval", + "source_ids": [ + 29 + ] + }, + { + "src_entity_name": "section 5", + "tgt_entity_name": "operators", + "relation_name": "", + "weight": 10.0, + "description": "section 5 describes the operators used in the structured execution of bookrag", + "source_ids": [ + 29 + ] + }, + { + "src_entity_name": "section 5", + "tgt_entity_name": "structured execution", + "relation_name": "", + "weight": 10.0, + "description": "section 5 presents the structured execution of bookrag", + "source_ids": [ + 29 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "structured execution", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is the system undergoing structured execution described in section 5", + "source_ids": [ + 29 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "query classification", + "relation_name": "", + "weight": 8.0, + "description": "bookrag utilizes query classification in its execution", + "source_ids": [ + 29 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "operators", + "relation_name": "", + "weight": 8.0, + "description": "bookrag uses operators in its structured execution", + "source_ids": [ + 29 + ] + } + ], + "node_idx": 29 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_3.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_3.json new file 
mode 100644 index 0000000..f9ca5f0 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_3.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "abstract", + "entity_type": "SECTION_TITLE", + "description": "As the opening section of the paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section provides a concise summary of the research problem, the proposed BookRAG solution involving hierarchical indexing and agent-based querying, and the reported state-of-the-art experimental results.", + "source_ids": [ + 3 + ] + } + ], + "relations": [], + "node_idx": 3 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_30.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_30.json new file mode 100644 index 0000000..7bc4290 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_30.json @@ -0,0 +1,51 @@ +{ + "entities": [ + { + "entity_name": "2 related work", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of the main paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section provides a comprehensive review of existing literature, specifically focusing on Retrieval-Augmented Generation (RAG) methods and their limitations regarding hierarchical document structures.", + "source_ids": [ + 30 + ] + }, + { + "entity_name": "retrieval-augmented generation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Refers to the class of techniques discussed in section 2 that enhance Large Language Models by querying external information, serving as the primary context for the related work analysis.", + "source_ids": [ + 30 + ] + }, + { + "entity_name": "hierarchical document structures", + "entity_type": "TASK_OR_PROBLEM", + "description": "Refers to the specific structural characteristics of documents (e.g., books, 
handbooks) that existing RAG approaches often overlook, which is a key problem addressed in the literature review within section 2.", + "source_ids": [ + 30 + ] + } + ], + "relations": [ + { + "src_entity_name": "retrieval-augmented generation", + "tgt_entity_name": "2 related work", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Retrieval-Augmented Generation' is a primary topic reviewed in section 2.", + "source_ids": [ + 30 + ] + }, + { + "src_entity_name": "hierarchical document structures", + "tgt_entity_name": "2 related work", + "relation_name": "", + "weight": 10.0, + "description": "The challenge of 'Hierarchical Document Structures' is a primary topic reviewed in section 2.", + "source_ids": [ + 30 + ] + } + ], + "node_idx": 30 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_31.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_31.json new file mode 100644 index 0000000..0d71eb4 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_31.json @@ -0,0 +1,89 @@ +{ + "entities": [ + { + "entity_name": "llm", + "entity_type": "TECHNOLOGY", + "description": "llm is a technology mentioned in the context of document analysis", + "source_ids": [ + 31 + ] + }, + { + "entity_name": "rag approaches", + "entity_type": "TECHNOLOGY", + "description": "rag approaches are modern representative technologies reviewed in the text", + "source_ids": [ + 31 + ] + }, + { + "entity_name": "document analysis", + "entity_type": "RESEARCH_FIELD", + "description": "document analysis is the field of study where llms and rag approaches are applied", + "source_ids": [ + 31 + ] + }, + { + "entity_name": "related works", + "entity_type": "SECTION_TITLE", + "description": "related works is the section of the text where the review of llm and rag approaches takes place", + "source_ids": [ + 31 + ] + } + ], + "relations": [ + { + "src_entity_name": "llm", + "tgt_entity_name": "rag approaches", + 
"relation_name": "", + "weight": 8.0, + "description": "both llm and rag approaches are reviewed together as related works in document analysis", + "source_ids": [ + 31 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "document analysis", + "relation_name": "", + "weight": 9.0, + "description": "llm is used in the field of document analysis", + "source_ids": [ + 31 + ] + }, + { + "src_entity_name": "rag approaches", + "tgt_entity_name": "document analysis", + "relation_name": "", + "weight": 9.0, + "description": "rag approaches are used in the field of document analysis", + "source_ids": [ + 31 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "related works", + "relation_name": "", + "weight": 8.0, + "description": "llm is reviewed within the related works section", + "source_ids": [ + 31 + ] + }, + { + "src_entity_name": "rag approaches", + "tgt_entity_name": "related works", + "relation_name": "", + "weight": 8.0, + "description": "rag approaches are reviewed within the related works section", + "source_ids": [ + 31 + ] + } + ], + "node_idx": 31 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_32.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_32.json new file mode 100644 index 0000000..4d3e7c9 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_32.json @@ -0,0 +1,411 @@ +{ + "entities": [ + { + "entity_name": "llm", + "entity_type": "TECHNOLOGY", + "description": "llm refers to large language models which are used for robust semantic reasoning in document analysis", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "html", + "entity_type": "FILE_TYPE", + "description": "html is an unstructured document format mentioned as a target for conversion into structured formats", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "pdf", + "entity_type": "FILE_TYPE", + "description": "pdf is an unstructured document format mentioned as a target for conversion into 
structured formats", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "raw text", + "entity_type": "FILE_TYPE", + "description": "raw text is an unstructured document format mentioned as a target for conversion into structured formats", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "relational tables", + "entity_type": "PRODUCT", + "description": "relational tables are structured formats that unstructured documents are converted into", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "evaporate", + "entity_type": "SOFTWARE", + "description": "evaporate is a system that utilizes llms to synthesize extraction code for converting semi structured web documents", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "lotus", + "entity_type": "SOFTWARE", + "description": "lotus is a system that extends the relational model with semantic operators for querying unstructured text corpora", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "docetl", + "entity_type": "SOFTWARE", + "description": "docetl is a system that introduces an agentic framework to optimize complex information extraction tasks", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "sql", + "entity_type": "PROGRAMMING_LANGUAGE", + "description": "sql is a query language referenced in the context of sql like queries executed by lotus", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "web documents", + "entity_type": "PRODUCT", + "description": "web documents are semi structured documents processed by systems like evaporate", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "document pages", + "entity_type": "IMAGE", + "description": "document pages are viewed as images in research to preserve layout and visual information", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "semantic operators", + "entity_type": "TECHNOLOGY", + "description": "semantic operators are features added by lotus to extend the relational model", + "source_ids": [ + 32 + ] + }, + { + "entity_name": 
"predicates", + "entity_type": "TECHNOLOGY", + "description": "predicates are llm powered functions like filter and join used in lotus", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "filter", + "entity_type": "TECHNOLOGY", + "description": "filter is an example of an llm powered predicate used in lotus", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "join", + "entity_type": "TECHNOLOGY", + "description": "join is an example of an llm powered predicate used in lotus", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "agentic framework", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "an agentic framework is introduced by docetl to optimize information extraction", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "information extraction", + "entity_type": "TASK_OR_PROBLEM", + "description": "information extraction is the complex task optimized by docetl", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "layout", + "entity_type": "CONCEPT", + "description": "layout refers to the visual structure of documents preserved when viewing pages as images", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "visual information", + "entity_type": "CONCEPT", + "description": "visual information refers to the content preserved when document pages are viewed as images", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "semi structured web documents", + "entity_type": "PRODUCT", + "description": "semi structured web documents are the input type for evaporate", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "structured databases", + "entity_type": "PRODUCT", + "description": "structured databases are the output format produced by evaporate", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "manual annotation", + "entity_type": "TASK_OR_PROBLEM", + "description": "manual annotation is a heavy process avoided by evaporate s cost effective conversion", + "source_ids": [ + 32 + ] + }, + { + "entity_name": "unstructured text corpora", 
+ "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 32 + ] + } + ], + "relations": [ + { + "src_entity_name": "llm", + "tgt_entity_name": "html", + "relation_name": "", + "weight": 9.0, + "description": "llms are used to convert html documents into structured formats", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "pdf", + "relation_name": "", + "weight": 9.0, + "description": "llms are used to convert pdf documents into structured formats", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "raw text", + "relation_name": "", + "weight": 9.0, + "description": "llms are used to convert raw text documents into structured formats", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "relational tables", + "relation_name": "", + "weight": 9.0, + "description": "llms facilitate the conversion of unstructured documents into relational tables", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "evaporate", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 10.0, + "description": "evaporate utilizes llms to synthesize extraction code", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "evaporate", + "tgt_entity_name": "web documents", + "relation_name": "", + "weight": 9.0, + "description": "evaporate converts semi structured web documents into structured databases", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "lotus", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 10.0, + "description": "lotus uses llm powered predicates to execute queries", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "lotus", + "tgt_entity_name": "sql", + "relation_name": "", + "weight": 8.0, + "description": "lotus allows users to execute sql like queries", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "lotus", + "tgt_entity_name": "unstructured text corpora", + "relation_name": "", + "weight": 9.0, + 
"description": "lotus allows queries to be executed over unstructured text corpora", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "docetl is an llm based system for optimizing information extraction tasks", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "document pages", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 8.0, + "description": "research proposes using llms to analyze document pages viewed as images", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "lotus", + "tgt_entity_name": "semantic operators", + "relation_name": "", + "weight": 10.0, + "description": "lotus extends the relational model with semantic operators", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "lotus", + "tgt_entity_name": "predicates", + "relation_name": "", + "weight": 9.0, + "description": "lotus uses llm powered predicates for querying", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "predicates", + "tgt_entity_name": "filter", + "relation_name": "", + "weight": 8.0, + "description": "filter is an example of a predicate used in lotus", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "predicates", + "tgt_entity_name": "join", + "relation_name": "", + "weight": 8.0, + "description": "join is an example of a predicate used in lotus", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "agentic framework", + "relation_name": "", + "weight": 10.0, + "description": "docetl introduces an agentic framework", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "docetl", + "tgt_entity_name": "information extraction", + "relation_name": "", + "weight": 10.0, + "description": "docetl is designed to optimize complex information extraction tasks", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "document pages", + "tgt_entity_name": "layout", + "relation_name": "", + "weight": 
9.0, + "description": "document pages are viewed as images to preserve critical layout information", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "document pages", + "tgt_entity_name": "visual information", + "relation_name": "", + "weight": 9.0, + "description": "document pages are viewed as images to preserve critical visual information", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "evaporate", + "tgt_entity_name": "semi structured web documents", + "relation_name": "", + "weight": 9.0, + "description": "evaporate converts semi structured web documents", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "evaporate", + "tgt_entity_name": "structured databases", + "relation_name": "", + "weight": 9.0, + "description": "evaporate converts documents into structured databases", + "source_ids": [ + 32 + ] + }, + { + "src_entity_name": "evaporate", + "tgt_entity_name": "manual annotation", + "relation_name": "", + "weight": 8.0, + "description": "evaporate avoids the need for heavy manual annotation", + "source_ids": [ + 32 + ] + } + ], + "node_idx": 32 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_33.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_33.json new file mode 100644 index 0000000..0663449 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_33.json @@ -0,0 +1,279 @@ +{ + "entities": [ + { + "entity_name": "rag approaches", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "rag approaches are methods proven to excel in tasks like question answering and data cleaning", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "open ended question answering", + "entity_type": "TASK_OR_PROBLEM", + "description": "open ended question answering is a task where rag approaches have been proven to excel", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "programming context", + "entity_type": "TASK_OR_PROBLEM", + "description": "programming context is a task 
where rag approaches have been proven to excel", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "sql rewrite", + "entity_type": "TASK_OR_PROBLEM", + "description": "sql rewrite is a task where rag approaches have been proven to excel", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "data cleaning", + "entity_type": "TASK_OR_PROBLEM", + "description": "data cleaning is a task where rag approaches have been proven to excel", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "naive rag technique", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the naive rag technique relies on retrieving query relevant contexts from external knowledge bases to mitigate hallucination", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "llms", + "entity_type": "TECHNOLOGY", + "description": "llms are large language models whose hallucination is mitigated by the naive rag technique", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "graph structures", + "entity_type": "TECHNOLOGY", + "description": "graph structures are adopted by many rag approaches to organize information and relationships within documents", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "agentic rag paradigm", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the agentic rag paradigm employs autonomous agents to dynamically orchestrate and refine the rag pipeline", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "autonomous agents", + "entity_type": "TECHNOLOGY", + "description": "autonomous agents are employed by the agentic rag paradigm to orchestrate and refine the pipeline", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "rag pipeline", + "entity_type": "TASK_OR_PROBLEM", + "description": "the rag pipeline is the process dynamically orchestrated and refined by the agentic rag paradigm", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "recent survey of graph based rag methods", + "entity_type": "PUBLICATION_VENUE", + "description": "a recent survey of 
graph based rag methods is referenced for more details on the topic", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "rag", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "rag is a technique proven to excel in many tasks including question answering and data cleaning", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "external knowledge bases", + "entity_type": "TECHNOLOGY", + "description": "external knowledge bases are sources from which the naive rag technique retrieves query relevant contexts", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "hallucination", + "entity_type": "TASK_OR_PROBLEM", + "description": "hallucination is a problem in llms that the naive rag technique aims to mitigate", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "overall retrieval performance", + "entity_type": "EVALUATION_METRIC", + "description": "overall retrieval performance is improved by rag approaches that adopt graph structures", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "reasoning robustness", + "entity_type": "EVALUATION_METRIC", + "description": "reasoning robustness is a metric significantly boosted by the agentic rag paradigm", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "generation fidelity", + "entity_type": "EVALUATION_METRIC", + "description": "generation fidelity is a metric significantly boosted by the agentic rag paradigm", + "source_ids": [ + 33 + ] + }, + { + "entity_name": "documents", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 33 + ] + } + ], + "relations": [ + { + "src_entity_name": "rag approaches", + "tgt_entity_name": "open ended question answering", + "relation_name": "", + "weight": 9.0, + "description": "rag approaches excel in open ended question answering", + "source_ids": [ + 33 + ] + }, + { + "src_entity_name": "rag approaches", + "tgt_entity_name": "programming context", + "relation_name": "", + "weight": 9.0, + "description": "rag approaches excel in programming context 
tasks", + "source_ids": [ + 33 + ] + }, + { + "src_entity_name": "rag approaches", + "tgt_entity_name": "sql rewrite", + "relation_name": "", + "weight": 9.0, + "description": "rag approaches excel in sql rewrite tasks", + "source_ids": [ + 33 + ] + }, + { + "src_entity_name": "rag approaches", + "tgt_entity_name": "data cleaning", + "relation_name": "", + "weight": 9.0, + "description": "rag approaches excel in data cleaning tasks", + "source_ids": [ + 33 + ] + }, + { + "src_entity_name": "naive rag technique", + "tgt_entity_name": "llms", + "relation_name": "", + "weight": 10.0, + "description": "the naive rag technique mitigates the hallucination of llms", + "source_ids": [ + 33 + ] + }, + { + "src_entity_name": "naive rag technique", + "tgt_entity_name": "external knowledge bases", + "relation_name": "", + "weight": 8.0, + "description": "the naive rag technique retrieves query relevant contexts from external knowledge bases", + "source_ids": [ + 33 + ] + }, + { + "src_entity_name": "rag approaches", + "tgt_entity_name": "graph structures", + "relation_name": "", + "weight": 9.0, + "description": "many rag approaches have adopted graph structures to organize information", + "source_ids": [ + 33 + ] + }, + { + "src_entity_name": "graph structures", + "tgt_entity_name": "documents", + "relation_name": "", + "weight": 8.0, + "description": "graph structures organize information and relationships within documents", + "source_ids": [ + 33 + ] + }, + { + "src_entity_name": "agentic rag paradigm", + "tgt_entity_name": "autonomous agents", + "relation_name": "", + "weight": 10.0, + "description": "the agentic rag paradigm employs autonomous agents to orchestrate the pipeline", + "source_ids": [ + 33 + ] + }, + { + "src_entity_name": "agentic rag paradigm", + "tgt_entity_name": "rag pipeline", + "relation_name": "", + "weight": 10.0, + "description": "the agentic rag paradigm dynamically orchestrates and refines the rag pipeline", + "source_ids": [ + 33 + ] + }, + { + 
"src_entity_name": "agentic rag paradigm", + "tgt_entity_name": "reasoning robustness", + "relation_name": "", + "weight": 9.0, + "description": "the agentic rag paradigm significantly boosts reasoning robustness", + "source_ids": [ + 33 + ] + }, + { + "src_entity_name": "agentic rag paradigm", + "tgt_entity_name": "generation fidelity", + "relation_name": "", + "weight": 9.0, + "description": "the agentic rag paradigm significantly boosts generation fidelity", + "source_ids": [ + 33 + ] + } + ], + "node_idx": 33 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_34.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_34.json new file mode 100644 index 0000000..55a0a11 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_34.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "3 preliminaries", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of the main paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section establishes the foundational concepts, definitions, and theoretical background necessary to understand the proposed BookRAG method and its context within Retrieval-Augmented Generation (RAG) for hierarchical documents.", + "source_ids": [ + 34 + ] + } + ], + "relations": [], + "node_idx": 34 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_35.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_35.json new file mode 100644 index 0000000..8ef037f --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_35.json @@ -0,0 +1,123 @@ +{ + "entities": [ + { + "entity_name": "complex document qa", + "entity_type": "TASK_OR_PROBLEM", + "description": "complex document qa is the research problem being formalized in the text", + "source_ids": [ + 35 + ] + }, + { + "entity_name": "information foraging theory", + "entity_type": 
"SCIENTIFIC_THEORY", + "description": "information foraging theory ift is a foundational theory introduced in the text", + "source_ids": [ + 35 + ] + }, + { + "entity_name": "ift", + "entity_type": "SCIENTIFIC_THEORY", + "description": "ift is an abbreviation for information foraging theory a foundational theory introduced in the text", + "source_ids": [ + 35 + ] + }, + { + "entity_name": "rag systems", + "entity_type": "TECHNOLOGY", + "description": "rag systems are a type of technology whose general workflow is reviewed in the text", + "source_ids": [ + 35 + ] + }, + { + "entity_name": "research problem", + "entity_type": "TASK_OR_PROBLEM", + "description": "the research problem is the subject being formalized in the text", + "source_ids": [ + 35 + ] + }, + { + "entity_name": "general workflow", + "entity_type": "TASK_OR_PROBLEM", + "description": "the general workflow of rag systems is the subject being briefly reviewed in the text", + "source_ids": [ + 35 + ] + }, + { + "entity_name": "section", + "entity_type": "SECTION_TITLE", + "description": "the section is the part of the document that contains the formalization and review described in the text", + "source_ids": [ + 35 + ] + } + ], + "relations": [ + { + "src_entity_name": "complex document qa", + "tgt_entity_name": "information foraging theory", + "relation_name": "", + "weight": 8.0, + "description": "the text states that the research problem of complex document qa is formalized alongside the introduction of information foraging theory", + "source_ids": [ + 35 + ] + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "ift", + "relation_name": "", + "weight": 10.0, + "description": "ift is the abbreviation used for information foraging theory in the text", + "source_ids": [ + 35 + ] + }, + { + "src_entity_name": "rag systems", + "tgt_entity_name": "complex document qa", + "relation_name": "", + "weight": 6.0, + "description": "the text mentions reviewing the workflow of rag 
systems in the context of formalizing the research problem of complex document qa", + "source_ids": [ + 35 + ] + }, + { + "src_entity_name": "section", + "tgt_entity_name": "research problem", + "relation_name": "", + "weight": 9.0, + "description": "the section is the location where the research problem is formalized", + "source_ids": [ + 35 + ] + }, + { + "src_entity_name": "section", + "tgt_entity_name": "general workflow", + "relation_name": "", + "weight": 9.0, + "description": "the section is the location where the general workflow is reviewed", + "source_ids": [ + 35 + ] + }, + { + "src_entity_name": "research problem", + "tgt_entity_name": "general workflow", + "relation_name": "", + "weight": 7.0, + "description": "both the research problem and the general workflow are discussed within the same section", + "source_ids": [ + 35 + ] + } + ], + "node_idx": 35 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_36.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_36.json new file mode 100644 index 0000000..ad70587 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_36.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "3.1 problem formulation", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Preliminaries' within the BookRAG paper, this section formalizes the research problem of complex document Question Answering (QA) and establishes the foundational context for the proposed approach.", + "source_ids": [ + 36 + ] + }, + { + "entity_name": "complex document qa", + "entity_type": "TASK_OR_PROBLEM", + "description": "Refers to the specific challenge of answering questions based on complex documents, which is the core research problem being formalized in section 3.1.", + "source_ids": [ + 36 + ] + } + ], + "relations": [ + { + "src_entity_name": "complex document qa", + "tgt_entity_name": "3.1 problem formulation", + "relation_name": "", + "weight": 10.0, + 
"description": "The concept of 'Complex Document QA' is the primary topic and subject of the problem formulation detailed in section 3.1.", + "source_ids": [ + 36 + ] + } + ], + "node_idx": 36 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_37.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_37.json new file mode 100644 index 0000000..dc6c266 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_37.json @@ -0,0 +1,451 @@ +{ + "entities": [ + { + "entity_name": "question answering", + "entity_type": "TASK_OR_PROBLEM", + "description": "question answering is a task aimed at answering user queries based on long form documents", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "document", + "entity_type": "PRODUCT", + "description": "a document is represented as a sequence of pages containing content blocks organized within a logical chapter hierarchy", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "user query", + "entity_type": "TASK_OR_PROBLEM", + "description": "a user query is an input provided to the system to generate an accurate answer", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "answer", + "entity_type": "TASK_OR_PROBLEM", + "description": "an answer is the output generated by the system ideally grounded in specific evidence blocks", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "evidence blocks", + "entity_type": "DATASET_OR_CORPUS", + "description": "evidence blocks are a specific set of content blocks from the document used to ground the generated answer", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "method s", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "method s is a developed approach that maps a structured document and a query to a final answer", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "pages", + "entity_type": "MEASUREMENT", + "description": "pages are the units that collectively form a document represented as a 
sequence", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "content blocks", + "entity_type": "DATASET_OR_CORPUS", + "description": "content blocks are distinct elements within a document such as text segments section headers tables or images", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "text segment", + "entity_type": "DATASET_OR_CORPUS", + "description": "a text segment is a type of content block within a document", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "section header", + "entity_type": "DATASET_OR_CORPUS", + "description": "a section header is a type of content block within a document", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "table", + "entity_type": "DATASET_OR_CORPUS", + "description": "a table is a type of content block within a document", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "image", + "entity_type": "DATASET_OR_CORPUS", + "description": "an image is a type of content block within a document", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "logical chapter hierarchy", + "entity_type": "TASK_OR_PROBLEM", + "description": "a logical chapter hierarchy is the organizational structure within which content blocks are arranged", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "n", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "n is a variable representing the number of pages in a document", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "m", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "m is a variable representing the number of content blocks in a document", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "p", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "p is a variable representing a specific page within a document sequence", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "q", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "q is a variable representing a user query", + "source_ids": [ + 37 + ] + }, + { + "entity_name": 
"a", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "a is a variable representing the generated answer", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "e", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "e is a variable representing a subset of evidence blocks", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "b", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "b is a variable representing the sequence of all content blocks", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "d", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "d is a variable representing the document", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "equation 1", + "entity_type": "EQUATION_OR_FORMULA", + "description": "equation 1 is the mathematical formulation defining the task as a s d q", + "source_ids": [ + 37 + ] + }, + { + "entity_name": "references 5 11 33", + "entity_type": "PUBLICATION_VENUE", + "description": "references 5 11 and 33 are citations mentioned in the text regarding the problem of question answering", + "source_ids": [ + 37 + ] + } + ], + "relations": [ + { + "src_entity_name": "question answering", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 10.0, + "description": "question answering aims to answer queries based on documents", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "question answering", + "tgt_entity_name": "user query", + "relation_name": "", + "weight": 10.0, + "description": "question answering processes user queries to generate answers", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "question answering", + "tgt_entity_name": "answer", + "relation_name": "", + "weight": 10.0, + "description": "the goal of question answering is to generate an accurate answer", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "answer", + "tgt_entity_name": "evidence blocks", + "relation_name": "", + "weight": 9.0, + "description": "an answer is ideally grounded 
in a specific set of evidence blocks", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "method s", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 10.0, + "description": "method s maps a structured document to a final answer", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "method s", + "tgt_entity_name": "user query", + "relation_name": "", + "weight": 10.0, + "description": "method s maps a user query to a final answer", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "method s", + "tgt_entity_name": "answer", + "relation_name": "", + "weight": 10.0, + "description": "method s produces the final answer", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "document", + "tgt_entity_name": "pages", + "relation_name": "", + "weight": 10.0, + "description": "a document is represented as a sequence of pages", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "document", + "tgt_entity_name": "content blocks", + "relation_name": "", + "weight": 10.0, + "description": "pages in a document collectively contain a sequence of content blocks", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "content blocks", + "tgt_entity_name": "text segment", + "relation_name": "", + "weight": 8.0, + "description": "a text segment is an example of a content block", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "content blocks", + "tgt_entity_name": "section header", + "relation_name": "", + "weight": 8.0, + "description": "a section header is an example of a content block", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "content blocks", + "tgt_entity_name": "table", + "relation_name": "", + "weight": 8.0, + "description": "a table is an example of a content block", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "content blocks", + "tgt_entity_name": "image", + "relation_name": "", + "weight": 8.0, + "description": "an image is an example of a content block", + "source_ids": [ + 37 + ] + }, 
+ { + "src_entity_name": "content blocks", + "tgt_entity_name": "logical chapter hierarchy", + "relation_name": "", + "weight": 9.0, + "description": "content blocks are organized within a logical chapter hierarchy", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "document", + "tgt_entity_name": "n", + "relation_name": "", + "weight": 9.0, + "description": "n defines the sequence length of pages in the document", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "document", + "tgt_entity_name": "m", + "relation_name": "", + "weight": 9.0, + "description": "m defines the sequence length of content blocks in the document", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "document", + "tgt_entity_name": "p", + "relation_name": "", + "weight": 9.0, + "description": "p represents an individual page within the document sequence", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "document", + "tgt_entity_name": "b", + "relation_name": "", + "weight": 9.0, + "description": "b represents an individual content block within the document", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "method s", + "tgt_entity_name": "equation 1", + "relation_name": "", + "weight": 10.0, + "description": "method s is mathematically defined by equation 1", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "question answering", + "tgt_entity_name": "references 5 11 33", + "relation_name": "", + "weight": 8.0, + "description": "the problem of question answering is associated with references 5 11 and 33", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "answer", + "tgt_entity_name": "a", + "relation_name": "", + "weight": 10.0, + "description": "a is the variable symbol for the answer", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "user query", + "tgt_entity_name": "q", + "relation_name": "", + "weight": 10.0, + "description": "q is the variable symbol for the user query", + "source_ids": [ + 37 + ] + }, + { + 
"src_entity_name": "evidence blocks", + "tgt_entity_name": "e", + "relation_name": "", + "weight": 10.0, + "description": "e is the variable symbol for the set of evidence blocks", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "content blocks", + "tgt_entity_name": "b", + "relation_name": "", + "weight": 10.0, + "description": "b is the variable symbol for the sequence of content blocks", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "pages", + "tgt_entity_name": "p", + "relation_name": "", + "weight": 10.0, + "description": "p is the variable symbol for pages", + "source_ids": [ + 37 + ] + }, + { + "src_entity_name": "document", + "tgt_entity_name": "d", + "relation_name": "", + "weight": 10.0, + "description": "d is the variable symbol for the document", + "source_ids": [ + 37 + ] + } + ], + "node_idx": 37 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_38.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_38.json new file mode 100644 index 0000000..109d12a --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_38.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "3", + "entity_type": "MEASUREMENT", + "description": "3 is a numerical value mentioned in the text potentially representing a count index or measurement", + "source_ids": [ + 38 + ] + } + ], + "relations": [], + "node_idx": 38 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_39.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_39.json new file mode 100644 index 0000000..d5c9d15 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_39.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (1)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the variable A as a function of D and q. 
LaTeX: 𝐴 = S( 𝐷,𝑞 ) (1)", + "source_ids": [ + 39 + ] + } + ], + "relations": [], + "node_idx": 39 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_4.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_4.json new file mode 100644 index 0000000..2578ef2 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_4.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "pvldb", + "entity_type": "PUBLICATION_VENUE", + "description": "pvldb is a publication venue referenced in the text for its reference format", + "source_ids": [ + 4 + ] + }, + { + "entity_name": "reference format", + "entity_type": "SECTION_TITLE", + "description": "reference format is a section or concept mentioned in the context of pvldb", + "source_ids": [ + 4 + ] + } + ], + "relations": [ + { + "src_entity_name": "pvldb", + "tgt_entity_name": "reference format", + "relation_name": "", + "weight": 9.0, + "description": "pvldb is associated with a specific reference format mentioned in the text", + "source_ids": [ + 4 + ] + } + ], + "node_idx": 4 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_40.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_40.json new file mode 100644 index 0000000..73081aa --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_40.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "s", + "entity_type": "PERSON", + "description": "s is an entity described as needing to navigate sequential page content and logical hierarchy to synthesize a response", + "source_ids": [ + 40 + ] + }, + { + "entity_name": "d", + "entity_type": "TASK_OR_PROBLEM", + "description": "d represents the logical hierarchy that s must navigate to synthesize a response", + "source_ids": [ + 40 + ] + } + ], + "relations": [ + { + "src_entity_name": "s", + "tgt_entity_name": "d", + "relation_name": "", + "weight": 9.0, + "description": "s must navigate the logical 
hierarchy of d to synthesize the response", + "source_ids": [ + 40 + ] + } + ], + "node_idx": 40 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_41.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_41.json new file mode 100644 index 0000000..10932d8 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_41.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "3.2 information foraging theory", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Preliminaries' within the BookRAG paper, this section formalizes the foundational Information Foraging Theory (IFT) used to model user behavior in complex document QA tasks.", + "source_ids": [ + 41 + ] + }, + { + "entity_name": "information foraging theory", + "entity_type": "SCIENTIFIC_THEORY", + "description": "A theoretical framework explaining how individuals seek information efficiently, serving as the conceptual basis for the system's design discussed in section 3.2.", + "source_ids": [ + 41 + ] + } + ], + "relations": [ + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "3.2 information foraging theory", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Information Foraging Theory' is the primary subject matter detailed in section 3.2.", + "source_ids": [ + 41 + ] + } + ], + "node_idx": 41 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_42.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_42.json new file mode 100644 index 0000000..72ff46d --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_42.json @@ -0,0 +1,159 @@ +{ + "entities": [ + { + "entity_name": "information foraging theory", + "entity_type": "SCIENTIFIC_THEORY", + "description": "information foraging theory ift is a framework for understanding information access as a process analogous to animal foraging", + "source_ids": [ + 42 + ] + }, + { + 
"entity_name": "animal foraging", + "entity_type": "TASK_OR_PROBLEM", + "description": "animal foraging is the process used as an analogy to explain how users access information in the context of information foraging theory", + "source_ids": [ + 42 + ] + }, + { + "entity_name": "information scent", + "entity_type": "CONCEPT", + "description": "information scent refers to cues like keywords or icons that users follow to navigate content", + "source_ids": [ + 42 + ] + }, + { + "entity_name": "information patches", + "entity_type": "CONCEPT", + "description": "information patches are clusters of content such as sections in handbooks that users navigate between", + "source_ids": [ + 42 + ] + }, + { + "entity_name": "handbooks", + "entity_type": "PRODUCT", + "description": "handbooks are mentioned as containing sections that serve as information patches", + "source_ids": [ + 42 + ] + }, + { + "entity_name": "keywords", + "entity_type": "CONCEPT", + "description": "keywords are identified as specific examples of information scent cues used by users", + "source_ids": [ + 42 + ] + }, + { + "entity_name": "icons", + "entity_type": "CONCEPT", + "description": "icons are identified as specific examples of information scent cues used by users", + "source_ids": [ + 42 + ] + }, + { + "entity_name": "sections", + "entity_type": "CONCEPT", + "description": "sections are described as parts of handbooks that function as information patches", + "source_ids": [ + 42 + ] + }, + { + "entity_name": "reference 42", + "entity_type": "PUBLICATION_VENUE", + "description": "reference 42 is the citation source for information foraging theory mentioned in the text", + "source_ids": [ + 42 + ] + } + ], + "relations": [ + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "animal foraging", + "relation_name": "", + "weight": 10.0, + "description": "information foraging theory uses animal foraging as an analogy to explain information access", + "source_ids": [ + 42 + ] + }, 
+ { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "information scent", + "relation_name": "", + "weight": 9.0, + "description": "information foraging theory suggests that users follow information scent cues to navigate content", + "source_ids": [ + 42 + ] + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "information patches", + "relation_name": "", + "weight": 9.0, + "description": "information foraging theory describes information patches as clusters of content that users navigate between", + "source_ids": [ + 42 + ] + }, + { + "src_entity_name": "information scent", + "tgt_entity_name": "handbooks", + "relation_name": "", + "weight": 7.0, + "description": "information scent cues like keywords or icons are found within sections of handbooks which act as information patches", + "source_ids": [ + 42 + ] + }, + { + "src_entity_name": "information scent", + "tgt_entity_name": "keywords", + "relation_name": "", + "weight": 10.0, + "description": "keywords are explicitly listed as examples of information scent", + "source_ids": [ + 42 + ] + }, + { + "src_entity_name": "information scent", + "tgt_entity_name": "icons", + "relation_name": "", + "weight": 10.0, + "description": "icons are explicitly listed as examples of information scent", + "source_ids": [ + 42 + ] + }, + { + "src_entity_name": "information patches", + "tgt_entity_name": "sections", + "relation_name": "", + "weight": 10.0, + "description": "sections in handbooks are explicitly listed as examples of information patches", + "source_ids": [ + 42 + ] + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "reference 42", + "relation_name": "", + "weight": 8.0, + "description": "information foraging theory is cited with reference number 42 in the text", + "source_ids": [ + 42 + ] + } + ], + "node_idx": 42 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_43.json 
b/e2e_test_output/kg_extractor_res/kg_extractor_res_43.json new file mode 100644 index 0000000..40bd26d --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_43.json @@ -0,0 +1,179 @@ +{ + "entities": [ + { + "entity_name": "experts", + "entity_type": "PERSON", + "description": "experts are individuals seeking a solution to a specific problem within a large technical handbook", + "source_ids": [ + 43 + ] + }, + { + "entity_name": "large technical handbook", + "entity_type": "BOOK", + "description": "the large technical handbook is the source material containing the problem and information patches", + "source_ids": [ + 43 + ] + }, + { + "entity_name": "key terms", + "entity_type": "CONCEPT", + "description": "key terms are extracted by experts to act as information scent", + "source_ids": [ + 43 + ] + }, + { + "entity_name": "information scent", + "entity_type": "CONCEPT", + "description": "information scent is the guidance provided by key terms that leads experts to promising sections", + "source_ids": [ + 43 + ] + }, + { + "entity_name": "information patches", + "entity_type": "CONCEPT", + "description": "information patches are the promising sections within the handbook that experts navigate to", + "source_ids": [ + 43 + ] + }, + { + "entity_name": "final answer", + "entity_type": "CONCEPT", + "description": "the final answer is the result formulated by experts after analyzing the content within the information patches", + "source_ids": [ + 43 + ] + }, + { + "entity_name": "problem", + "entity_type": "TASK_OR_PROBLEM", + "description": "a specific problem is the target issue that experts are seeking to solve within the handbook", + "source_ids": [ + 43 + ] + }, + { + "entity_name": "diverse content", + "entity_type": "CONCEPT", + "description": "diverse content refers to the varied information found within the information patches that experts analyze", + "source_ids": [ + 43 + ] + }, + { + "entity_name": "precise knowledge", + "entity_type": 
"CONCEPT", + "description": "precise knowledge is the specific information extracted from the diverse content to help formulate the answer", + "source_ids": [ + 43 + ] + } + ], + "relations": [ + { + "src_entity_name": "experts", + "tgt_entity_name": "large technical handbook", + "relation_name": "", + "weight": 10.0, + "description": "experts seek a solution within the large technical handbook", + "source_ids": [ + 43 + ] + }, + { + "src_entity_name": "experts", + "tgt_entity_name": "key terms", + "relation_name": "", + "weight": 9.0, + "description": "experts extract key terms from the handbook", + "source_ids": [ + 43 + ] + }, + { + "src_entity_name": "key terms", + "tgt_entity_name": "information scent", + "relation_name": "", + "weight": 10.0, + "description": "key terms act as information scent", + "source_ids": [ + 43 + ] + }, + { + "src_entity_name": "information scent", + "tgt_entity_name": "information patches", + "relation_name": "", + "weight": 9.0, + "description": "information scent guides experts to navigate towards information patches", + "source_ids": [ + 43 + ] + }, + { + "src_entity_name": "experts", + "tgt_entity_name": "information patches", + "relation_name": "", + "weight": 9.0, + "description": "experts navigate to and analyze content within information patches", + "source_ids": [ + 43 + ] + }, + { + "src_entity_name": "experts", + "tgt_entity_name": "final answer", + "relation_name": "", + "weight": 10.0, + "description": "experts formulate a final answer based on the analysis of information patches", + "source_ids": [ + 43 + ] + }, + { + "src_entity_name": "experts", + "tgt_entity_name": "problem", + "relation_name": "", + "weight": 10.0, + "description": "experts are seeking a solution to the specific problem", + "source_ids": [ + 43 + ] + }, + { + "src_entity_name": "experts", + "tgt_entity_name": "diverse content", + "relation_name": "", + "weight": 9.0, + "description": "experts analyze the diverse content within the information 
patches", + "source_ids": [ + 43 + ] + }, + { + "src_entity_name": "diverse content", + "tgt_entity_name": "precise knowledge", + "relation_name": "", + "weight": 9.0, + "description": "experts extract precise knowledge from the diverse content", + "source_ids": [ + 43 + ] + }, + { + "src_entity_name": "precise knowledge", + "tgt_entity_name": "final answer", + "relation_name": "", + "weight": 10.0, + "description": "precise knowledge is used to formulate the final answer", + "source_ids": [ + 43 + ] + } + ], + "node_idx": 43 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_44.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_44.json new file mode 100644 index 0000000..baf505a --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_44.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "3.3 rag workflow", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Preliminaries' within the BookRAG paper, this section details the general operational workflow of Retrieval-Augmented Generation (RAG) systems, serving as a foundational context for the proposed hierarchical approach.", + "source_ids": [ + 44 + ] + }, + { + "entity_name": "rag systems", + "entity_type": "TECHNOLOGY", + "description": "Refers to Retrieval-Augmented Generation systems, which are the subject of the workflow analysis detailed in section 3.3.", + "source_ids": [ + 44 + ] + } + ], + "relations": [ + { + "src_entity_name": "rag systems", + "tgt_entity_name": "3.3 rag workflow", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'RAG systems' is the primary topic and subject matter of section 3.3.", + "source_ids": [ + 44 + ] + } + ], + "node_idx": 44 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_45.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_45.json new file mode 100644 index 0000000..571e378 --- /dev/null +++ 
b/e2e_test_output/kg_extractor_res/kg_extractor_res_45.json @@ -0,0 +1,213 @@ +{ + "entities": [ + { + "entity_name": "retrieval augmented generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "retrieval augmented generation is a system framework described as operating in a two phase process", + "source_ids": [ + 45 + ] + }, + { + "entity_name": "offline indexing phase", + "entity_type": "TASK_OR_PROBLEM", + "description": "offline indexing phase is the first phase where unstructured corpus data is organized into a structured index", + "source_ids": [ + 45 + ] + }, + { + "entity_name": "online retrieval phase", + "entity_type": "TASK_OR_PROBLEM", + "description": "online retrieval phase is the second phase where the system retrieves relevant components based on a user query", + "source_ids": [ + 45 + ] + }, + { + "entity_name": "vector databases", + "entity_type": "SOFTWARE", + "description": "vector databases are mentioned as a possible form of structured index in the offline indexing phase", + "source_ids": [ + 45 + ] + }, + { + "entity_name": "kg", + "entity_type": "SOFTWARE", + "description": "kg knowledge graph is mentioned as a possible form of structured index in the offline indexing phase", + "source_ids": [ + 45 + ] + }, + { + "entity_name": "llm", + "entity_type": "SOFTWARE", + "description": "llm is the component that generates output informed by retrieved components in the online retrieval phase", + "source_ids": [ + 45 + ] + }, + { + "entity_name": "user query", + "entity_type": "TASK_OR_PROBLEM", + "description": "user query is the input used in the online retrieval phase to retrieve relevant components", + "source_ids": [ + 45 + ] + }, + { + "entity_name": "document s native tree topology", + "entity_type": "TASK_OR_PROBLEM", + "description": "document s native tree topology is the logical structure that the proposed approach seeks to integrate with retrieval structures", + "source_ids": [ + 45 + ] + }, + { + "entity_name": "unstructured 
corpus data", + "entity_type": "DATASET_OR_CORPUS", + "description": "unstructured corpus data is the input material organized into a structured index during the offline indexing phase", + "source_ids": [ + 45 + ] + }, + { + "entity_name": "text chunks", + "entity_type": "DATASET_OR_CORPUS", + "description": "text chunks are examples of relevant components retrieved during the online retrieval phase", + "source_ids": [ + 45 + ] + }, + { + "entity_name": "subgraphs", + "entity_type": "DATASET_OR_CORPUS", + "description": "subgraphs are examples of relevant components retrieved during the online retrieval phase", + "source_ids": [ + 45 + ] + }, + { + "entity_name": "document", + "entity_type": "TASK_OR_PROBLEM", + "description": "the document is the source of the original logical hierarchy and native tree topology referenced in the text", + "source_ids": [ + 45 + ] + } + ], + "relations": [ + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "offline indexing phase", + "relation_name": "", + "weight": 10.0, + "description": "retrieval augmented generation systems operate in the offline indexing phase as their first step", + "source_ids": [ + 45 + ] + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "online retrieval phase", + "relation_name": "", + "weight": 10.0, + "description": "retrieval augmented generation systems operate in the online retrieval phase as their second step", + "source_ids": [ + 45 + ] + }, + { + "src_entity_name": "offline indexing phase", + "tgt_entity_name": "vector databases", + "relation_name": "", + "weight": 9.0, + "description": "vector databases are a form of structured index created during the offline indexing phase", + "source_ids": [ + 45 + ] + }, + { + "src_entity_name": "offline indexing phase", + "tgt_entity_name": "kg", + "relation_name": "", + "weight": 9.0, + "description": "kg is a form of structured index created during the offline indexing phase", + "source_ids": [ 
+ 45 + ] + }, + { + "src_entity_name": "online retrieval phase", + "tgt_entity_name": "user query", + "relation_name": "", + "weight": 10.0, + "description": "the online retrieval phase uses the user query to retrieve relevant components", + "source_ids": [ + 45 + ] + }, + { + "src_entity_name": "online retrieval phase", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "the online retrieval phase informs the llm s generation with retrieved components", + "source_ids": [ + 45 + ] + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "document s native tree topology", + "relation_name": "", + "weight": 8.0, + "description": "the proposed approach for retrieval augmented generation seeks to integrate retrieval structures with the document s native tree topology", + "source_ids": [ + 45 + ] + }, + { + "src_entity_name": "offline indexing phase", + "tgt_entity_name": "unstructured corpus data", + "relation_name": "", + "weight": 10.0, + "description": "the offline indexing phase organizes unstructured corpus data into a structured index", + "source_ids": [ + 45 + ] + }, + { + "src_entity_name": "online retrieval phase", + "tgt_entity_name": "text chunks", + "relation_name": "", + "weight": 9.0, + "description": "text chunks are retrieved as relevant components during the online retrieval phase", + "source_ids": [ + 45 + ] + }, + { + "src_entity_name": "online retrieval phase", + "tgt_entity_name": "subgraphs", + "relation_name": "", + "weight": 9.0, + "description": "subgraphs are retrieved as relevant components during the online retrieval phase", + "source_ids": [ + 45 + ] + }, + { + "src_entity_name": "retrieval augmented generation", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 8.0, + "description": "the retrieval augmented generation approach seeks to integrate structures with the document s native topology", + "source_ids": [ + 45 + ] + } + ], + "node_idx": 45 +} \ No newline 
at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_46.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_46.json new file mode 100644 index 0000000..97560b2 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_46.json @@ -0,0 +1,69 @@ +{ + "entities": [ + { + "entity_name": "4 bookindex", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of the main paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section details the novel index structure named BookIndex. It explains how the approach extracts a hierarchical tree from documents to serve as a table of contents, utilizes graphs to capture entity relationships, and maps entities to tree nodes.", + "source_ids": [ + 46 + ] + }, + { + "entity_name": "bookindex", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "A novel index structure introduced in this work that builds a hierarchical tree from documents to act as a table of contents and uses graphs to capture intricate relationships between entities.", + "source_ids": [ + 46 + ] + }, + { + "entity_name": "hierarchical tree", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The structural method used within BookIndex to organize document content from different granularity levels, serving the role of a table of contents.", + "source_ids": [ + 46 + ] + }, + { + "entity_name": "graph", + "entity_type": "TECHNOLOGY", + "description": "The data structure employed by BookIndex to capture and represent the intricate relationships between entities within the document hierarchy.", + "source_ids": [ + 46 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookindex", + "tgt_entity_name": "4 bookindex", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'BookIndex' is the primary subject defined and detailed in section 4.", + "source_ids": [ + 46 + ] + }, + { + "src_entity_name": 
"hierarchical tree", + "tgt_entity_name": "4 bookindex", + "relation_name": "", + "weight": 9.5, + "description": "The 'hierarchical tree' is a core component and technique described within section 4 as part of the BookIndex implementation.", + "source_ids": [ + 46 + ] + }, + { + "src_entity_name": "graph", + "tgt_entity_name": "4 bookindex", + "relation_name": "", + "weight": 9.5, + "description": "The use of a 'graph' to capture entity relations is a key technical detail explained in section 4.", + "source_ids": [ + 46 + ] + } + ], + "node_idx": 46 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_47.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_47.json new file mode 100644 index 0000000..f4cfbaa --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_47.json @@ -0,0 +1,169 @@ +{ + "entities": [ + { + "entity_name": "bookindex", + "entity_type": "PRODUCT", + "description": "bookindex is a proposed hierarchical structure aware index designed to capture logical hierarchy and entity relations within complex documents", + "source_ids": [ + 47 + ] + }, + { + "entity_name": "tree construction", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "tree construction is the first stage of the two stage process that parses document layout to establish hierarchical nodes", + "source_ids": [ + 47 + ] + }, + { + "entity_name": "graph construction", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "graph construction is the second stage of the process that extracts fine grained entity knowledge and refines it", + "source_ids": [ + 47 + ] + }, + { + "entity_name": "gradient based entity resolution method", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "a novel gradient based entity resolution method used to refine entity knowledge during graph construction", + "source_ids": [ + 47 + ] + }, + { + "entity_name": "document", + "entity_type": "PRODUCT", + "description": "document refers to 
the complex documents within which the bookindex captures logical hierarchy and entity relations", + "source_ids": [ + 47 + ] + }, + { + "entity_name": "logical hierarchy", + "entity_type": "CONCEPT", + "description": "logical hierarchy is the explicit structure within complex documents that the bookindex is designed to capture", + "source_ids": [ + 47 + ] + }, + { + "entity_name": "entity relations", + "entity_type": "CONCEPT", + "description": "entity relations are the intricate connections within complex documents that the bookindex is designed to capture", + "source_ids": [ + 47 + ] + }, + { + "entity_name": "hierarchical nodes", + "entity_type": "CONCEPT", + "description": "hierarchical nodes are the categorized units established by the tree construction process", + "source_ids": [ + 47 + ] + }, + { + "entity_name": "fine grained entity knowledge", + "entity_type": "CONCEPT", + "description": "fine grained entity knowledge is the detailed information extracted from tree nodes during the graph construction process", + "source_ids": [ + 47 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookindex", + "tgt_entity_name": "tree construction", + "relation_name": "", + "weight": 9.0, + "description": "bookindex utilizes tree construction as its first stage to parse document layout and establish hierarchical nodes", + "source_ids": [ + 47 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "graph construction", + "relation_name": "", + "weight": 9.0, + "description": "bookindex utilizes graph construction as its second stage to extract and refine entity knowledge", + "source_ids": [ + 47 + ] + }, + { + "src_entity_name": "graph construction", + "tgt_entity_name": "gradient based entity resolution method", + "relation_name": "", + "weight": 8.0, + "description": "graph construction refines entity knowledge using the novel gradient based entity resolution method", + "source_ids": [ + 47 + ] + }, + { + "src_entity_name": "bookindex", + 
"tgt_entity_name": "document", + "relation_name": "", + "weight": 9.0, + "description": "bookindex is designed to operate on complex documents to capture their internal structures", + "source_ids": [ + 47 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "logical hierarchy", + "relation_name": "", + "weight": 10.0, + "description": "bookindex is explicitly designed to capture the explicit logical hierarchy found in documents", + "source_ids": [ + 47 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "entity relations", + "relation_name": "", + "weight": 10.0, + "description": "bookindex is explicitly designed to capture the intricate entity relations found in documents", + "source_ids": [ + 47 + ] + }, + { + "src_entity_name": "tree construction", + "tgt_entity_name": "hierarchical nodes", + "relation_name": "", + "weight": 9.0, + "description": "tree construction parses document layout to establish hierarchical nodes", + "source_ids": [ + 47 + ] + }, + { + "src_entity_name": "graph construction", + "tgt_entity_name": "fine grained entity knowledge", + "relation_name": "", + "weight": 9.0, + "description": "graph construction extracts fine grained entity knowledge from tree nodes", + "source_ids": [ + 47 + ] + }, + { + "src_entity_name": "graph construction", + "tgt_entity_name": "hierarchical nodes", + "relation_name": "", + "weight": 8.0, + "description": "graph construction operates on the tree nodes established by tree construction to extract knowledge", + "source_ids": [ + 47 + ] + } + ], + "node_idx": 47 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_48.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_48.json new file mode 100644 index 0000000..d641f89 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_48.json @@ -0,0 +1,131 @@ +{ + "entities": [ + { + "entity_name": "figure 2", + "entity_type": "IMAGE", + "description": "figure 2 is an image 
illustrating the bookindex construction process", + "source_ids": [ + 48 + ] + }, + { + "entity_name": "bookindex construction process", + "entity_type": "TASK_OR_PROBLEM", + "description": "the bookindex construction process is a phase involving tree construction and graph construction", + "source_ids": [ + 48 + ] + }, + { + "entity_name": "tree construction", + "entity_type": "TASK_OR_PROBLEM", + "description": "tree construction is a component of the bookindex construction process derived from layout parsing and section filtering", + "source_ids": [ + 48 + ] + }, + { + "entity_name": "layout parsing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "layout parsing is a method used to derive tree construction", + "source_ids": [ + 48 + ] + }, + { + "entity_name": "section filtering", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "section filtering is a method used to derive tree construction", + "source_ids": [ + 48 + ] + }, + { + "entity_name": "graph construction", + "entity_type": "TASK_OR_PROBLEM", + "description": "graph construction is a component of the bookindex construction process involving kg construction and gradient based entity resolution", + "source_ids": [ + 48 + ] + }, + { + "entity_name": "kg construction", + "entity_type": "TASK_OR_PROBLEM", + "description": "kg construction is a step involved in graph construction", + "source_ids": [ + 48 + ] + }, + { + "entity_name": "gradient based entity resolution", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "gradient based entity resolution is a method involved in graph construction", + "source_ids": [ + 48 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookindex construction process", + "tgt_entity_name": "tree construction", + "relation_name": "", + "weight": 10.0, + "description": "the bookindex construction process includes tree construction as a phase", + "source_ids": [ + 48 + ] + }, + { + "src_entity_name": "bookindex construction process", + 
"tgt_entity_name": "graph construction", + "relation_name": "", + "weight": 10.0, + "description": "the bookindex construction process includes graph construction as a phase", + "source_ids": [ + 48 + ] + }, + { + "src_entity_name": "tree construction", + "tgt_entity_name": "layout parsing", + "relation_name": "", + "weight": 9.0, + "description": "tree construction is derived from layout parsing", + "source_ids": [ + 48 + ] + }, + { + "src_entity_name": "tree construction", + "tgt_entity_name": "section filtering", + "relation_name": "", + "weight": 9.0, + "description": "tree construction is derived from section filtering", + "source_ids": [ + 48 + ] + }, + { + "src_entity_name": "graph construction", + "tgt_entity_name": "kg construction", + "relation_name": "", + "weight": 9.0, + "description": "graph construction involves kg construction", + "source_ids": [ + 48 + ] + }, + { + "src_entity_name": "graph construction", + "tgt_entity_name": "gradient based entity resolution", + "relation_name": "", + "weight": 9.0, + "description": "graph construction involves gradient based entity resolution", + "source_ids": [ + 48 + ] + } + ], + "node_idx": 48 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_49.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_49.json new file mode 100644 index 0000000..61d567d --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_49.json @@ -0,0 +1,357 @@ +{ + "entities": [ + { + "entity_name": "bookindex construction", + "entity_type": "IMAGE", + "description": "A diagram illustrating the process of constructing a book index, divided into Tree Construction and Graph Construction phases.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "tree construction", + "entity_type": "TASK_OR_PROBLEM", + "description": "The top section of the diagram detailing the initial phase of building the index from document layouts.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": 
"layout parsing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Step 1 in Tree Construction involving the extraction of visual elements like Tables, Text, Titles, and Images from a document layout.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "section filtering", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Step 2 in Tree Construction where parsed sections are filtered based on title properties such as FontSize and content type (Section vs Text).", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "title: method", + "entity_type": "SECTION_TITLE", + "description": "A specific text label identified during parsing with FontSize 14.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "title: experiment", + "entity_type": "SECTION_TITLE", + "description": "A specific text label identified during parsing with FontSize 14.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "title: moe layer", + "entity_type": "SECTION_TITLE", + "description": "A specific text label identified during parsing with FontSize 20.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "level: 2 type: section", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "An attribute indicating that 'Method' and 'Experiment' titles are classified as Level 2 Sections.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "level: none type: text", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "An attribute associated with 'MOE Layer', marked with a red cross, indicating it was rejected or not treated as a section.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "tree node", + "entity_type": "HARDWARE", + "description": "Visual element representing nodes in the tree structure shown in the legend and the resulting BookIndex.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "gt-link", + "entity_type": "SOFTWARE", + "description": "Legend item representing Ground Truth links between entities in the diagram.", + "source_ids": [ + 
49 + ] + }, + { + "entity_name": "relation", + "entity_type": "DATASET_OR_CORPUS", + "description": "Legend item representing relationships between entities.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "graph construction", + "entity_type": "TASK_OR_PROBLEM", + "description": "The bottom section of the diagram detailing the construction of a knowledge graph for entity resolution.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "kg construction", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Step 1 in Graph Construction showing the generation of a Knowledge Graph from Tree Nodes.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "gradient-based entity resolution", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Step 2 in Graph Construction involving similarity matching and merging of entities.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "similarity", + "entity_type": "EVALUATION_METRIC", + "description": "Y-axis label of the chart in the Gradient-based Entity Resolution step.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "entity", + "entity_type": "DATASET_OR_CORPUS", + "description": "X-axis label of the chart in the Gradient-based Entity Resolution step.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "merge", + "entity_type": "TASK_OR_PROBLEM", + "description": "Action performed to combine similar entities (e.g., e2 and e9) into a single resolved entity.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "bookindex", + "entity_type": "PRODUCT", + "description": "The final output data structure shown on the far right, containing the organized tree and graph representation.", + "source_ids": [ + 49 + ] + }, + { + "entity_name": "image cref='#/texts/52'", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 49 + ] + } + ], + "relations": [ + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "bookindex construction", + "relation_name": "", + "weight": 9.0, 
+ "description": "Image entity Image cref='#/texts/52' related to BookIndex Construction", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "tree construction", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Tree Construction", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "layout parsing", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Layout Parsing", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "section filtering", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Section Filtering", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "title: method", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Title: Method", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "title: experiment", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Title: Experiment", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "title: moe layer", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Title: MOE Layer", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "level: 2 type: section", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Level: 2 Type: Section", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "level: none type: text", + "relation_name": 
"", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Level: None Type: Text", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "tree node", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Tree Node", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "gt-link", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to GT-Link", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "entity", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Entity", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "relation", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Relation", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "graph construction", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Graph Construction", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "kg construction", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to KG Construction", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "gradient-based entity resolution", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Gradient-based Entity Resolution", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "similarity", + "relation_name": "", + "weight": 9.0, + 
"description": "Image entity Image cref='#/texts/52' related to Similarity", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "merge", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to Merge", + "source_ids": [ + 49 + ] + }, + { + "src_entity_name": "image cref='#/texts/52'", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/52' related to BookIndex", + "source_ids": [ + 49 + ] + } + ], + "node_idx": 49 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_5.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_5.json new file mode 100644 index 0000000..677a953 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_5.json @@ -0,0 +1,261 @@ +{ + "entities": [ + { + "entity_name": "shu wang", + "entity_type": "PERSON", + "description": "shu wang is one of the authors of the paper titled bookrag", + "source_ids": [ + 5 + ] + }, + { + "entity_name": "yingli zhou", + "entity_type": "PERSON", + "description": "yingli zhou is one of the authors of the paper titled bookrag", + "source_ids": [ + 5 + ] + }, + { + "entity_name": "yixiang fang", + "entity_type": "PERSON", + "description": "yixiang fang is one of the authors of the paper titled bookrag", + "source_ids": [ + 5 + ] + }, + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a hierarchical structure aware index based approach for retrieval augmented generation on complex documents", + "source_ids": [ + 5 + ] + }, + { + "entity_name": "pvldb", + "entity_type": "PUBLICATION_VENUE", + "description": "pvldb is the publication venue where the paper was published in 2025", + "source_ids": [ + 5 + ] + }, + { + "entity_name": "2025", + "entity_type": "DATE", + "description": "2025 is the year the paper was published in pvldb", + "source_ids": [ + 5 
+ ] + }, + { + "entity_name": "19", + "entity_type": "MEASUREMENT", + "description": "19 is the volume number of the pvldb publication where the paper appeared", + "source_ids": [ + 5 + ] + }, + { + "entity_name": "1", + "entity_type": "MEASUREMENT", + "description": "1 is the issue number of the pvldb publication where the paper appeared", + "source_ids": [ + 5 + ] + }, + { + "entity_name": "xxx xxx", + "entity_type": "MEASUREMENT", + "description": "xxx xxx represents the page range of the paper in the publication", + "source_ids": [ + 5 + ] + }, + { + "entity_name": "xx xx xxx xx", + "entity_type": "MEASUREMENT", + "description": "xx xx xxx xx is the doi identifier for the paper", + "source_ids": [ + 5 + ] + }, + { + "entity_name": "retrieval augmented generation", + "entity_type": "TECHNOLOGY", + "description": "retrieval augmented generation is the technology domain addressed by the bookrag approach", + "source_ids": [ + 5 + ] + }, + { + "entity_name": "hierarchical structure aware index based approach", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "hierarchical structure aware index based approach is the specific method used by bookrag", + "source_ids": [ + 5 + ] + }, + { + "entity_name": "complex documents", + "entity_type": "TASK_OR_PROBLEM", + "description": "complex documents are the type of documents that the bookrag approach is designed to handle", + "source_ids": [ + 5 + ] + } + ], + "relations": [ + { + "src_entity_name": "shu wang", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "shu wang is an author of the bookrag paper", + "source_ids": [ + 5 + ] + }, + { + "src_entity_name": "yingli zhou", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "yingli zhou is an author of the bookrag paper", + "source_ids": [ + 5 + ] + }, + { + "src_entity_name": "yixiang fang", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": 
"yixiang fang is an author of the bookrag paper", + "source_ids": [ + 5 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "pvldb", + "relation_name": "", + "weight": 10.0, + "description": "bookrag was published in the pvldb journal", + "source_ids": [ + 5 + ] + }, + { + "src_entity_name": "pvldb", + "tgt_entity_name": "2025", + "relation_name": "", + "weight": 9.0, + "description": "pvldb published the bookrag paper in the year 2025", + "source_ids": [ + 5 + ] + }, + { + "src_entity_name": "shu wang", + "tgt_entity_name": "yingli zhou", + "relation_name": "", + "weight": 8.0, + "description": "shu wang and yingli zhou are co authors on the bookrag paper", + "source_ids": [ + 5 + ] + }, + { + "src_entity_name": "shu wang", + "tgt_entity_name": "yixiang fang", + "relation_name": "", + "weight": 8.0, + "description": "shu wang and yixiang fang are co authors on the bookrag paper", + "source_ids": [ + 5 + ] + }, + { + "src_entity_name": "yingli zhou", + "tgt_entity_name": "yixiang fang", + "relation_name": "", + "weight": 8.0, + "description": "yingli zhou and yixiang fang are co authors on the bookrag paper", + "source_ids": [ + 5 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "retrieval augmented generation", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is an approach for retrieval augmented generation", + "source_ids": [ + 5 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "hierarchical structure aware index based approach", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is defined as a hierarchical structure aware index based approach", + "source_ids": [ + 5 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "complex documents", + "relation_name": "", + "weight": 9.0, + "description": "bookrag is designed for processing complex documents", + "source_ids": [ + 5 + ] + }, + { + "src_entity_name": "pvldb", + "tgt_entity_name": "19", + "relation_name": "", + 
"weight": 8.0, + "description": "pvldb volume 19 contains the paper", + "source_ids": [ + 5 + ] + }, + { + "src_entity_name": "pvldb", + "tgt_entity_name": "1", + "relation_name": "", + "weight": 8.0, + "description": "pvldb issue 1 contains the paper", + "source_ids": [ + 5 + ] + }, + { + "src_entity_name": "pvldb", + "tgt_entity_name": "xxx xxx", + "relation_name": "", + "weight": 8.0, + "description": "the paper appears on pages xxx xxx in pvldb", + "source_ids": [ + 5 + ] + }, + { + "src_entity_name": "pvldb", + "tgt_entity_name": "xx xx xxx xx", + "relation_name": "", + "weight": 8.0, + "description": "the paper in pvldb has the doi xx xx xxx xx", + "source_ids": [ + 5 + ] + } + ], + "node_idx": 5 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_50.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_50.json new file mode 100644 index 0000000..0da54f0 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_50.json @@ -0,0 +1,69 @@ +{ + "entities": [ + { + "entity_name": "4.1 overview of bookindex", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'BOOKINDEX', this section provides a high-level introduction to the proposed BookIndex, defining its hierarchical structure-aware nature and outlining its two-stage construction process (Tree Construction and Graph Construction) for capturing logical hierarchies and entity relations in complex documents.", + "source_ids": [ + 50 + ] + }, + { + "entity_name": "bookindex", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "A hierarchical structure-aware index designed to capture explicit logical hierarchy and intricate entity relations within complex documents, serving as the core subject of section 4.1.", + "source_ids": [ + 50 + ] + }, + { + "entity_name": "tree construction", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The first stage of the BookIndex construction process, which parses document layout to 
establish hierarchical nodes categorized by type, detailed in section 4.1.", + "source_ids": [ + 50 + ] + }, + { + "entity_name": "graph construction", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The second stage of the BookIndex construction process, which extracts fine-grained entity knowledge from tree nodes and refines it using gradient-based entity resolution, detailed in section 4.1.", + "source_ids": [ + 50 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookindex", + "tgt_entity_name": "4.1 overview of bookindex", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'BookIndex' is the primary topic defined and introduced in section 4.1.", + "source_ids": [ + 50 + ] + }, + { + "src_entity_name": "tree construction", + "tgt_entity_name": "4.1 overview of bookindex", + "relation_name": "", + "weight": 9.5, + "description": "The method 'Tree Construction' is a key component of the overview provided in section 4.1.", + "source_ids": [ + 50 + ] + }, + { + "src_entity_name": "graph construction", + "tgt_entity_name": "4.1 overview of bookindex", + "relation_name": "", + "weight": 9.5, + "description": "The method 'Graph Construction' is a key component of the overview provided in section 4.1.", + "source_ids": [ + 50 + ] + } + ], + "node_idx": 50 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_51.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_51.json new file mode 100644 index 0000000..106fc3a --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_51.json @@ -0,0 +1,381 @@ +{ + "entities": [ + { + "entity_name": "bookindex", + "entity_type": "PRODUCT", + "description": "bookindex is a formally defined triplet structure used to represent document hierarchy and entities", + "source_ids": [ + 51 + ] + }, + { + "entity_name": "tree structure", + "entity_type": "TASK_OR_PROBLEM", + "description": "tree structure represents the set of nodes derived from the 
document s explicit logical hierarchy", + "source_ids": [ + 51 + ] + }, + { + "entity_name": "knowledge graph", + "entity_type": "SOFTWARE", + "description": "knowledge graph is a structure that captures fine grained entities and their relations within the document", + "source_ids": [ + 51 + ] + }, + { + "entity_name": "graph tree link", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "graph tree link gt link is a mechanism that links entities to specific tree nodes from which they were extracted", + "source_ids": [ + 51 + ] + }, + { + "entity_name": "document", + "entity_type": "PRODUCT", + "description": "document is the source material containing logical hierarchy entities and relations", + "source_ids": [ + 51 + ] + }, + { + "entity_name": "titles", + "entity_type": "SECTION_TITLE", + "description": "titles are examples of nodes in the document s explicit logical hierarchy", + "source_ids": [ + 51 + ] + }, + { + "entity_name": "sections", + "entity_type": "SECTION_TITLE", + "description": "sections are examples of nodes in the document s explicit logical hierarchy", + "source_ids": [ + 51 + ] + }, + { + "entity_name": "tables", + "entity_type": "TABLE", + "description": "tables are examples of nodes in the document s explicit logical hierarchy", + "source_ids": [ + 51 + ] + }, + { + "entity_name": "n", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "n represents the set of nodes in the tree structure", + "source_ids": [ + 51 + ] + }, + { + "entity_name": "e t", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "e t denotes the nesting relationships in the tree structure", + "source_ids": [ + 51 + ] + }, + { + "entity_name": "v", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "v represents the fine grained entities in the knowledge graph", + "source_ids": [ + 51 + ] + }, + { + "entity_name": "e g", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "e g denotes the relations in the knowledge graph", + 
"source_ids": [ + 51 + ] + }, + { + "entity_name": "m v", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "m v is the graph tree link function linking entities to tree nodes", + "source_ids": [ + 51 + ] + }, + { + "entity_name": "p", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "p represents the power set of nodes in the tree structure used in the graph tree link definition", + "source_ids": [ + 51 + ] + }, + { + "entity_name": "information patches", + "entity_type": "CONCEPT", + "description": "information patches are hierarchical tree nodes serving as native contexts for information seeking", + "source_ids": [ + 51 + ] + }, + { + "entity_name": "information scent", + "entity_type": "CONCEPT", + "description": "information scent is the rich information provided by entities and relations to guide navigation", + "source_ids": [ + 51 + ] + }, + { + "entity_name": "", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 51 + ] + }, + { + "entity_name": "navigation", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 51 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookindex", + "tgt_entity_name": "tree structure", + "relation_name": "", + "weight": 10.0, + "description": "bookindex is defined as a triplet that includes the tree structure as one of its components", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "knowledge graph", + "relation_name": "", + "weight": 10.0, + "description": "bookindex is defined as a triplet that includes the knowledge graph as one of its components", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "graph tree link", + "relation_name": "", + "weight": 10.0, + "description": "bookindex is defined as a triplet that includes the graph tree link as one of its components", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "tree structure", + "tgt_entity_name": "document", + 
"relation_name": "", + "weight": 9.0, + "description": "the tree structure is derived from the document s explicit logical hierarchy", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "knowledge graph", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 9.0, + "description": "the knowledge graph captures entities and relations scattered throughout the document", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "graph tree link", + "tgt_entity_name": "tree structure", + "relation_name": "", + "weight": 10.0, + "description": "the graph tree link connects entities to specific tree nodes within the tree structure", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "graph tree link", + "tgt_entity_name": "knowledge graph", + "relation_name": "", + "weight": 10.0, + "description": "the graph tree link connects entities from the knowledge graph to the tree structure", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "titles", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 8.0, + "description": "titles are part of the document s explicit logical hierarchy", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "sections", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 8.0, + "description": "sections are part of the document s explicit logical hierarchy", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "tables", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 8.0, + "description": "tables are part of the document s explicit logical hierarchy", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "tree structure", + "tgt_entity_name": "titles", + "relation_name": "", + "weight": 9.0, + "description": "titles are examples of nodes included in the tree structure", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "tree structure", + "tgt_entity_name": "sections", + "relation_name": "", + "weight": 9.0, + "description": "sections are examples of 
nodes included in the tree structure", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "tree structure", + "tgt_entity_name": "tables", + "relation_name": "", + "weight": 9.0, + "description": "tables are examples of nodes included in the tree structure", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "knowledge graph", + "tgt_entity_name": "v", + "relation_name": "", + "weight": 10.0, + "description": "v represents the entities contained within the knowledge graph", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "knowledge graph", + "tgt_entity_name": "e g", + "relation_name": "", + "weight": 10.0, + "description": "e g represents the relations contained within the knowledge graph", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "tree structure", + "tgt_entity_name": "n", + "relation_name": "", + "weight": 10.0, + "description": "n represents the set of nodes contained within the tree structure", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "tree structure", + "tgt_entity_name": "e t", + "relation_name": "", + "weight": 10.0, + "description": "e t represents the nesting relationships contained within the tree structure", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "", + "relation_name": "", + "weight": 10.0, + "description": "is the first component of the bookindex triplet definition", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "", + "tgt_entity_name": "", + "relation_name": "", + "weight": 10.0, + "description": "is the set of nodes that constitutes the tree structure", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "", + "tgt_entity_name": "p", + "relation_name": "", + "weight": 10.0, + "description": "maps entities to the power set of nodes p", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "tree structure", + "tgt_entity_name": "information patches", + "relation_name": "", + "weight": 9.0, + "description": "the hierarchical tree 
nodes in the tree structure serve as information patches", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "knowledge graph", + "tgt_entity_name": "information scent", + "relation_name": "", + "weight": 9.0, + "description": "the entities and relations in the knowledge graph act as information scent", + "source_ids": [ + 51 + ] + }, + { + "src_entity_name": "information scent", + "tgt_entity_name": "navigation", + "relation_name": "", + "weight": 8.0, + "description": "information scent guides navigation between and within information patches", + "source_ids": [ + 51 + ] + } + ], + "node_idx": 51 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_52.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_52.json new file mode 100644 index 0000000..93616cc --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_52.json @@ -0,0 +1,289 @@ +{ + "entities": [ + { + "entity_name": "figure 2", + "entity_type": "IMAGE", + "description": "figure 2 is an image that provides an example of the bookindex", + "source_ids": [ + 52 + ] + }, + { + "entity_name": "bookindex", + "entity_type": "PRODUCT", + "description": "bookindex is a system or product being illustrated in the text", + "source_ids": [ + 52 + ] + }, + { + "entity_name": "tree component", + "entity_type": "SOFTWARE", + "description": "the tree component is a part of the bookindex that organizes documents into a hierarchical structure", + "source_ids": [ + 52 + ] + }, + { + "entity_name": "graph component", + "entity_type": "SOFTWARE", + "description": "the graph component is a part of the bookindex composed of entities and relations extracted from document nodes", + "source_ids": [ + 52 + ] + }, + { + "entity_name": "gt link", + "entity_type": "TECHNOLOGY", + "description": "gt link is a feature illustrated by blue dotted lines that connects entities to their corresponding tree nodes", + "source_ids": [ + 52 + ] + }, + { + "entity_name": "text", + 
"entity_type": "PRODUCT", + "description": "text is a type of content block serving as a leaf node within the document structure", + "source_ids": [ + 52 + ] + }, + { + "entity_name": "tables", + "entity_type": "PRODUCT", + "description": "tables are a type of content block serving as a leaf node within the document structure", + "source_ids": [ + 52 + ] + }, + { + "entity_name": "images", + "entity_type": "PRODUCT", + "description": "images are a type of content block serving as a leaf node within the document structure", + "source_ids": [ + 52 + ] + }, + { + "entity_name": "section nodes", + "entity_type": "PRODUCT", + "description": "section nodes are hierarchical nodes within the document structure that contain content blocks", + "source_ids": [ + 52 + ] + }, + { + "entity_name": "document", + "entity_type": "PRODUCT", + "description": "document is the object being organized into a hierarchical structure by the tree component", + "source_ids": [ + 52 + ] + }, + { + "entity_name": "content blocks", + "entity_type": "PRODUCT", + "description": "content blocks are the items text tables images that serve as leaf nodes in the hierarchy", + "source_ids": [ + 52 + ] + }, + { + "entity_name": "leaf nodes", + "entity_type": "PRODUCT", + "description": "leaf nodes are the terminal elements in the hierarchical structure containing content blocks", + "source_ids": [ + 52 + ] + }, + { + "entity_name": "semantic entities", + "entity_type": "CONCEPT", + "description": "semantic entities are the extracted entities grounded within the document s logical hierarchy by gt link", + "source_ids": [ + 52 + ] + }, + { + "entity_name": "logical hierarchy", + "entity_type": "CONCEPT", + "description": "logical hierarchy is the structure within the document that grounds the semantic entities", + "source_ids": [ + 52 + ] + } + ], + "relations": [ + { + "src_entity_name": "figure 2", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 10.0, + "description": "figure 2 
provides an example of the bookindex", + "source_ids": [ + 52 + ] + }, + { + "src_entity_name": "tree component", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "the tree component is a part of the bookindex", + "source_ids": [ + 52 + ] + }, + { + "src_entity_name": "graph component", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "the graph component is a part of the bookindex", + "source_ids": [ + 52 + ] + }, + { + "src_entity_name": "gt link", + "tgt_entity_name": "graph component", + "relation_name": "", + "weight": 8.0, + "description": "gt link is a feature within the graph component that connects entities to tree nodes", + "source_ids": [ + 52 + ] + }, + { + "src_entity_name": "tree component", + "tgt_entity_name": "section nodes", + "relation_name": "", + "weight": 8.0, + "description": "the tree component organizes content blocks within section nodes", + "source_ids": [ + 52 + ] + }, + { + "src_entity_name": "text", + "tgt_entity_name": "section nodes", + "relation_name": "", + "weight": 7.0, + "description": "text serves as a leaf node nested within section nodes", + "source_ids": [ + 52 + ] + }, + { + "src_entity_name": "tables", + "tgt_entity_name": "section nodes", + "relation_name": "", + "weight": 7.0, + "description": "tables serve as a leaf node nested within section nodes", + "source_ids": [ + 52 + ] + }, + { + "src_entity_name": "images", + "tgt_entity_name": "section nodes", + "relation_name": "", + "weight": 7.0, + "description": "images serve as a leaf node nested within section nodes", + "source_ids": [ + 52 + ] + }, + { + "src_entity_name": "gt link", + "tgt_entity_name": "tree component", + "relation_name": "", + "weight": 8.0, + "description": "gt link connects entities back to their corresponding tree nodes", + "source_ids": [ + 52 + ] + }, + { + "src_entity_name": "tree component", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 
9.0, + "description": "the tree component organizes the document into a hierarchical structure", + "source_ids": [ + 52 + ] + }, + { + "src_entity_name": "content blocks", + "tgt_entity_name": "leaf nodes", + "relation_name": "", + "weight": 9.0, + "description": "content blocks serve as leaf nodes within the structure", + "source_ids": [ + 52 + ] + }, + { + "src_entity_name": "text", + "tgt_entity_name": "content blocks", + "relation_name": "", + "weight": 10.0, + "description": "text is identified as a type of content block", + "source_ids": [ + 52 + ] + }, + { + "src_entity_name": "tables", + "tgt_entity_name": "content blocks", + "relation_name": "", + "weight": 10.0, + "description": "tables are identified as a type of content block", + "source_ids": [ + 52 + ] + }, + { + "src_entity_name": "images", + "tgt_entity_name": "content blocks", + "relation_name": "", + "weight": 10.0, + "description": "images are identified as a type of content block", + "source_ids": [ + 52 + ] + }, + { + "src_entity_name": "gt link", + "tgt_entity_name": "semantic entities", + "relation_name": "", + "weight": 9.0, + "description": "gt link explicitly connects semantic entities back to their corresponding tree nodes", + "source_ids": [ + 52 + ] + }, + { + "src_entity_name": "semantic entities", + "tgt_entity_name": "logical hierarchy", + "relation_name": "", + "weight": 8.0, + "description": "semantic entities are grounded within the document s logical hierarchy", + "source_ids": [ + 52 + ] + }, + { + "src_entity_name": "graph component", + "tgt_entity_name": "semantic entities", + "relation_name": "", + "weight": 8.0, + "description": "the graph component is composed of entities and relations extracted from nodes which include semantic entities", + "source_ids": [ + 52 + ] + } + ], + "node_idx": 52 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_53.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_53.json new file mode 100644 
index 0000000..9d9294d --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_53.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "4.2 tree construction", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'BookIndex' and the first stage of its construction process, this section details the method for parsing document layouts to establish hierarchical nodes categorized by type.", + "source_ids": [ + 53 + ] + }, + { + "entity_name": "tree construction", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The specific sequential process described in section 4.2 that parses document layout to create hierarchical nodes.", + "source_ids": [ + 53 + ] + } + ], + "relations": [ + { + "src_entity_name": "tree construction", + "tgt_entity_name": "4.2 tree construction", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Tree Construction' is the primary topic and methodology detailed in section 4.2.", + "source_ids": [ + 53 + ] + } + ], + "node_idx": 53 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_54.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_54.json new file mode 100644 index 0000000..84e487c --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_54.json @@ -0,0 +1,97 @@ +{ + "entities": [ + { + "entity_name": "t", + "entity_type": "TASK_OR_PROBLEM", + "description": "t is a structured hierarchical tree that is the result of transforming a raw document", + "source_ids": [ + 54 + ] + }, + { + "entity_name": "raw document", + "entity_type": "PRODUCT", + "description": "raw document is the initial input that undergoes transformation into a structured hierarchical tree", + "source_ids": [ + 54 + ] + }, + { + "entity_name": "robust layout parsing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "robust layout parsing is a key step involved in transforming the raw document into a structured hierarchical tree", + 
"source_ids": [ + 54 + ] + }, + { + "entity_name": "intelligent section filtering", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "intelligent section filtering is a key step involved in transforming the raw document into a structured hierarchical tree", + "source_ids": [ + 54 + ] + }, + { + "entity_name": "task or problem", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 54 + ] + } + ], + "relations": [ + { + "src_entity_name": "t", + "tgt_entity_name": "task or problem", + "relation_name": "", + "weight": 5.0, + "description": "t represents the structured hierarchical tree which is the outcome of the transformation task described", + "source_ids": [ + 54 + ] + }, + { + "src_entity_name": "robust layout parsing", + "tgt_entity_name": "t", + "relation_name": "", + "weight": 9.0, + "description": "robust layout parsing is a step used to create the structured hierarchical tree t", + "source_ids": [ + 54 + ] + }, + { + "src_entity_name": "intelligent section filtering", + "tgt_entity_name": "t", + "relation_name": "", + "weight": 9.0, + "description": "intelligent section filtering is a step used to create the structured hierarchical tree t", + "source_ids": [ + 54 + ] + }, + { + "src_entity_name": "raw document", + "tgt_entity_name": "robust layout parsing", + "relation_name": "", + "weight": 8.0, + "description": "raw document is the input processed by the robust layout parsing step", + "source_ids": [ + 54 + ] + }, + { + "src_entity_name": "raw document", + "tgt_entity_name": "intelligent section filtering", + "relation_name": "", + "weight": 8.0, + "description": "raw document is the input processed by the intelligent section filtering step", + "source_ids": [ + 54 + ] + } + ], + "node_idx": 54 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_55.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_55.json new file mode 100644 index 0000000..4045a24 --- /dev/null +++ 
b/e2e_test_output/kg_extractor_res/kg_extractor_res_55.json @@ -0,0 +1,87 @@ +{ + "entities": [ + { + "entity_name": "4.2.1 layout parsing", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Tree Construction' within the 'BOOKINDEX' chapter, this section details the initial phase of transforming raw documents into structured hierarchical trees using layout analysis and recognition models to identify and organize diverse content blocks.", + "source_ids": [ + 55 + ] + }, + { + "entity_name": "layout analysis", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A specific technique employed in section 4.2.1 to understand the spatial arrangement of elements within document pages.", + "source_ids": [ + 55 + ] + }, + { + "entity_name": "recognition models", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "The computational models utilized in section 4.2.1 to recognize and classify different types of content blocks such as text, tables, and images.", + "source_ids": [ + 55 + ] + }, + { + "entity_name": "document d", + "entity_type": "TASK_OR_PROBLEM", + "description": "The input data object (a collection of pages) that serves as the target for processing in section 4.2.1.", + "source_ids": [ + 55 + ] + }, + { + "entity_name": "content blocks", + "entity_type": "DATASET_OR_CORPUS", + "description": "The diverse structural units (e.g., text, tables, images) identified and extracted from the document pages as described in section 4.2.1.", + "source_ids": [ + 55 + ] + } + ], + "relations": [ + { + "src_entity_name": "layout analysis", + "tgt_entity_name": "4.2.1 layout parsing", + "relation_name": "", + "weight": 9.5, + "description": "Layout Analysis is a core methodological component discussed within section 4.2.1.", + "source_ids": [ + 55 + ] + }, + { + "src_entity_name": "recognition models", + "tgt_entity_name": "4.2.1 layout parsing", + "relation_name": "", + "weight": 9.5, + "description": "Recognition Models are the primary 
tools used in the process detailed in section 4.2.1.", + "source_ids": [ + 55 + ] + }, + { + "src_entity_name": "document d", + "tgt_entity_name": "4.2.1 layout parsing", + "relation_name": "", + "weight": 10.0, + "description": "Document D is the specific input entity being processed in section 4.2.1.", + "source_ids": [ + 55 + ] + }, + { + "src_entity_name": "content blocks", + "tgt_entity_name": "4.2.1 layout parsing", + "relation_name": "", + "weight": 10.0, + "description": "Content Blocks represent the output entities identified and organized within section 4.2.1.", + "source_ids": [ + 55 + ] + } + ], + "node_idx": 55 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_56.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_56.json new file mode 100644 index 0000000..30c98a2 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_56.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "the output", + "entity_type": "TASK_OR_PROBLEM", + "description": "the output is described as a sequence of primitive", + "source_ids": [ + 56 + ] + }, + { + "entity_name": "primitive", + "entity_type": "CONCEPT", + "description": "primitive is a term used to describe the components of the output sequence", + "source_ids": [ + 56 + ] + } + ], + "relations": [ + { + "src_entity_name": "the output", + "tgt_entity_name": "primitive", + "relation_name": "", + "weight": 9.0, + "description": "the output consists of a sequence of primitives", + "source_ids": [ + 56 + ] + } + ], + "node_idx": 56 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_57.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_57.json new file mode 100644 index 0000000..40377d7 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_57.json @@ -0,0 +1,315 @@ +{ + "entities": [ + { + "entity_name": "section filtering", + "entity_type": "TASK_OR_PROBLEM", + "description": "section 
filtering is a phase that processes an initial sequence to identify a document s logically hierarchical structure", + "source_ids": [ + 57 + ] + }, + { + "entity_name": "layout parsing", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "layout parsing is a method that identifies blocks as title but does not assign their hierarchical level", + "source_ids": [ + 57 + ] + }, + { + "entity_name": "llm", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "llm is a model used for analysis to determine hierarchical levels and node types of document candidates", + "source_ids": [ + 57 + ] + }, + { + "entity_name": "title", + "entity_type": "SECTION_TITLE", + "description": "title refers to blocks identified by layout parsing that require hierarchical level assignment", + "source_ids": [ + 57 + ] + }, + { + "entity_name": "text", + "entity_type": "SECTION_TITLE", + "description": "text is a node type used to re classify erroneous title blocks such as descriptive text within images", + "source_ids": [ + 57 + ] + }, + { + "entity_name": "image", + "entity_type": "IMAGE", + "description": "image refers to a location within a document where descriptive text might be erroneously parsed as a title", + "source_ids": [ + 57 + ] + }, + { + "entity_name": "table", + "entity_type": "TABLE", + "description": "table refers to a document element specifically borderless table headers that might be erroneously parsed as a title", + "source_ids": [ + 57 + ] + }, + { + "entity_name": "b", + "entity_type": "DATASET_OR_CORPUS", + "description": "b represents the candidate subset of blocks selected for llm based analysis", + "source_ids": [ + 57 + ] + }, + { + "entity_name": "c", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "c represents the content of the candidates analyzed by the llm", + "source_ids": [ + 57 + ] + }, + { + "entity_name": "f", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "f represents the layout features of the candidates analyzed 
by the llm", + "source_ids": [ + 57 + ] + }, + { + "entity_name": "4 2 2", + "entity_type": "SECTION_TITLE", + "description": "4 2 2 is the section identifier for the section filtering phase", + "source_ids": [ + 57 + ] + }, + { + "entity_name": "b title", + "entity_type": "DATASET_OR_CORPUS", + "description": "b title is a candidate subset of blocks where the type is title selected for llm based analysis", + "source_ids": [ + 57 + ] + }, + { + "entity_name": "l", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "l represents the actual hierarchical level of a block ranging from 1 to infinity", + "source_ids": [ + 57 + ] + }, + { + "entity_name": "1", + "entity_type": "MEASUREMENT", + "description": "1 is the value assigned to the root level in the hierarchical structure", + "source_ids": [ + 57 + ] + }, + { + "entity_name": "none", + "entity_type": "SECTION_TITLE", + "description": "none is a value indicating that a block has no hierarchical level", + "source_ids": [ + 57 + ] + }, + { + "entity_name": "", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 57 + ] + } + ], + "relations": [ + { + "src_entity_name": "section filtering", + "tgt_entity_name": "layout parsing", + "relation_name": "", + "weight": 9.0, + "description": "section filtering processes the output of layout parsing to identify hierarchical structure", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "section filtering", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 10.0, + "description": "section filtering utilizes an llm to analyze content and layout features of candidates", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "layout parsing", + "tgt_entity_name": "title", + "relation_name": "", + "weight": 8.0, + "description": "layout parsing identifies blocks as title", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "title", + "relation_name": "", + "weight": 9.0, + "description": "llm analyzes 
title candidates to determine their actual hierarchical level and final node type", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "text", + "relation_name": "", + "weight": 8.0, + "description": "llm may re classify erroneous title blocks as text", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "section filtering", + "tgt_entity_name": "image", + "relation_name": "", + "weight": 7.0, + "description": "section filtering aims to correct blocks erroneously parsed as title such as descriptive text within images", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "section filtering", + "tgt_entity_name": "table", + "relation_name": "", + "weight": 7.0, + "description": "section filtering aims to correct blocks erroneously parsed as title such as borderless table headers", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "section filtering", + "tgt_entity_name": "b", + "relation_name": "", + "weight": 9.0, + "description": "section filtering selects the candidate subset b for analysis", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "l", + "relation_name": "", + "weight": 8.0, + "description": "llm determines the hierarchical level l for each candidate", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "c", + "relation_name": "", + "weight": 8.0, + "description": "llm analyzes the content c of the candidates", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "f", + "relation_name": "", + "weight": 8.0, + "description": "llm analyzes the layout features f of the candidates", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "section filtering", + "tgt_entity_name": "b title", + "relation_name": "", + "weight": 9.0, + "description": "section filtering selects the candidate subset b title for analysis", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "b 
title", + "relation_name": "", + "weight": 9.0, + "description": "the llm analyzes the candidate subset b title to determine properties", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "layout parsing", + "tgt_entity_name": "b title", + "relation_name": "", + "weight": 8.0, + "description": "layout parsing identifies blocks as title forming the subset b title", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "", + "relation_name": "", + "weight": 7.0, + "description": "the llm uses to identify blocks as title candidates", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "l", + "tgt_entity_name": "1", + "relation_name": "", + "weight": 8.0, + "description": "the parameter l uses 1 to represent the root level", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "", + "tgt_entity_name": "none", + "relation_name": "", + "weight": 7.0, + "description": "the final node type can be assigned the value none if a block has no level", + "source_ids": [ + 57 + ] + }, + { + "src_entity_name": "section filtering", + "tgt_entity_name": "text", + "relation_name": "", + "weight": 8.0, + "description": "section filtering corrects blocks erroneously parsed as title by re classifying them as text", + "source_ids": [ + 57 + ] + } + ], + "node_idx": 57 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_58.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_58.json new file mode 100644 index 0000000..e004940 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_58.json @@ -0,0 +1,267 @@ +{ + "entities": [ + { + "entity_name": "tree", + "entity_type": "TASK_OR_PROBLEM", + "description": "the tree is a definitive structure constructed from blocks consisting of nodes and edges representing content and relationships", + "source_ids": [ + 58 + ] + }, + { + "entity_name": "node set", + "entity_type": "TASK_OR_PROBLEM", + "description": "the node set is composed of all 
blocks from the filtering and re classification process retaining content and final node types", + "source_ids": [ + 58 + ] + }, + { + "entity_name": "edge set", + "entity_type": "TASK_OR_PROBLEM", + "description": "the edge set represents the parent child nesting relationships within the tree structure", + "source_ids": [ + 58 + ] + }, + { + "entity_name": "text", + "entity_type": "PRODUCT", + "description": "text is identified as a final node type retained within the nodes of the tree", + "source_ids": [ + 58 + ] + }, + { + "entity_name": "section", + "entity_type": "PRODUCT", + "description": "section is identified as a final node type retained within the nodes of the tree", + "source_ids": [ + 58 + ] + }, + { + "entity_name": "table", + "entity_type": "PRODUCT", + "description": "table is identified as a final node type retained within the nodes of the tree", + "source_ids": [ + 58 + ] + }, + { + "entity_name": "image", + "entity_type": "PRODUCT", + "description": "image is identified as a final node type retained within the nodes of the tree", + "source_ids": [ + 58 + ] + }, + { + "entity_name": "filtering", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "filtering is a process mentioned as part of the generation of blocks for the node set", + "source_ids": [ + 58 + ] + }, + { + "entity_name": "re classification", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "re classification is a process mentioned alongside filtering in the creation of the node set", + "source_ids": [ + 58 + ] + }, + { + "entity_name": "hierarchical levels", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "hierarchical levels are determined values used to infer parent child relationships for section nodes", + "source_ids": [ + 58 + ] + }, + { + "entity_name": "document order", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "document order is a sequential arrangement used to assemble the complete tree structure", + "source_ids": [ + 58 + ] + }, + { 
+ "entity_name": "parent child nesting relationships", + "entity_type": "TASK_OR_PROBLEM", + "description": "parent child nesting relationships are the specific connections established by the edge set", + "source_ids": [ + 58 + ] + }, + { + "entity_name": "content", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "content is an attribute retained by each node in the node set", + "source_ids": [ + 58 + ] + }, + { + "entity_name": "final node type", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "final node type is an attribute retained by each node in the node set", + "source_ids": [ + 58 + ] + }, + { + "entity_name": "node", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 58 + ] + } + ], + "relations": [ + { + "src_entity_name": "tree", + "tgt_entity_name": "node set", + "relation_name": "", + "weight": 9.0, + "description": "the tree is constructed using the node set which contains all blocks from the filtering process", + "source_ids": [ + 58 + ] + }, + { + "src_entity_name": "tree", + "tgt_entity_name": "edge set", + "relation_name": "", + "weight": 9.0, + "description": "the tree includes the edge set which establishes parent child nesting relationships", + "source_ids": [ + 58 + ] + }, + { + "src_entity_name": "node set", + "tgt_entity_name": "text", + "relation_name": "", + "weight": 8.0, + "description": "the node set retains nodes of the type text", + "source_ids": [ + 58 + ] + }, + { + "src_entity_name": "node set", + "tgt_entity_name": "section", + "relation_name": "", + "weight": 8.0, + "description": "the node set retains nodes of the type section", + "source_ids": [ + 58 + ] + }, + { + "src_entity_name": "node set", + "tgt_entity_name": "table", + "relation_name": "", + "weight": 8.0, + "description": "the node set retains nodes of the type table", + "source_ids": [ + 58 + ] + }, + { + "src_entity_name": "node set", + "tgt_entity_name": "image", + "relation_name": "", + "weight": 8.0, + "description": "the 
node set retains nodes of the type image", + "source_ids": [ + 58 + ] + }, + { + "src_entity_name": "edge set", + "tgt_entity_name": "tree", + "relation_name": "", + "weight": 9.0, + "description": "the edge set is established to define the structure of the tree", + "source_ids": [ + 58 + ] + }, + { + "src_entity_name": "node set", + "tgt_entity_name": "filtering", + "relation_name": "", + "weight": 9.0, + "description": "the node set is composed of blocks resulting from the filtering process", + "source_ids": [ + 58 + ] + }, + { + "src_entity_name": "node set", + "tgt_entity_name": "re classification", + "relation_name": "", + "weight": 9.0, + "description": "the node set is composed of blocks resulting from the re classification process", + "source_ids": [ + 58 + ] + }, + { + "src_entity_name": "edge set", + "tgt_entity_name": "parent child nesting relationships", + "relation_name": "", + "weight": 10.0, + "description": "the edge set represents the parent child nesting relationships", + "source_ids": [ + 58 + ] + }, + { + "src_entity_name": "tree", + "tgt_entity_name": "hierarchical levels", + "relation_name": "", + "weight": 8.0, + "description": "hierarchical levels are used to infer relationships within the tree structure", + "source_ids": [ + 58 + ] + }, + { + "src_entity_name": "tree", + "tgt_entity_name": "document order", + "relation_name": "", + "weight": 8.0, + "description": "document order is used to assemble the complete tree structure", + "source_ids": [ + 58 + ] + }, + { + "src_entity_name": "node", + "tgt_entity_name": "content", + "relation_name": "", + "weight": 9.0, + "description": "each node retains its content", + "source_ids": [ + 58 + ] + }, + { + "src_entity_name": "node", + "tgt_entity_name": "final node type", + "relation_name": "", + "weight": 9.0, + "description": "each node retains its final node type", + "source_ids": [ + 58 + ] + } + ], + "node_idx": 58 +} \ No newline at end of file diff --git 
a/e2e_test_output/kg_extractor_res/kg_extractor_res_59.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_59.json new file mode 100644 index 0000000..b651214 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_59.json @@ -0,0 +1,419 @@ +{ + "entities": [ + { + "entity_name": "figure 2", + "entity_type": "IMAGE", + "description": "figure 2 is an example shown in the text that illustrates the layout parsing phase", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "layout parsing phase", + "entity_type": "TASK_OR_PROBLEM", + "description": "layout parsing phase is a process that identifies diverse blocks in a document", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "title text table", + "entity_type": "PRODUCT", + "description": "title text table is a type of block identified during the layout parsing phase", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "image", + "entity_type": "IMAGE", + "description": "image is a type of block identified during the layout parsing phase", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "section filtering phase", + "entity_type": "TASK_OR_PROBLEM", + "description": "section filtering phase is a process where title candidates are analyzed by the llm", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "llm", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "llm is a model used to analyze title candidates and re classify blocks", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "method", + "entity_type": "SECTION_TITLE", + "description": "method is a title candidate analyzed during the section filtering phase", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "experiment", + "entity_type": "SECTION_TITLE", + "description": "experiment is a title candidate analyzed during the section filtering phase", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "moe layer", + "entity_type": "SECTION_TITLE", + "description": "moe layer is a title candidate that was erroneously 
tagged as a title but re classified as a text node", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "section nodes", + "entity_type": "SECTION_TITLE", + "description": "section nodes are blocks identified as having a specific level in the document hierarchy", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "text node", + "entity_type": "SECTION_TITLE", + "description": "text node is a classification for blocks that do not have a specific level in the document hierarchy", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "final tree structure", + "entity_type": "TASK_OR_PROBLEM", + "description": "final tree structure is the result of assembling filtered and classified nodes based on their levels and order", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "fontsize", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "fontsize is a parameter used to describe the size of text blocks such as 14 or 20", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "14", + "entity_type": "MEASUREMENT", + "description": "14 is the specific font size value associated with the method and experiment blocks", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "20", + "entity_type": "MEASUREMENT", + "description": "20 is the specific font size value associated with the moe layer block", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "level", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "level is a parameter used to define the hierarchy depth of document nodes", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "2", + "entity_type": "MEASUREMENT", + "description": "2 is the specific level value assigned to the method and experiment blocks", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "none", + "entity_type": "MEASUREMENT", + "description": "none is the specific level value assigned to the moe layer block indicating no hierarchy level", + "source_ids": [ + 59 + ] + }, + { + "entity_name": "document order", + "entity_type": 
"PARAMETER_OR_VARIABLE", + "description": "document order is a factor used to assemble nodes into the final tree structure", + "source_ids": [ + 59 + ] + } + ], + "relations": [ + { + "src_entity_name": "layout parsing phase", + "tgt_entity_name": "figure 2", + "relation_name": "", + "weight": 9.0, + "description": "figure 2 serves as an example for the layout parsing phase", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "layout parsing phase", + "tgt_entity_name": "title text table", + "relation_name": "", + "weight": 8.0, + "description": "the layout parsing phase identifies title text table as a type of block", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "layout parsing phase", + "tgt_entity_name": "image", + "relation_name": "", + "weight": 8.0, + "description": "the layout parsing phase identifies image as a type of block", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "section filtering phase", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 10.0, + "description": "the section filtering phase uses the llm to analyze title candidates", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "section filtering phase", + "tgt_entity_name": "method", + "relation_name": "", + "weight": 9.0, + "description": "the section filtering phase analyzes method as a title candidate", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "section filtering phase", + "tgt_entity_name": "experiment", + "relation_name": "", + "weight": 9.0, + "description": "the section filtering phase analyzes experiment as a title candidate", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "section filtering phase", + "tgt_entity_name": "moe layer", + "relation_name": "", + "weight": 9.0, + "description": "the section filtering phase analyzes moe layer which was erroneously tagged and re classified", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "method", + "relation_name": "", + "weight": 
8.0, + "description": "the llm correctly identifies method as a section node", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "experiment", + "relation_name": "", + "weight": 8.0, + "description": "the llm correctly identifies experiment as a section node", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "moe layer", + "relation_name": "", + "weight": 9.0, + "description": "the llm re classifies moe layer from a title to a text node", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "section filtering phase", + "tgt_entity_name": "final tree structure", + "relation_name": "", + "weight": 7.0, + "description": "the section filtering phase contributes to the creation of the final tree structure", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "layout parsing phase", + "tgt_entity_name": "section filtering phase", + "relation_name": "", + "weight": 6.0, + "description": "the layout parsing phase precedes the section filtering phase in the document processing workflow", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "method", + "tgt_entity_name": "fontsize", + "relation_name": "", + "weight": 10.0, + "description": "the method block has a fontsize of 14", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "method", + "tgt_entity_name": "14", + "relation_name": "", + "weight": 10.0, + "description": "the method block is associated with the measurement value 14", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "method", + "tgt_entity_name": "level", + "relation_name": "", + "weight": 10.0, + "description": "the method block is identified as having a level of 2", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "method", + "tgt_entity_name": "2", + "relation_name": "", + "weight": 10.0, + "description": "the method block is associated with the measurement value 2", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "experiment", + 
"tgt_entity_name": "fontsize", + "relation_name": "", + "weight": 10.0, + "description": "the experiment block has a fontsize of 14", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "experiment", + "tgt_entity_name": "14", + "relation_name": "", + "weight": 10.0, + "description": "the experiment block is associated with the measurement value 14", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "experiment", + "tgt_entity_name": "level", + "relation_name": "", + "weight": 10.0, + "description": "the experiment block is identified as having a level of 2", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "experiment", + "tgt_entity_name": "2", + "relation_name": "", + "weight": 10.0, + "description": "the experiment block is associated with the measurement value 2", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "moe layer", + "tgt_entity_name": "fontsize", + "relation_name": "", + "weight": 10.0, + "description": "the moe layer block has a fontsize of 20", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "moe layer", + "tgt_entity_name": "20", + "relation_name": "", + "weight": 10.0, + "description": "the moe layer block is associated with the measurement value 20", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "moe layer", + "tgt_entity_name": "level", + "relation_name": "", + "weight": 10.0, + "description": "the moe layer block is identified as having a level of none", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "moe layer", + "tgt_entity_name": "none", + "relation_name": "", + "weight": 10.0, + "description": "the moe layer block is associated with the measurement value none", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "final tree structure", + "tgt_entity_name": "document order", + "relation_name": "", + "weight": 9.0, + "description": "the final tree structure is assembled based on the document order of the nodes", + "source_ids": [ + 59 + ] + }, + { + "src_entity_name": "final 
tree structure", + "tgt_entity_name": "level", + "relation_name": "", + "weight": 9.0, + "description": "the final tree structure is assembled based on the determined levels of the nodes", + "source_ids": [ + 59 + ] + } + ], + "node_idx": 59 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_6.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_6.json new file mode 100644 index 0000000..86148d1 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_6.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "pvldb", + "entity_type": "PUBLICATION_VENUE", + "description": "pvldb is a publication venue mentioned in the context of artifact availability", + "source_ids": [ + 6 + ] + }, + { + "entity_name": "artifact availability", + "entity_type": "TASK_OR_PROBLEM", + "description": "artifact availability refers to the status or process of making artifacts available as discussed in the text", + "source_ids": [ + 6 + ] + } + ], + "relations": [ + { + "src_entity_name": "pvldb", + "tgt_entity_name": "artifact availability", + "relation_name": "", + "weight": 8.0, + "description": "pvldb is the venue where the topic of artifact availability is addressed", + "source_ids": [ + 6 + ] + } + ], + "node_idx": 6 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_60.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_60.json new file mode 100644 index 0000000..7a0fb82 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_60.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "4", + "entity_type": "MEASUREMENT", + "description": "4 is a numerical value mentioned in the text though its specific context or unit is not provided", + "source_ids": [ + 60 + ] + } + ], + "relations": [], + "node_idx": 60 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_61.json 
b/e2e_test_output/kg_extractor_res/kg_extractor_res_61.json new file mode 100644 index 0000000..cb7e8c7 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_61.json @@ -0,0 +1,51 @@ +{ + "entities": [ + { + "entity_name": "4.3 graph construction", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'BOOKINDEX' and the second stage of the proposed BookIndex construction process, this section details the method for extracting fine-grained entity knowledge from hierarchical tree nodes and refining it using a novel gradient-based entity resolution technique.", + "source_ids": [ + 61 + ] + }, + { + "entity_name": "graph construction", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The specific two-stage process step that follows Tree Construction, focusing on extracting entity knowledge and performing resolution within the BookIndex framework.", + "source_ids": [ + 61 + ] + }, + { + "entity_name": "gradient-based entity resolution", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A novel algorithmic method described in this section used to refine extracted entity knowledge by resolving ambiguities or duplicates based on gradient optimization.", + "source_ids": [ + 61 + ] + } + ], + "relations": [ + { + "src_entity_name": "graph construction", + "tgt_entity_name": "4.3 graph construction", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Graph Construction' is the primary topic and subject matter of section 4.3.", + "source_ids": [ + 61 + ] + }, + { + "src_entity_name": "gradient-based entity resolution", + "tgt_entity_name": "4.3 graph construction", + "relation_name": "", + "weight": 9.5, + "description": "The method 'Gradient-based Entity Resolution' is a key component and technique detailed within section 4.3.", + "source_ids": [ + 61 + ] + } + ], + "node_idx": 61 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_62.json 
b/e2e_test_output/kg_extractor_res/kg_extractor_res_62.json new file mode 100644 index 0000000..b1bdc88 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_62.json @@ -0,0 +1,61 @@ +{ + "entities": [ + { + "entity_name": "tree t", + "entity_type": "TASK_OR_PROBLEM", + "description": "tree t is a structure that is established before proceeding to the next step", + "source_ids": [ + 62 + ] + }, + { + "entity_name": "knowledge graph g", + "entity_type": "TASK_OR_PROBLEM", + "description": "knowledge graph g is a structure that is populated by extracting and refining entities from the tree nodes", + "source_ids": [ + 62 + ] + }, + { + "entity_name": "tree nodes", + "entity_type": "TASK_OR_PROBLEM", + "description": "tree nodes are the components within tree t from which entities are extracted and refined", + "source_ids": [ + 62 + ] + } + ], + "relations": [ + { + "src_entity_name": "tree t", + "tgt_entity_name": "knowledge graph g", + "relation_name": "", + "weight": 9.0, + "description": "tree t is the source from which entities are extracted to populate knowledge graph g", + "source_ids": [ + 62 + ] + }, + { + "src_entity_name": "tree t", + "tgt_entity_name": "tree nodes", + "relation_name": "", + "weight": 10.0, + "description": "tree nodes are the constituent parts of tree t that serve as the source for entity extraction", + "source_ids": [ + 62 + ] + }, + { + "src_entity_name": "knowledge graph g", + "tgt_entity_name": "tree nodes", + "relation_name": "", + "weight": 9.0, + "description": "knowledge graph g is populated by extracting and refining entities from tree nodes", + "source_ids": [ + 62 + ] + } + ], + "node_idx": 62 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_63.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_63.json new file mode 100644 index 0000000..e80a7b7 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_63.json @@ -0,0 +1,123 @@ +{ + "entities": [ + { + 
"entity_name": "4.3.1 kg construction", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Graph Construction' within the 'BOOKINDEX' chapter, this section details the specific algorithm for populating the Knowledge Graph by iterating through tree nodes and extracting subgraphs based on content modality (text or visual).", + "source_ids": [ + 63 + ] + }, + { + "entity_name": "knowledge graph", + "entity_type": "DATASET_OR_CORPUS", + "description": "The structured data repository being constructed in this section, populated by extracting entities and relations from document tree nodes.", + "source_ids": [ + 63 + ] + }, + { + "entity_name": "tree t", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "The hierarchical structure previously established that serves as the source of nodes to be processed for graph construction.", + "source_ids": [ + 63 + ] + }, + { + "entity_name": "llm", + "entity_type": "SOFTWARE", + "description": "Large Language Model used as a tool to extract entities and relations when processing text-only nodes.", + "source_ids": [ + 63 + ] + }, + { + "entity_name": "vision language model", + "entity_type": "SOFTWARE", + "description": "VLM employed specifically to extract visual knowledge from nodes containing image elements.", + "source_ids": [ + 63 + ] + }, + { + "entity_name": "image", + "entity_type": "IMAGE", + "description": "A specific node type indicating the presence of visual elements requiring VLM-based extraction.", + "source_ids": [ + 63 + ] + }, + { + "entity_name": "mapping m", + "entity_type": "EQUATION_OR_FORMULA", + "description": "The final mapping structure constructed by recording the origin tree node for every extracted entity.", + "source_ids": [ + 63 + ] + } + ], + "relations": [ + { + "src_entity_name": "knowledge graph", + "tgt_entity_name": "4.3.1 kg construction", + "relation_name": "", + "weight": 10.0, + "description": "The Knowledge Graph is the primary object being constructed in this 
section.", + "source_ids": [ + 63 + ] + }, + { + "src_entity_name": "tree t", + "tgt_entity_name": "4.3.1 kg construction", + "relation_name": "", + "weight": 9.5, + "description": "The Tree T provides the input nodes that are iterated over during the construction process.", + "source_ids": [ + 63 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "4.3.1 kg construction", + "relation_name": "", + "weight": 9.0, + "description": "The LLM is the method/tool utilized for extracting data from text-only nodes within this section.", + "source_ids": [ + 63 + ] + }, + { + "src_entity_name": "vision language model", + "tgt_entity_name": "4.3.1 kg construction", + "relation_name": "", + "weight": 9.0, + "description": "The Vision Language Model is the method/tool utilized for extracting data from visual nodes within this section.", + "source_ids": [ + 63 + ] + }, + { + "src_entity_name": "image", + "tgt_entity_name": "4.3.1 kg construction", + "relation_name": "", + "weight": 8.5, + "description": "The Image node type triggers the use of the Vision Language Model in this section's logic.", + "source_ids": [ + 63 + ] + }, + { + "src_entity_name": "mapping m", + "tgt_entity_name": "4.3.1 kg construction", + "relation_name": "", + "weight": 9.5, + "description": "The Mapping M is the critical output artifact generated by recording entity origins in this section.", + "source_ids": [ + 63 + ] + } + ], + "node_idx": 63 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_64.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_64.json new file mode 100644 index 0000000..e9fc75c --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_64.json @@ -0,0 +1,185 @@ +{ + "entities": [ + { + "entity_name": "table", + "entity_type": "PRODUCT", + "description": "table is a specific logical type mentioned in the text that requires preservation of structural semantics", + "source_ids": [ + 64 + ] + }, + { + "entity_name": 
"formula", + "entity_type": "PRODUCT", + "description": "formula is a specific logical type mentioned in the text that requires preservation of structural semantics", + "source_ids": [ + 64 + ] + }, + { + "entity_name": "v table", + "entity_type": "PRODUCT", + "description": "v table is a distinct typed entity representing the table itself created to preserve structural semantics", + "source_ids": [ + 64 + ] + }, + { + "entity_name": "row", + "entity_type": "PRODUCT", + "description": "row is a component of table nodes that is explicitly extracted as a distinct entity", + "source_ids": [ + 64 + ] + }, + { + "entity_name": "column", + "entity_type": "PRODUCT", + "description": "column is a component of table nodes that is explicitly extracted as a distinct entity", + "source_ids": [ + 64 + ] + }, + { + "entity_name": "header", + "entity_type": "PRODUCT", + "description": "header refers to row and column headers in table nodes that are explicitly extracted as distinct entities", + "source_ids": [ + 64 + ] + }, + { + "entity_name": "structural semantics", + "entity_type": "CONCEPT", + "description": "structural semantics refers to the meaning preserved for specific logical types in the described process", + "source_ids": [ + 64 + ] + }, + { + "entity_name": "logical types", + "entity_type": "CONCEPT", + "description": "logical types are categories of entities such as table and formula that require specific handling", + "source_ids": [ + 64 + ] + }, + { + "entity_name": "node", + "entity_type": "CONCEPT", + "description": "node refers to a specific point in the data structure where content is extracted", + "source_ids": [ + 64 + ] + }, + { + "entity_name": "vertex", + "entity_type": "CONCEPT", + "description": "vertex refers to the primary node v table to which other entities are linked", + "source_ids": [ + 64 + ] + }, + { + "entity_name": "containedin", + "entity_type": "RELATIONSHIP_TYPE", + "description": "containedin is the specific relationship type used to link 
row and column headers to the table entity", + "source_ids": [ + 64 + ] + } + ], + "relations": [ + { + "src_entity_name": "v table", + "tgt_entity_name": "table", + "relation_name": "", + "weight": 10.0, + "description": "v table is the distinct entity created to represent the table logical type", + "source_ids": [ + 64 + ] + }, + { + "src_entity_name": "row", + "tgt_entity_name": "v table", + "relation_name": "", + "weight": 9.0, + "description": "row headers are linked to v table via a containedin relationship", + "source_ids": [ + 64 + ] + }, + { + "src_entity_name": "column", + "tgt_entity_name": "v table", + "relation_name": "", + "weight": 9.0, + "description": "column headers are linked to v table via a containedin relationship", + "source_ids": [ + 64 + ] + }, + { + "src_entity_name": "header", + "tgt_entity_name": "v table", + "relation_name": "", + "weight": 9.0, + "description": "row and column headers are explicitly extracted and linked to v table", + "source_ids": [ + 64 + ] + }, + { + "src_entity_name": "structural semantics", + "tgt_entity_name": "logical types", + "relation_name": "", + "weight": 9.0, + "description": "structural semantics are preserved specifically for logical types like table and formula", + "source_ids": [ + 64 + ] + }, + { + "src_entity_name": "v table", + "tgt_entity_name": "node", + "relation_name": "", + "weight": 8.0, + "description": "v table is created as a distinct entity from the content of a specific node", + "source_ids": [ + 64 + ] + }, + { + "src_entity_name": "row", + "tgt_entity_name": "header", + "relation_name": "", + "weight": 9.0, + "description": "row headers are a specific type of header extracted from table nodes", + "source_ids": [ + 64 + ] + }, + { + "src_entity_name": "column", + "tgt_entity_name": "header", + "relation_name": "", + "weight": 9.0, + "description": "column headers are a specific type of header extracted from table nodes", + "source_ids": [ + 64 + ] + }, + { + "src_entity_name": "row", + 
"tgt_entity_name": "column", + "relation_name": "", + "weight": 7.0, + "description": "row and column headers are both explicitly extracted components of table nodes", + "source_ids": [ + 64 + ] + } + ], + "node_idx": 64 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_65.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_65.json new file mode 100644 index 0000000..657fcd6 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_65.json @@ -0,0 +1,51 @@ +{ + "entities": [ + { + "entity_name": "4.3.2 gradient-based entity resolution", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Graph Construction' within the 'BOOKINDEX' chapter, this section details a robust Entity Resolution (ER) process designed to identify and merge fragmented conceptual entities in a Knowledge Graph, addressing challenges like abbreviations and co-references.", + "source_ids": [ + 65 + ] + }, + { + "entity_name": "gradient-based entity resolution", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A specific technique mentioned in the title for resolving entity fragmentation by utilizing gradient-based approaches to refine raw Knowledge Graphs.", + "source_ids": [ + 65 + ] + }, + { + "entity_name": "entity resolution", + "entity_type": "TASK_OR_PROBLEM", + "description": "The core problem addressed in this section: identifying and merging fragmented entities caused by abbreviations, co-references, or varied occurrences to ensure a well-constructed Knowledge Graph.", + "source_ids": [ + 65 + ] + } + ], + "relations": [ + { + "src_entity_name": "gradient-based entity resolution", + "tgt_entity_name": "4.3.2 gradient-based entity resolution", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Gradient-based Entity Resolution' is the primary methodological topic of section 4.3.2.", + "source_ids": [ + 65 + ] + }, + { + "src_entity_name": "entity resolution", + 
"tgt_entity_name": "4.3.2 gradient-based entity resolution", + "relation_name": "", + "weight": 10.0, + "description": "The task of 'Entity Resolution' is the central subject matter detailed in section 4.3.2.", + "source_ids": [ + 65 + ] + } + ], + "node_idx": 65 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_66.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_66.json new file mode 100644 index 0000000..5f99571 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_66.json @@ -0,0 +1,225 @@ +{ + "entities": [ + { + "entity_name": "er methods", + "entity_type": "TASK_OR_PROBLEM", + "description": "er methods are conventional methods for entity resolution that are computationally expensive", + "source_ids": [ + 66 + ] + }, + { + "entity_name": "dirty er", + "entity_type": "TASK_OR_PROBLEM", + "description": "dirty er is a term used to describe batch processing across multiple data sources for entity resolution", + "source_ids": [ + 66 + ] + }, + { + "entity_name": "llms", + "entity_type": "TECHNOLOGY", + "description": "llms are large language models used for high accuracy judgments in entity resolution", + "source_ids": [ + 66 + ] + }, + { + "entity_name": "a", + "entity_type": "TASK_OR_PROBLEM", + "description": "a is an example entity used to illustrate the merging of multiple entities in the entity resolution process", + "source_ids": [ + 66 + ] + }, + { + "entity_name": "b", + "entity_type": "TASK_OR_PROBLEM", + "description": "b is an example entity used to illustrate the merging of multiple entities in the entity resolution process", + "source_ids": [ + 66 + ] + }, + { + "entity_name": "c", + "entity_type": "TASK_OR_PROBLEM", + "description": "c is an example entity used to illustrate the merging of multiple entities in the entity resolution process", + "source_ids": [ + 66 + ] + }, + { + "entity_name": "12", + "entity_type": "PUBLICATION_VENUE", + "description": "12 is a citation reference 
mentioned in the text regarding entity resolution methods", + "source_ids": [ + 66 + ] + }, + { + "entity_name": "a b", + "entity_type": "TASK_OR_PROBLEM", + "description": "a b is a specific pairwise comparison example between entities a and b", + "source_ids": [ + 66 + ] + }, + { + "entity_name": "a c", + "entity_type": "TASK_OR_PROBLEM", + "description": "a c is a specific pairwise comparison example between entities a and c", + "source_ids": [ + 66 + ] + }, + { + "entity_name": "b c", + "entity_type": "TASK_OR_PROBLEM", + "description": "b c is a specific pairwise comparison example between entities b and c", + "source_ids": [ + 66 + ] + }, + { + "entity_name": "o n 2", + "entity_type": "MEASUREMENT", + "description": "o n 2 represents the quadratic complexity of the number of pairwise comparisons required", + "source_ids": [ + 66 + ] + } + ], + "relations": [ + { + "src_entity_name": "er methods", + "tgt_entity_name": "dirty er", + "relation_name": "", + "weight": 9.0, + "description": "er methods are often designed for batch processing across multiple data sources commonly referred to as dirty er", + "source_ids": [ + 66 + ] + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "llms", + "relation_name": "", + "weight": 8.0, + "description": "relying on llms for high accuracy judgments in er methods can lead to prohibitively slow and computationally expensive processes", + "source_ids": [ + 66 + ] + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "a", + "relation_name": "", + "weight": 8.0, + "description": "er methods aim to merge entities like a b and c as the same concept", + "source_ids": [ + 66 + ] + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "b", + "relation_name": "", + "weight": 8.0, + "description": "er methods aim to merge entities like a b and c as the same concept", + "source_ids": [ + 66 + ] + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "c", + "relation_name": "", + "weight": 8.0, + 
"description": "er methods aim to merge entities like a b and c as the same concept", + "source_ids": [ + 66 + ] + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "a b", + "relation_name": "", + "weight": 9.0, + "description": "er methods require finding all possible matching pairs such as a b to confirm equivalence", + "source_ids": [ + 66 + ] + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "a c", + "relation_name": "", + "weight": 9.0, + "description": "er methods require finding all possible matching pairs such as a c to confirm equivalence", + "source_ids": [ + 66 + ] + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "b c", + "relation_name": "", + "weight": 9.0, + "description": "er methods require finding all possible matching pairs such as b c to confirm equivalence", + "source_ids": [ + 66 + ] + }, + { + "src_entity_name": "a", + "tgt_entity_name": "b", + "relation_name": "", + "weight": 7.0, + "description": "a and b are compared as a pair a b to confirm their equivalence", + "source_ids": [ + 66 + ] + }, + { + "src_entity_name": "a", + "tgt_entity_name": "c", + "relation_name": "", + "weight": 7.0, + "description": "a and c are compared as a pair a c to confirm their equivalence", + "source_ids": [ + 66 + ] + }, + { + "src_entity_name": "b", + "tgt_entity_name": "c", + "relation_name": "", + "weight": 7.0, + "description": "b and c are compared as a pair b c to confirm their equivalence", + "source_ids": [ + 66 + ] + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "12", + "relation_name": "", + "weight": 6.0, + "description": "the text cites reference 12 in the context of ensuring accurate entity resolution", + "source_ids": [ + 66 + ] + }, + { + "src_entity_name": "er methods", + "tgt_entity_name": "o n 2", + "relation_name": "", + "weight": 9.0, + "description": "the process of er methods leads to a quadratic o n 2 number of pairwise comparisons", + "source_ids": [ + 66 + ] + } + ], + 
"node_idx": 66 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_67.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_67.json new file mode 100644 index 0000000..845b621 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_67.json @@ -0,0 +1,169 @@ +{ + "entities": [ + { + "entity_name": "gradient based er method", + "entity_type": "TECHNOLOGY", + "description": "a gradient based entity resolution method employed to process a single document incrementally", + "source_ids": [ + 67 + ] + }, + { + "entity_name": "clean er", + "entity_type": "TASK_OR_PROBLEM", + "description": "a simplified version of the entity resolution task used as the basis for the incremental process", + "source_ids": [ + 67 + ] + }, + { + "entity_name": "database", + "entity_type": "SOFTWARE", + "description": "a storage system containing already processed entities against which new entities are compared", + "source_ids": [ + 67 + ] + }, + { + "entity_name": "top k most relevant candidates", + "entity_type": "EVALUATION_METRIC", + "description": "a set of the most relevant entities used for reranking a new entity in the incremental process", + "source_ids": [ + 67 + ] + }, + { + "entity_name": "entity", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "a single new entity being extracted in the incremental process", + "source_ids": [ + 67 + ] + }, + { + "entity_name": "quadratic batch problem", + "entity_type": "TASK_OR_PROBLEM", + "description": "the original complex problem that the incremental method transforms into a simpler task", + "source_ids": [ + 67 + ] + }, + { + "entity_name": "repeated lookup task", + "entity_type": "TASK_OR_PROBLEM", + "description": "the simplified task resulting from transforming the quadratic batch problem", + "source_ids": [ + 67 + ] + }, + { + "entity_name": "scoring patterns", + "entity_type": "EVALUATION_METRIC", + "description": "distinct observable patterns yielded by the 
incremental process when reranking entities", + "source_ids": [ + 67 + ] + }, + { + "entity_name": "incremental process", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 67 + ] + } + ], + "relations": [ + { + "src_entity_name": "gradient based er method", + "tgt_entity_name": "clean er", + "relation_name": "", + "weight": 9.0, + "description": "the method operates on a single document simplified as the clean er", + "source_ids": [ + 67 + ] + }, + { + "src_entity_name": "gradient based er method", + "tgt_entity_name": "database", + "relation_name": "", + "weight": 8.0, + "description": "the method determines where a new entity fits among entities already in the database", + "source_ids": [ + 67 + ] + }, + { + "src_entity_name": "gradient based er method", + "tgt_entity_name": "top k most relevant candidates", + "relation_name": "", + "weight": 8.0, + "description": "the method yields scoring patterns when a new entity is reranked against its top k candidates", + "source_ids": [ + 67 + ] + }, + { + "src_entity_name": "gradient based er method", + "tgt_entity_name": "entity", + "relation_name": "", + "weight": 10.0, + "description": "the method performs entity resolution incrementally as each new entity is extracted", + "source_ids": [ + 67 + ] + }, + { + "src_entity_name": "gradient based er method", + "tgt_entity_name": "quadratic batch problem", + "relation_name": "", + "weight": 9.0, + "description": "the method transforms the quadratic batch problem into a simpler task", + "source_ids": [ + 67 + ] + }, + { + "src_entity_name": "gradient based er method", + "tgt_entity_name": "repeated lookup task", + "relation_name": "", + "weight": 9.0, + "description": "the method transforms the problem into a repeated lookup task", + "source_ids": [ + 67 + ] + }, + { + "src_entity_name": "entity", + "tgt_entity_name": "database", + "relation_name": "", + "weight": 9.0, + "description": "the new entity is determined to fit among the already processed 
entities in the database", + "source_ids": [ + 67 + ] + }, + { + "src_entity_name": "entity", + "tgt_entity_name": "top k most relevant candidates", + "relation_name": "", + "weight": 9.0, + "description": "the new entity is reranked against its top k most relevant candidates", + "source_ids": [ + 67 + ] + }, + { + "src_entity_name": "incremental process", + "tgt_entity_name": "scoring patterns", + "relation_name": "", + "weight": 8.0, + "description": "the incremental process yields two distinct scoring patterns", + "source_ids": [ + 67 + ] + } + ], + "node_idx": 67 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_68.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_68.json new file mode 100644 index 0000000..ae99b90 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_68.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "5", + "entity_type": "MEASUREMENT", + "description": "5 is a numerical value mentioned in the text potentially representing a count score or measurement", + "source_ids": [ + 68 + ] + } + ], + "relations": [], + "node_idx": 68 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_69.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_69.json new file mode 100644 index 0000000..c578f40 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_69.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "algorithm 1", + "entity_type": "TASK_OR_PROBLEM", + "description": "algorithm 1 is a gradient based entity resolution method mentioned in the text", + "source_ids": [ + 69 + ] + }, + { + "entity_name": "gradient based entity resolution", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "gradient based entity resolution is the specific technique or approach described for the algorithm", + "source_ids": [ + 69 + ] + } + ], + "relations": [ + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": 
"gradient based entity resolution", + "relation_name": "", + "weight": 10.0, + "description": "algorithm 1 is defined as a gradient based entity resolution method", + "source_ids": [ + 69 + ] + } + ], + "node_idx": 69 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_7.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_7.json new file mode 100644 index 0000000..f128f94 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_7.json @@ -0,0 +1,135 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "SOFTWARE", + "description": "bookrag is a software project hosted on github containing source code and data", + "source_ids": [ + 7 + ] + }, + { + "entity_name": "github", + "entity_type": "ORGANIZATION", + "description": "github is the platform where the source code and data for bookrag are made available", + "source_ids": [ + 7 + ] + }, + { + "entity_name": "sam234990", + "entity_type": "PERSON", + "description": "sam234990 is the username or owner associated with the bookrag repository on github", + "source_ids": [ + 7 + ] + }, + { + "entity_name": "source code", + "entity_type": "PRODUCT", + "description": "source code is a digital artifact made available as part of the bookrag project", + "source_ids": [ + 7 + ] + }, + { + "entity_name": "data", + "entity_type": "PRODUCT", + "description": "data is a digital artifact made available as part of the bookrag project", + "source_ids": [ + 7 + ] + }, + { + "entity_name": "artifacts", + "entity_type": "PRODUCT", + "description": "artifacts are items made available alongside the source code and data for the bookrag project", + "source_ids": [ + 7 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "github", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is hosted on the github platform as indicated by the provided url", + "source_ids": [ + 7 + ] + }, + { + "src_entity_name": "sam234990", + 
"tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "sam234990 is the creator or owner of the bookrag repository", + "source_ids": [ + 7 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "source code", + "relation_name": "", + "weight": 10.0, + "description": "bookrag contains the source code that has been made available", + "source_ids": [ + 7 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "data", + "relation_name": "", + "weight": 10.0, + "description": "bookrag contains the data that has been made available", + "source_ids": [ + 7 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "artifacts", + "relation_name": "", + "weight": 10.0, + "description": "bookrag includes other artifacts that have been made available", + "source_ids": [ + 7 + ] + }, + { + "src_entity_name": "source code", + "tgt_entity_name": "github", + "relation_name": "", + "weight": 9.0, + "description": "the source code is hosted on github", + "source_ids": [ + 7 + ] + }, + { + "src_entity_name": "data", + "tgt_entity_name": "github", + "relation_name": "", + "weight": 9.0, + "description": "the data is hosted on github", + "source_ids": [ + 7 + ] + }, + { + "src_entity_name": "artifacts", + "tgt_entity_name": "github", + "relation_name": "", + "weight": 9.0, + "description": "the artifacts are hosted on github", + "source_ids": [ + 7 + ] + } + ], + "node_idx": 7 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_70.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_70.json new file mode 100644 index 0000000..cb4f9b3 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_70.json @@ -0,0 +1,219 @@ +{ + "entities": [ + { + "entity_name": "kg g", + "entity_type": "TASK_OR_PROBLEM", + "description": "kg g is a knowledge graph that serves as the input for the described process", + "source_ids": [ + 70 + ] + }, + { + "entity_name": "new entity v n", + 
"entity_type": "TASK_OR_PROBLEM", + "description": "new entity v n is a new entity being introduced into the system", + "source_ids": [ + 70 + ] + }, + { + "entity_name": "rerank model r", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "rerank model r is a model used to rerank entities in the process", + "source_ids": [ + 70 + ] + }, + { + "entity_name": "entity vector database db", + "entity_type": "DATASET_OR_CORPUS", + "description": "entity vector database db is a database storing entity vectors", + "source_ids": [ + 70 + ] + }, + { + "entity_name": "vector search number top k", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "vector search number top k is a parameter defining the number of top results for vector search", + "source_ids": [ + 70 + ] + }, + { + "entity_name": "threshold of gradient g", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "threshold of gradient g is a threshold value used for gradient calculations", + "source_ids": [ + 70 + ] + }, + { + "entity_name": "kg", + "entity_type": "TASK_OR_PROBLEM", + "description": "kg is the abbreviation for the knowledge graph mentioned as input", + "source_ids": [ + 70 + ] + }, + { + "entity_name": "g", + "entity_type": "TASK_OR_PROBLEM", + "description": "g is the specific instance or variable name for the knowledge graph", + "source_ids": [ + 70 + ] + }, + { + "entity_name": "v", + "entity_type": "TASK_OR_PROBLEM", + "description": "v is the variable representing the new entity", + "source_ids": [ + 70 + ] + }, + { + "entity_name": "n", + "entity_type": "TASK_OR_PROBLEM", + "description": "n is a subscript or identifier associated with the new entity v", + "source_ids": [ + 70 + ] + }, + { + "entity_name": "r", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "r is the specific variable name for the rerank model", + "source_ids": [ + 70 + ] + }, + { + "entity_name": "db", + "entity_type": "DATASET_OR_CORPUS", + "description": "db is the specific variable name 
for the entity vector database", + "source_ids": [ + 70 + ] + }, + { + "entity_name": "top k", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "top k is the specific variable name for the vector search number", + "source_ids": [ + 70 + ] + }, + { + "entity_name": "g", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "g is the specific variable name for the threshold of gradient", + "source_ids": [ + 70 + ] + } + ], + "relations": [ + { + "src_entity_name": "kg g", + "tgt_entity_name": "new entity v n", + "relation_name": "", + "weight": 8.0, + "description": "the new entity v n is added to or processed within the knowledge graph g", + "source_ids": [ + 70 + ] + }, + { + "src_entity_name": "rerank model r", + "tgt_entity_name": "entity vector database db", + "relation_name": "", + "weight": 7.0, + "description": "the rerank model r likely utilizes the entity vector database db to perform its ranking tasks", + "source_ids": [ + 70 + ] + }, + { + "src_entity_name": "vector search number top k", + "tgt_entity_name": "entity vector database db", + "relation_name": "", + "weight": 9.0, + "description": "the vector search number top k parameter determines the scope of the search performed on the entity vector database db", + "source_ids": [ + 70 + ] + }, + { + "src_entity_name": "threshold of gradient g", + "tgt_entity_name": "rerank model r", + "relation_name": "", + "weight": 6.0, + "description": "the threshold of gradient g is a parameter that influences the operation or convergence of the rerank model r", + "source_ids": [ + 70 + ] + }, + { + "src_entity_name": "kg", + "tgt_entity_name": "g", + "relation_name": "", + "weight": 10.0, + "description": "kg and g refer to the same knowledge graph entity with g being its variable representation", + "source_ids": [ + 70 + ] + }, + { + "src_entity_name": "v", + "tgt_entity_name": "n", + "relation_name": "", + "weight": 9.0, + "description": "n is a subscript or modifier defining the specific instance of 
the new entity v", + "source_ids": [ + 70 + ] + }, + { + "src_entity_name": "rerank model r", + "tgt_entity_name": "r", + "relation_name": "", + "weight": 10.0, + "description": "r is the variable name for the rerank model", + "source_ids": [ + 70 + ] + }, + { + "src_entity_name": "entity vector database db", + "tgt_entity_name": "db", + "relation_name": "", + "weight": 10.0, + "description": "db is the variable name for the entity vector database", + "source_ids": [ + 70 + ] + }, + { + "src_entity_name": "vector search number top k", + "tgt_entity_name": "top k", + "relation_name": "", + "weight": 10.0, + "description": "top k is the variable name for the vector search number", + "source_ids": [ + 70 + ] + }, + { + "src_entity_name": "threshold of gradient g", + "tgt_entity_name": "g", + "relation_name": "", + "weight": 10.0, + "description": "g is the variable name for the threshold of gradient", + "source_ids": [ + 70 + ] + } + ], + "node_idx": 70 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_71.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_71.json new file mode 100644 index 0000000..256ac4c --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_71.json @@ -0,0 +1,265 @@ +{ + "entities": [ + { + "entity_name": "vector search", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ] + }, + { + "entity_name": "db", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ] + }, + { + "entity_name": "search", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ] + }, + { + "entity_name": "r", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ] + }, + { + "entity_name": "e", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ] + }, + { + "entity_name": "sort", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ] + }, + { + "entity_name": "gradient select", + 
"entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ] + }, + { + "entity_name": "top k", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ] + }, + { + "entity_name": "v n", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ] + }, + { + "entity_name": "e c", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ] + }, + { + "entity_name": "v cn", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ] + }, + { + "entity_name": "s", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ] + }, + { + "entity_name": "c", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ] + }, + { + "entity_name": "score", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ] + }, + { + "entity_name": "s 0", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ] + }, + { + "entity_name": "sel", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 71 + ] + } + ], + "relations": [ + { + "src_entity_name": "vector search", + "tgt_entity_name": "db", + "relation_name": "", + "weight": 9.0, + "description": "vector search operates on the db to find relevant entities", + "source_ids": [ + 71 + ] + }, + { + "src_entity_name": "search", + "tgt_entity_name": "db", + "relation_name": "", + "weight": 9.0, + "description": "the search function is applied to the db to retrieve entities", + "source_ids": [ + 71 + ] + }, + { + "src_entity_name": "r", + "tgt_entity_name": "e", + "relation_name": "", + "weight": 8.0, + "description": "the function r takes entities e as input to process them", + "source_ids": [ + 71 + ] + }, + { + "src_entity_name": "sort", + "tgt_entity_name": "e", + "relation_name": "", + "weight": 8.0, + "description": "the sort operation is applied to the list of entities e", + "source_ids": [ + 71 + ] + }, + { + "src_entity_name": "gradient select", + "tgt_entity_name": "e", 
+ "relation_name": "", + "weight": 8.0, + "description": "gradient select is used to select entities from the remaining list e", + "source_ids": [ + 71 + ] + }, + { + "src_entity_name": "vector search", + "tgt_entity_name": "top k", + "relation_name": "", + "weight": 9.0, + "description": "vector search utilizes the top k parameter to limit the number of relevant entities found", + "source_ids": [ + 71 + ] + }, + { + "src_entity_name": "search", + "tgt_entity_name": "v n", + "relation_name": "", + "weight": 9.0, + "description": "the search function uses the vector v n as its query input", + "source_ids": [ + 71 + ] + }, + { + "src_entity_name": "search", + "tgt_entity_name": "e c", + "relation_name": "", + "weight": 9.0, + "description": "the search function outputs the candidate entities e c", + "source_ids": [ + 71 + ] + }, + { + "src_entity_name": "r", + "tgt_entity_name": "v cn", + "relation_name": "", + "weight": 8.0, + "description": "the function r uses the vector v cn to calculate rerank scores", + "source_ids": [ + 71 + ] + }, + { + "src_entity_name": "sort", + "tgt_entity_name": "s", + "relation_name": "", + "weight": 9.0, + "description": "the sort operation generates the sorted list s", + "source_ids": [ + 71 + ] + }, + { + "src_entity_name": "sort", + "tgt_entity_name": "c", + "relation_name": "", + "weight": 9.0, + "description": "the sort operation orders entities based on the rerank scores c", + "source_ids": [ + 71 + ] + }, + { + "src_entity_name": "score", + "tgt_entity_name": "s 0", + "relation_name": "", + "weight": 10.0, + "description": "the score variable is assigned the value of the first element s 0 from the sorted list", + "source_ids": [ + 71 + ] + }, + { + "src_entity_name": "gradient select", + "tgt_entity_name": "sel", + "relation_name": "", + "weight": 9.0, + "description": "the gradient select method produces the selected entities sel", + "source_ids": [ + 71 + ] + } + ], + "node_idx": 71 +} \ No newline at end of file diff --git 
a/e2e_test_output/kg_extractor_res/kg_extractor_res_72.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_72.json new file mode 100644 index 0000000..28fab6f --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_72.json @@ -0,0 +1,75 @@ +{ + "entities": [ + { + "entity_name": "case a", + "entity_type": "TASK_OR_PROBLEM", + "description": "case a is a scenario described as involving a new conceptual entity", + "source_ids": [ + 72 + ] + }, + { + "entity_name": "new entity", + "entity_type": "TASK_OR_PROBLEM", + "description": "new entity refers to a conceptual entity that is being evaluated for relevance against existing entities", + "source_ids": [ + 72 + ] + }, + { + "entity_name": "existing entities", + "entity_type": "TASK_OR_PROBLEM", + "description": "existing entities are the set of entities against which the relevance of a new conceptual entity is measured", + "source_ids": [ + 72 + ] + }, + { + "entity_name": "relevance scores", + "entity_type": "EVALUATION_METRIC", + "description": "relevance scores are the metrics used to measure the relationship between the new entity and existing entities", + "source_ids": [ + 72 + ] + }, + { + "entity_name": "gradient", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "gradient refers to a mathematical pattern or value that is absent in the relevance scores for new entities", + "source_ids": [ + 72 + ] + }, + { + "entity_name": "discriminative pattern", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "discriminative pattern refers to a distinguishing feature or trend that is not present in the relevance scores for new entities", + "source_ids": [ + 72 + ] + } + ], + "relations": [ + { + "src_entity_name": "case a", + "tgt_entity_name": "new entity", + "relation_name": "", + "weight": 9.0, + "description": "case a describes the scenario where a new entity is introduced and evaluated", + "source_ids": [ + 72 + ] + }, + { + "src_entity_name": "new entity", + "tgt_entity_name": 
"existing entities", + "relation_name": "", + "weight": 10.0, + "description": "the new entity s relevance scores are calculated against all existing entities", + "source_ids": [ + 72 + ] + } + ], + "node_idx": 72 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_73.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_73.json new file mode 100644 index 0000000..c22b0b0 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_73.json @@ -0,0 +1,187 @@ +{ + "entities": [ + { + "entity_name": "case b", + "entity_type": "TASK_OR_PROBLEM", + "description": "case b refers to a scenario involving an existing entity where an alias is being evaluated for relevance", + "source_ids": [ + 73 + ] + }, + { + "entity_name": "reranker", + "entity_type": "TECHNOLOGY", + "description": "the reranker is a system or component described as having inherent discriminative limitations", + "source_ids": [ + 73 + ] + }, + { + "entity_name": "existing entity", + "entity_type": "TASK_OR_PROBLEM", + "description": "existing entity refers to an entity that is already present in the system being discussed", + "source_ids": [ + 73 + ] + }, + { + "entity_name": "alias", + "entity_type": "CONCEPT", + "description": "alias is a term used to describe an alternative name for an existing entity", + "source_ids": [ + 73 + ] + }, + { + "entity_name": "scores", + "entity_type": "EVALUATION_METRIC", + "description": "scores are the numerical values indicating the relevance of an alias to a true match", + "source_ids": [ + 73 + ] + }, + { + "entity_name": "true match", + "entity_type": "CONCEPT", + "description": "true match refers to the correct entity that an alias is being compared against", + "source_ids": [ + 73 + ] + }, + { + "entity_name": "equivalent aliases", + "entity_type": "CONCEPT", + "description": "equivalent aliases refers to a small set of aliases that are considered the same as the true match", + "source_ids": [ + 73 + ] + }, + { 
+ "entity_name": "gradient", + "entity_type": "MEASUREMENT", + "description": "gradient refers to the sharp decline in relevance scores mentioned in the text", + "source_ids": [ + 73 + ] + }, + { + "entity_name": "irrelevant entities", + "entity_type": "TASK_OR_PROBLEM", + "description": "irrelevant entities are the entities that follow the sharp decline in relevance scores", + "source_ids": [ + 73 + ] + }, + { + "entity_name": "", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 73 + ] + } + ], + "relations": [ + { + "src_entity_name": "", + "tgt_entity_name": "case b", + "relation_name": "", + "weight": 10.0, + "description": "is the specific alias being discussed within the scenario defined as case b", + "source_ids": [ + 73 + ] + }, + { + "src_entity_name": "", + "tgt_entity_name": "reranker", + "relation_name": "", + "weight": 8.0, + "description": "the scores of are influenced by the inherent discriminative limitations of the reranker", + "source_ids": [ + 73 + ] + }, + { + "src_entity_name": "case b", + "tgt_entity_name": "existing entity", + "relation_name": "", + "weight": 10.0, + "description": "case b is defined by the scenario involving an existing entity", + "source_ids": [ + 73 + ] + }, + { + "src_entity_name": "case b", + "tgt_entity_name": "alias", + "relation_name": "", + "weight": 9.0, + "description": "case b specifically addresses the situation where an alias is being evaluated", + "source_ids": [ + 73 + ] + }, + { + "src_entity_name": "alias", + "tgt_entity_name": "true match", + "relation_name": "", + "weight": 9.0, + "description": "the alias is evaluated for its relevance to the true match", + "source_ids": [ + 73 + ] + }, + { + "src_entity_name": "scores", + "tgt_entity_name": "true match", + "relation_name": "", + "weight": 8.0, + "description": "scores indicate the relevance of the alias to the true match", + "source_ids": [ + 73 + ] + }, + { + "src_entity_name": "scores", + "tgt_entity_name": "equivalent aliases", + 
"relation_name": "", + "weight": 8.0, + "description": "scores show high relevance to the true match or a set of equivalent aliases", + "source_ids": [ + 73 + ] + }, + { + "src_entity_name": "reranker", + "tgt_entity_name": "scores", + "relation_name": "", + "weight": 7.0, + "description": "the reranker s limitations affect the initial set of high relevance scores", + "source_ids": [ + 73 + ] + }, + { + "src_entity_name": "scores", + "tgt_entity_name": "gradient", + "relation_name": "", + "weight": 8.0, + "description": "the scores exhibit a sharp decline gradient after the initial high relevance set", + "source_ids": [ + 73 + ] + }, + { + "src_entity_name": "gradient", + "tgt_entity_name": "irrelevant entities", + "relation_name": "", + "weight": 7.0, + "description": "the gradient precedes the transition to irrelevant entities", + "source_ids": [ + 73 + ] + } + ], + "node_idx": 73 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_74.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_74.json new file mode 100644 index 0000000..766e3c9 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_74.json @@ -0,0 +1,115 @@ +{ + "entities": [ + { + "entity_name": "gradient based er algorithm", + "entity_type": "TECHNOLOGY", + "description": "the gradient based er algorithm is a method designed to detect sharp declines characteristic of case b and isolate high relevance sets", + "source_ids": [ + 74 + ] + }, + { + "entity_name": "llm", + "entity_type": "TECHNOLOGY", + "description": "an llm is utilized for finer grained distinction when multiple similar entities are identified within a set", + "source_ids": [ + 74 + ] + }, + { + "entity_name": "case b", + "entity_type": "TASK_OR_PROBLEM", + "description": "case b is a scenario characterized by a sharp decline that the gradient based er algorithm is designed to detect", + "source_ids": [ + 74 + ] + }, + { + "entity_name": "case a", + "entity_type": 
"TASK_OR_PROBLEM", + "description": "case a is a no gradient scenario that the llm helps differentiate from the set identified by the algorithm", + "source_ids": [ + 74 + ] + }, + { + "entity_name": "high relevance set", + "entity_type": "DATASET_OR_CORPUS", + "description": "the high relevance set is a collection of entities isolated by the gradient based er algorithm for further processing", + "source_ids": [ + 74 + ] + }, + { + "entity_name": "similar entities", + "entity_type": "DATASET_OR_CORPUS", + "description": "similar entities are a group of items identified within the high relevance set that require finer grained distinction", + "source_ids": [ + 74 + ] + } + ], + "relations": [ + { + "src_entity_name": "gradient based er algorithm", + "tgt_entity_name": "case b", + "relation_name": "", + "weight": 10.0, + "description": "the gradient based er algorithm is designed to detect the sharp decline characteristic of case b", + "source_ids": [ + 74 + ] + }, + { + "src_entity_name": "gradient based er algorithm", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "the gradient based er algorithm isolates a set of entities which is subsequently processed by an llm for finer grained distinction", + "source_ids": [ + 74 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "case a", + "relation_name": "", + "weight": 9.0, + "description": "the llm is used to differentiate the identified set from the no gradient scenario of case a", + "source_ids": [ + 74 + ] + }, + { + "src_entity_name": "gradient based er algorithm", + "tgt_entity_name": "high relevance set", + "relation_name": "", + "weight": 10.0, + "description": "the gradient based er algorithm efficiently isolates the high relevance set", + "source_ids": [ + 74 + ] + }, + { + "src_entity_name": "llm", + "tgt_entity_name": "similar entities", + "relation_name": "", + "weight": 9.0, + "description": "the llm is utilized to distinguish between multiple similar entities 
identified within the set", + "source_ids": [ + 74 + ] + }, + { + "src_entity_name": "high relevance set", + "tgt_entity_name": "similar entities", + "relation_name": "", + "weight": 8.0, + "description": "the similar entities are contained within the high relevance set identified by the algorithm", + "source_ids": [ + 74 + ] + } + ], + "node_idx": 74 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_75.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_75.json new file mode 100644 index 0000000..da4259e --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_75.json @@ -0,0 +1,437 @@ +{ + "entities": [ + { + "entity_name": "algorithm 1", + "entity_type": "TASK_OR_PROBLEM", + "description": "algorithm 1 is the entity resolution process described in the text", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "v n", + "entity_type": "TASK_OR_PROBLEM", + "description": "v n is a new entity being processed in the entity resolution process", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "e c", + "entity_type": "TASK_OR_PROBLEM", + "description": "e c represents the top k candidates retrieved for the new entity v n", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "db", + "entity_type": "TASK_OR_PROBLEM", + "description": "db is the vector database from which candidates are retrieved", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "r", + "entity_type": "TASK_OR_PROBLEM", + "description": "r is the reranker used to re rank candidates against v n", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "s", + "entity_type": "TASK_OR_PROBLEM", + "description": "s represents the scores assigned to the candidates", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "sel", + "entity_type": "TASK_OR_PROBLEM", + "description": "sel is the selection set initialized with the top scoring candidate", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "g", + "entity_type": 
"PARAMETER_OR_VARIABLE", + "description": "g is the gradient threshold used to check score drops", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "case a", + "entity_type": "TASK_OR_PROBLEM", + "description": "case a occurs when all candidates pass the gradient check indicating scores lacked discriminative power", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "case b", + "entity_type": "TASK_OR_PROBLEM", + "description": "case b occurs when a gradient is found signaling a sharp score drop", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "v sel", + "entity_type": "TASK_OR_PROBLEM", + "description": "v sel is the canonical entity selected from the selection set sel", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "llm", + "entity_type": "SOFTWARE", + "description": "llm is a tool used to select the canonical entity if multiple aliases are identified", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "g", + "entity_type": "TASK_OR_PROBLEM", + "description": "g is a data structure or set that is updated and returned at the end of the process", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "lines 1 3", + "entity_type": "SECTION_TITLE", + "description": "lines 1 3 describe the initial retrieval and reranking steps of the algorithm", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "line 4", + "entity_type": "SECTION_TITLE", + "description": "line 4 describes the initialization of the selection set and the initial score", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "lines 5 8", + "entity_type": "SECTION_TITLE", + "description": "lines 5 8 describe the iteration through remaining candidates and the gradient threshold check", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "lines 7 8", + "entity_type": "SECTION_TITLE", + "description": "lines 7 8 detail the logic for adding candidates to the selection set and updating scores", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "line 8", + "entity_type": 
"SECTION_TITLE", + "description": "line 8 describes the condition where the loop breaks upon detecting a sharp score drop", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "lines 9 14", + "entity_type": "SECTION_TITLE", + "description": "lines 9 14 describe the final decision making logic of the algorithm", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "line 9 10", + "entity_type": "SECTION_TITLE", + "description": "lines 9 10 describe the action taken in case a where a new entity is added", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "lines 12 14", + "entity_type": "SECTION_TITLE", + "description": "lines 12 14 describe the merging of the new entity with the canonical entity in case b", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "line 13", + "entity_type": "SECTION_TITLE", + "description": "line 13 describes the use of an llm to select a canonical entity when multiple aliases exist", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "line 15", + "entity_type": "SECTION_TITLE", + "description": "line 15 describes the return of the updated g and db structures", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "score", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "score is the variable updated during the iteration to track the current score value", + "source_ids": [ + 75 + ] + }, + { + "entity_name": "v c", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "v c represents the current candidate being evaluated in the iteration", + "source_ids": [ + 75 + ] + } + ], + "relations": [ + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "v n", + "relation_name": "", + "weight": 10.0, + "description": "algorithm 1 processes the new entity v n by retrieving candidates and making a decision", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "e c", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 retrieves the top k candidates e c from the 
vector database db", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "db", + "relation_name": "", + "weight": 9.0, + "description": "the vector database db is the source from which candidates e c are retrieved", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "r", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 uses the reranker r to re rank candidates e c against v n", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "s", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 sorts candidates based on their scores s", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "sel", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 initializes and iterates through the selection set sel", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "g", + "relation_name": "", + "weight": 8.0, + "description": "algorithm 1 uses the gradient threshold g to determine if a score drop is sharp", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "case a", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 identifies case a when the selection set sel is identical to e c", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "case b", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 identifies case b when a gradient is found in the selection set sel", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "v sel", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 selects the canonical entity v sel from the selection set sel in case b", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "llm", + 
"relation_name": "", + "weight": 8.0, + "description": "algorithm 1 uses an llm to select v sel if multiple aliases are identified", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "lines 1 3", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 executes the steps outlined in lines 1 3 to retrieve and rerank candidates", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "line 4", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 executes the initialization step described in line 4", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "lines 5 8", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 executes the iteration logic described in lines 5 8", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "lines 7 8", + "relation_name": "", + "weight": 8.0, + "description": "the logic in lines 7 8 is part of the iteration process within algorithm 1", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "line 8", + "relation_name": "", + "weight": 8.0, + "description": "line 8 defines the break condition within the loop of algorithm 1", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "lines 9 14", + "relation_name": "", + "weight": 9.0, + "description": "algorithm 1 executes the decision logic described in lines 9 14", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "line 9 10", + "relation_name": "", + "weight": 8.0, + "description": "lines 9 10 are the specific actions taken when case a is identified in algorithm 1", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "lines 12 14", + "relation_name": "", + "weight": 8.0, + "description": "lines 12 14 are the specific actions 
taken when case b is identified in algorithm 1", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "line 13", + "relation_name": "", + "weight": 8.0, + "description": "line 13 is a step within the case b logic of algorithm 1", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "line 15", + "relation_name": "", + "weight": 9.0, + "description": "line 15 is the final step of algorithm 1 where results are returned", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "score", + "relation_name": "", + "weight": 9.0, + "description": "the variable score is initialized and updated during the execution of algorithm 1", + "source_ids": [ + 75 + ] + }, + { + "src_entity_name": "algorithm 1", + "tgt_entity_name": "v c", + "relation_name": "", + "weight": 9.0, + "description": "the variable v c is the current candidate processed within the loop of algorithm 1", + "source_ids": [ + 75 + ] + } + ], + "node_idx": 75 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_76.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_76.json new file mode 100644 index 0000000..9c08336 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_76.json @@ -0,0 +1,233 @@ +{ + "entities": [ + { + "entity_name": "figure 2", + "entity_type": "IMAGE", + "description": "figure 2 is a visual representation used to illustrate the processing of a new entity in a knowledge graph", + "source_ids": [ + 76 + ] + }, + { + "entity_name": "kg", + "entity_type": "TASK_OR_PROBLEM", + "description": "kg refers to the knowledge graph where entities are processed compared and merged", + "source_ids": [ + 76 + ] + }, + { + "entity_name": "e 9", + "entity_type": "TASK_OR_PROBLEM", + "description": "e 9 is a new entity being processed and compared against existing entities in the kg", + "source_ids": [ + 76 + ] + }, + { + "entity_name": 
"e 6", + "entity_type": "TASK_OR_PROBLEM", + "description": "e 6 is an existing entity in the kg that shows a sharp decline in similarity with e 9", + "source_ids": [ + 76 + ] + }, + { + "entity_name": "e 8", + "entity_type": "TASK_OR_PROBLEM", + "description": "e 8 is an existing entity in the kg that shows a sharp decline in similarity with e 9", + "source_ids": [ + 76 + ] + }, + { + "entity_name": "e 5", + "entity_type": "TASK_OR_PROBLEM", + "description": "e 5 is an existing entity in the kg that shows a sharp decline in similarity with e 9", + "source_ids": [ + 76 + ] + }, + { + "entity_name": "e 7", + "entity_type": "TASK_OR_PROBLEM", + "description": "e 7 is the final merged entity resulting from the consolidation of e 9 and e 7", + "source_ids": [ + 76 + ] + }, + { + "entity_name": "similarity curve", + "entity_type": "IMAGE", + "description": "the similarity curve is a visual depiction orange line showing the similarity levels between entities", + "source_ids": [ + 76 + ] + }, + { + "entity_name": "gradient based selection process", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the gradient based selection process is the method used to identify high confidence matches between entities", + "source_ids": [ + 76 + ] + }, + { + "entity_name": "orange line", + "entity_type": "IMAGE", + "description": "the orange line is a specific visual element within the similarity curve mentioned in the text", + "source_ids": [ + 76 + ] + }, + { + "entity_name": "unique high confidence match", + "entity_type": "CONCEPT", + "description": "a unique high confidence match is the result of the gradient based selection process identifying e 7 for e 9", + "source_ids": [ + 76 + ] + }, + { + "entity_name": "consolidated information", + "entity_type": "CONCEPT", + "description": "consolidated information refers to the enriched data resulting from merging entities in the kg", + "source_ids": [ + 76 + ] + } + ], + "relations": [ + { + "src_entity_name": "e 9", + 
"tgt_entity_name": "kg", + "relation_name": "", + "weight": 9.0, + "description": "e 9 is processed within the kg context", + "source_ids": [ + 76 + ] + }, + { + "src_entity_name": "e 9", + "tgt_entity_name": "e 7", + "relation_name": "", + "weight": 10.0, + "description": "e 9 shows high similarity with e 7 and is merged with it", + "source_ids": [ + 76 + ] + }, + { + "src_entity_name": "e 9", + "tgt_entity_name": "e 6", + "relation_name": "", + "weight": 7.0, + "description": "e 9 shows a sharp decline in similarity with e 6", + "source_ids": [ + 76 + ] + }, + { + "src_entity_name": "e 9", + "tgt_entity_name": "e 8", + "relation_name": "", + "weight": 7.0, + "description": "e 9 shows a sharp decline in similarity with e 8", + "source_ids": [ + 76 + ] + }, + { + "src_entity_name": "e 9", + "tgt_entity_name": "e 5", + "relation_name": "", + "weight": 7.0, + "description": "e 9 shows a sharp decline in similarity with e 5", + "source_ids": [ + 76 + ] + }, + { + "src_entity_name": "gradient based selection process", + "tgt_entity_name": "e 7", + "relation_name": "", + "weight": 9.0, + "description": "the gradient based selection process identifies e 7 as the match for e 9", + "source_ids": [ + 76 + ] + }, + { + "src_entity_name": "similarity curve", + "tgt_entity_name": "e 9", + "relation_name": "", + "weight": 8.0, + "description": "the similarity curve depicts the similarity of e 9 with other entities", + "source_ids": [ + 76 + ] + }, + { + "src_entity_name": "similarity curve", + "tgt_entity_name": "e 7", + "relation_name": "", + "weight": 8.0, + "description": "the similarity curve shows e 9 s high similarity with e 7", + "source_ids": [ + 76 + ] + }, + { + "src_entity_name": "similarity curve", + "tgt_entity_name": "orange line", + "relation_name": "", + "weight": 10.0, + "description": "the orange line is the visual representation of the similarity curve described in the text", + "source_ids": [ + 76 + ] + }, + { + "src_entity_name": "gradient based selection 
process", + "tgt_entity_name": "unique high confidence match", + "relation_name": "", + "weight": 10.0, + "description": "the gradient based selection process produces the unique high confidence match", + "source_ids": [ + 76 + ] + }, + { + "src_entity_name": "e 9", + "tgt_entity_name": "unique high confidence match", + "relation_name": "", + "weight": 9.0, + "description": "e 9 is the entity for which the unique high confidence match e 7 is identified", + "source_ids": [ + 76 + ] + }, + { + "src_entity_name": "e 9", + "tgt_entity_name": "consolidated information", + "relation_name": "", + "weight": 8.0, + "description": "the merging of e 9 with e 7 enriches the kg with consolidated information", + "source_ids": [ + 76 + ] + }, + { + "src_entity_name": "e 7", + "tgt_entity_name": "e 7", + "relation_name": "", + "weight": 9.0, + "description": "e 7 is the component of the final merged entity e 7 that incorporates e 9", + "source_ids": [ + 76 + ] + } + ], + "node_idx": 76 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_77.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_77.json new file mode 100644 index 0000000..6e60198 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_77.json @@ -0,0 +1,315 @@ +{ + "entities": [ + { + "entity_name": "gt link", + "entity_type": "TECHNOLOGY", + "description": "gt link is a formalized mechanism denoted as m used to complete the bookindex", + "source_ids": [ + 77 + ] + }, + { + "entity_name": "bookindex", + "entity_type": "PRODUCT", + "description": "bookindex is a structure denoted as b composed of t g and m which gt link helps complete", + "source_ids": [ + 77 + ] + }, + { + "entity_name": "kg construction phase", + "entity_type": "TASK_OR_PROBLEM", + "description": "kg construction phase is a specific stage described in the text where origin tree nodes are recorded for extracted entities", + "source_ids": [ + 77 + ] + }, + { + "entity_name": "entity 
resolution", + "entity_type": "TASK_OR_PROBLEM", + "description": "entity resolution is a process during which gt link is refined by merging entities into canonical entities", + "source_ids": [ + 77 + ] + }, + { + "entity_name": "origin tree node", + "entity_type": "HARDWARE", + "description": "origin tree node is a structural location recorded for every newly extracted entity", + "source_ids": [ + 77 + ] + }, + { + "entity_name": "canonical entity", + "entity_type": "CONCEPT", + "description": "canonical entity is the target of merging during entity resolution receiving updated origin node sets", + "source_ids": [ + 77 + ] + }, + { + "entity_name": "mapping m", + "entity_type": "EQUATION_OR_FORMULA", + "description": "mapping m is the final aggregation process defined as v to p n linking entities to structural locations", + "source_ids": [ + 77 + ] + }, + { + "entity_name": "g", + "entity_type": "CONCEPT", + "description": "g is a component of the bookindex structure b", + "source_ids": [ + 77 + ] + }, + { + "entity_name": "t", + "entity_type": "CONCEPT", + "description": "t is a component of the bookindex structure b and represents the set of structural locations nodes", + "source_ids": [ + 77 + ] + }, + { + "entity_name": "v i", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "v i represents a newly extracted entity for which an origin tree node is recorded", + "source_ids": [ + 77 + ] + }, + { + "entity_name": "v n", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "v n represents an entity that is merged into a canonical entity during entity resolution", + "source_ids": [ + 77 + ] + }, + { + "entity_name": "v sel", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "v sel represents the canonical entity into which v n is merged", + "source_ids": [ + 77 + ] + }, + { + "entity_name": "n", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "n represents the set of nodes in the origin tree t", + "source_ids": [ + 77 + ] + }, + 
{ + "entity_name": "v", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "v represents the set of entities in the graph g", + "source_ids": [ + 77 + ] + }, + { + "entity_name": "p n", + "entity_type": "MATHEMATICAL_CONCEPT", + "description": "p n represents the power set of nodes n used in the definition of the mapping m", + "source_ids": [ + 77 + ] + }, + { + "entity_name": "m", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 77 + ] + } + ], + "relations": [ + { + "src_entity_name": "gt link", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 10.0, + "description": "gt link is formalized to complete the bookindex", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": "kg construction phase", + "tgt_entity_name": "origin tree node", + "relation_name": "", + "weight": 9.0, + "description": "during the kg construction phase origin tree nodes are recorded for newly extracted entities", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": "entity resolution", + "tgt_entity_name": "gt link", + "relation_name": "", + "weight": 9.0, + "description": "gt link is refined during the entity resolution process", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": "entity resolution", + "tgt_entity_name": "canonical entity", + "relation_name": "", + "weight": 9.0, + "description": "during entity resolution entities are merged into a canonical entity", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": "mapping m", + "tgt_entity_name": "g", + "relation_name": "", + "weight": 8.0, + "description": "the mapping m bi directionally links the entities in g to their structural locations", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": "mapping m", + "tgt_entity_name": "t", + "relation_name": "", + "weight": 8.0, + "description": "the mapping m links entities to the set of their structural locations nodes in t", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "g", + 
"relation_name": "", + "weight": 7.0, + "description": "g is a component of the bookindex structure b", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "t", + "relation_name": "", + "weight": 7.0, + "description": "t is a component of the bookindex structure b", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "m", + "relation_name": "", + "weight": 7.0, + "description": "m is a component of the bookindex structure b", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": "kg construction phase", + "tgt_entity_name": "v i", + "relation_name": "", + "weight": 9.0, + "description": "the kg construction phase records the origin tree node for every newly extracted entity v i", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": "entity resolution", + "tgt_entity_name": "v n", + "relation_name": "", + "weight": 9.0, + "description": "during entity resolution the entity v n is merged into a canonical entity", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": "entity resolution", + "tgt_entity_name": "v sel", + "relation_name": "", + "weight": 9.0, + "description": "the entity v n is merged into the canonical entity v sel during entity resolution", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": "v sel", + "tgt_entity_name": "v n", + "relation_name": "", + "weight": 8.0, + "description": "v sel is the target entity that receives the origin nodes previously associated with v n", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": "mapping m", + "tgt_entity_name": "v", + "relation_name": "", + "weight": 9.0, + "description": "the mapping m is defined as a function from the set of entities v", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": "mapping m", + "tgt_entity_name": "p n", + "relation_name": "", + "weight": 9.0, + "description": "the mapping m maps entities to the power set of nodes p n", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": 
"bookindex", + "tgt_entity_name": "v i", + "relation_name": "", + "weight": 6.0, + "description": "the bookindex structure b involves the recording of origin nodes for entities like v i", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": "origin tree node", + "tgt_entity_name": "v i", + "relation_name": "", + "weight": 9.0, + "description": "an origin tree node is recorded specifically for the entity v i", + "source_ids": [ + 77 + ] + }, + { + "src_entity_name": "origin tree node", + "tgt_entity_name": "v sel", + "relation_name": "", + "weight": 8.0, + "description": "the origin node set of v sel is updated to include nodes from v n", + "source_ids": [ + 77 + ] + } + ], + "node_idx": 77 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_78.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_78.json new file mode 100644 index 0000000..43e2bf9 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_78.json @@ -0,0 +1,51 @@ +{ + "entities": [ + { + "entity_name": "5 agent-based retrieval", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of the main paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section details the proposed agent-based query method inspired by Information Foraging Theory, which dynamically classifies queries and employs a tailored retrieval workflow.", + "source_ids": [ + 78 + ] + }, + { + "entity_name": "agent-based query method", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Refers to the specific retrieval strategy introduced in section 5, which utilizes agents to dynamically classify queries based on Information Foraging Theory.", + "source_ids": [ + 78 + ] + }, + { + "entity_name": "information foraging theory", + "entity_type": "SCIENTIFIC_THEORY", + "description": "The theoretical framework inspiring the design of the agent-based query method described in section 
5.", + "source_ids": [ + 78 + ] + } + ], + "relations": [ + { + "src_entity_name": "agent-based query method", + "tgt_entity_name": "5 agent-based retrieval", + "relation_name": "", + "weight": 10.0, + "description": "The 'Agent-Based Query Method' is the primary technical contribution and topic detailed within section 5.", + "source_ids": [ + 78 + ] + }, + { + "src_entity_name": "information foraging theory", + "tgt_entity_name": "5 agent-based retrieval", + "relation_name": "", + "weight": 9.5, + "description": "'Information Foraging Theory' serves as the foundational inspiration for the methods discussed in section 5.", + "source_ids": [ + 78 + ] + } + ], + "node_idx": 78 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_79.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_79.json new file mode 100644 index 0000000..9084080 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_79.json @@ -0,0 +1,167 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "SOFTWARE", + "description": "bookrag is an agent based approach proposed to address complex document queries by planning and executing operations on a bookindex", + "source_ids": [ + 79 + ] + }, + { + "entity_name": "bookindex", + "entity_type": "DATABASE", + "description": "bookindex is the data structure or system on which bookrag executes operations for document queries", + "source_ids": [ + 79 + ] + }, + { + "entity_name": "agent based planning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "agent based planning is a core mechanism in bookrag that formulates strategies for operations", + "source_ids": [ + 79 + ] + }, + { + "entity_name": "structured execution", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "structured execution is a core mechanism in bookrag that includes the retrieval process based on ift and generation principles", + "source_ids": [ + 79 + ] + }, + { + "entity_name": "ift", + 
"entity_type": "METHOD_OR_TECHNIQUE", + "description": "ift is a principle used within the structured execution mechanism of bookrag", + "source_ids": [ + 79 + ] + }, + { + "entity_name": "modal type filtering", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "modal type filtering is an operation mentioned as necessary for addressing complex real world document queries", + "source_ids": [ + 79 + ] + }, + { + "entity_name": "semantic selection", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "semantic selection is an operation mentioned as necessary for addressing complex real world document queries", + "source_ids": [ + 79 + ] + }, + { + "entity_name": "multi hop reasoning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "multi hop reasoning is an operation mentioned as necessary for addressing complex real world document queries", + "source_ids": [ + 79 + ] + }, + { + "entity_name": "generation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "generation is a process included within the structured execution mechanism of bookrag", + "source_ids": [ + 79 + ] + }, + { + "entity_name": "real world document queries", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 79 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "bookrag executes operations on the bookindex to handle document queries", + "source_ids": [ + 79 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "agent based planning", + "relation_name": "", + "weight": 9.0, + "description": "bookrag utilizes agent based planning as one of its two core mechanisms to formulate strategies", + "source_ids": [ + 79 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "structured execution", + "relation_name": "", + "weight": 9.0, + "description": "bookrag utilizes structured execution as one of its two core mechanisms to handle 
retrieval and generation", + "source_ids": [ + 79 + ] + }, + { + "src_entity_name": "structured execution", + "tgt_entity_name": "ift", + "relation_name": "", + "weight": 8.0, + "description": "structured execution includes the retrieval process under the principles of ift", + "source_ids": [ + 79 + ] + }, + { + "src_entity_name": "real world document queries", + "tgt_entity_name": "modal type filtering", + "relation_name": "", + "weight": 8.0, + "description": "real world document queries necessitate operations like modal type filtering", + "source_ids": [ + 79 + ] + }, + { + "src_entity_name": "real world document queries", + "tgt_entity_name": "semantic selection", + "relation_name": "", + "weight": 8.0, + "description": "real world document queries necessitate operations like semantic selection", + "source_ids": [ + 79 + ] + }, + { + "src_entity_name": "real world document queries", + "tgt_entity_name": "multi hop reasoning", + "relation_name": "", + "weight": 8.0, + "description": "real world document queries necessitate operations like multi hop reasoning", + "source_ids": [ + 79 + ] + }, + { + "src_entity_name": "structured execution", + "tgt_entity_name": "generation", + "relation_name": "", + "weight": 9.0, + "description": "structured execution includes the generation process as part of its workflow", + "source_ids": [ + 79 + ] + } + ], + "node_idx": 79 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_8.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_8.json new file mode 100644 index 0000000..2ad887e --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_8.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "1 introduction", + "entity_type": "SECTION_TITLE", + "description": "As the opening section of the paper 'BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents', this section introduces the motivation behind 
Retrieval-Augmented Generation (RAG), highlights limitations in existing approaches regarding hierarchical documents, and presents the proposed BookRAG framework and its key components like BookIndex.", + "source_ids": [ + 8 + ] + } + ], + "relations": [], + "node_idx": 8 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_80.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_80.json new file mode 100644 index 0000000..2a4607f --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_80.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "5.1 overall workflow", + "entity_type": "SECTION_TITLE", + "description": "As a subsection of 'Agent-Based Retrieval', this section introduces the general operational flow of the BookRAG system, outlining how it intelligently plans and executes operations on the BookIndex to handle complex document queries.", + "source_ids": [ + 80 + ] + } + ], + "relations": [], + "node_idx": 80 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_81.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_81.json new file mode 100644 index 0000000..f7c5d91 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_81.json @@ -0,0 +1,51 @@ +{ + "entities": [ + { + "entity_name": "figure 3", + "entity_type": "IMAGE", + "description": "figure 3 is an illustration depicting the workflow of agent based retrieval", + "source_ids": [ + 81 + ] + }, + { + "entity_name": "agent based retrieval", + "entity_type": "TASK_OR_PROBLEM", + "description": "agent based retrieval is a workflow designed to address users queries systematically", + "source_ids": [ + 81 + ] + }, + { + "entity_name": "three stage pipeline", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "the three stage pipeline is the structure of the workflow used to address users queries", + "source_ids": [ + 81 + ] + } + ], + "relations": [ + { + "src_entity_name": 
"figure 3", + "tgt_entity_name": "agent based retrieval", + "relation_name": "", + "weight": 10.0, + "description": "figure 3 illustrates the workflow of agent based retrieval", + "source_ids": [ + 81 + ] + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "three stage pipeline", + "relation_name": "", + "weight": 9.0, + "description": "agent based retrieval follows a three stage pipeline to address queries", + "source_ids": [ + 81 + ] + } + ], + "node_idx": 81 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_82.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_82.json new file mode 100644 index 0000000..e72a76e --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_82.json @@ -0,0 +1,187 @@ +{ + "entities": [ + { + "entity_name": "agent based planning", + "entity_type": "TASK_OR_PROBLEM", + "description": "agent based planning is a stage in the bookrag process that involves classification and planning for queries", + "source_ids": [ + 82 + ] + }, + { + "entity_name": "bookrag", + "entity_type": "SOFTWARE", + "description": "bookrag is a system that performs classification and planning stages to handle queries", + "source_ids": [ + 82 + ] + }, + { + "entity_name": "classification plan", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "classification plan is the specific stage within agent based planning aimed at distinguishing query types", + "source_ids": [ + 82 + ] + }, + { + "entity_name": "transformer", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "transformer is a model architecture mentioned as an example in a query regarding long range dependencies", + "source_ids": [ + 82 + ] + }, + { + "entity_name": "rnns", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "rnns are model architectures mentioned as an example in a query regarding long range dependencies", + "source_ids": [ + 82 + ] + }, + { + "entity_name": "bookindex", + 
"entity_type": "DATASET_OR_CORPUS", + "description": "bookindex is a predefined set of operators used to generate plans for retrieval and generation strategies", + "source_ids": [ + 82 + ] + }, + { + "entity_name": "query classification", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "query classification is a step within the classification plan stage that categorizes queries", + "source_ids": [ + 82 + ] + }, + { + "entity_name": "operators plan", + "entity_type": "TASK_OR_PROBLEM", + "description": "an operators plan is generated to guide retrieval and generation strategies", + "source_ids": [ + 82 + ] + }, + { + "entity_name": "retrieval", + "entity_type": "TASK_OR_PROBLEM", + "description": "retrieval is a strategy guided by the operators plan", + "source_ids": [ + 82 + ] + }, + { + "entity_name": "generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "generation is a strategy guided by the operators plan", + "source_ids": [ + 82 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": "agent based planning", + "relation_name": "", + "weight": 10.0, + "description": "bookrag performs the agent based planning stage as its first step", + "source_ids": [ + 82 + ] + }, + { + "src_entity_name": "agent based planning", + "tgt_entity_name": "classification plan", + "relation_name": "", + "weight": 9.0, + "description": "agent based planning includes the classification plan stage to distinguish query types", + "source_ids": [ + 82 + ] + }, + { + "src_entity_name": "classification plan", + "tgt_entity_name": "transformer", + "relation_name": "", + "weight": 7.0, + "description": "the classification plan stage uses a query comparing transformer and rnns as an example", + "source_ids": [ + 82 + ] + }, + { + "src_entity_name": "classification plan", + "tgt_entity_name": "rnns", + "relation_name": "", + "weight": 7.0, + "description": "the classification plan stage uses a query comparing transformer and rnns as an 
example", + "source_ids": [ + 82 + ] + }, + { + "src_entity_name": "agent based planning", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 8.0, + "description": "agent based planning uses a predefined set of operators designed for the bookindex to generate plans", + "source_ids": [ + 82 + ] + }, + { + "src_entity_name": "classification plan", + "tgt_entity_name": "query classification", + "relation_name": "", + "weight": 9.0, + "description": "classification plan performs query classification to distinguish query types", + "source_ids": [ + 82 + ] + }, + { + "src_entity_name": "query classification", + "tgt_entity_name": "operators plan", + "relation_name": "", + "weight": 8.0, + "description": "the operators plan is generated based on the results of query classification", + "source_ids": [ + 82 + ] + }, + { + "src_entity_name": "operators plan", + "tgt_entity_name": "retrieval", + "relation_name": "", + "weight": 8.0, + "description": "the operators plan guides the retrieval strategy", + "source_ids": [ + 82 + ] + }, + { + "src_entity_name": "operators plan", + "tgt_entity_name": "generation", + "relation_name": "", + "weight": 8.0, + "description": "the operators plan guides the generation strategy", + "source_ids": [ + 82 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "operators plan", + "relation_name": "", + "weight": 7.0, + "description": "the operators plan is designed for the bookindex", + "source_ids": [ + 82 + ] + } + ], + "node_idx": 82 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_83.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_83.json new file mode 100644 index 0000000..130c921 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_83.json @@ -0,0 +1,189 @@ +{ + "entities": [ + { + "entity_name": "figure 3", + "entity_type": "IMAGE", + "description": "figure 3 is an image illustrating the general workflow of agent based retrieval in 
bookrag", + "source_ids": [ + 83 + ] + }, + { + "entity_name": "bookrag", + "entity_type": "SOFTWARE", + "description": "bookrag is a system that utilizes agent based planning retrieval and generation processes", + "source_ids": [ + 83 + ] + }, + { + "entity_name": "agent based retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "agent based retrieval is a workflow containing planning retrieval and generation processes used in bookrag", + "source_ids": [ + 83 + ] + }, + { + "entity_name": "agent based planning", + "entity_type": "TASK_OR_PROBLEM", + "description": "agent based planning is a process component within the agent based retrieval workflow of bookrag", + "source_ids": [ + 83 + ] + }, + { + "entity_name": "retrieval", + "entity_type": "TASK_OR_PROBLEM", + "description": "retrieval is a process component within the agent based retrieval workflow of bookrag", + "source_ids": [ + 83 + ] + }, + { + "entity_name": "generation", + "entity_type": "TASK_OR_PROBLEM", + "description": "generation is a process component within the agent based retrieval workflow of bookrag", + "source_ids": [ + 83 + ] + }, + { + "entity_name": "workflow", + "entity_type": "TASK_OR_PROBLEM", + "description": "workflow refers to the general process flow of agent based retrieval in bookrag", + "source_ids": [ + 83 + ] + }, + { + "entity_name": "planning", + "entity_type": "TASK_OR_PROBLEM", + "description": "planning is a specific step within the agent based retrieval process", + "source_ids": [ + 83 + ] + }, + { + "entity_name": "generation processes", + "entity_type": "TASK_OR_PROBLEM", + "description": "generation processes are a component of the agent based retrieval workflow in bookrag", + "source_ids": [ + 83 + ] + } + ], + "relations": [ + { + "src_entity_name": "figure 3", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 10.0, + "description": "figure 3 depicts the workflow of bookrag", + "source_ids": [ + 83 + ] + }, + { + "src_entity_name": 
"figure 3", + "tgt_entity_name": "agent based retrieval", + "relation_name": "", + "weight": 10.0, + "description": "figure 3 illustrates the general workflow of agent based retrieval", + "source_ids": [ + 83 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "agent based retrieval", + "relation_name": "", + "weight": 10.0, + "description": "bookrag contains the agent based retrieval workflow", + "source_ids": [ + 83 + ] + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "agent based planning", + "relation_name": "", + "weight": 9.0, + "description": "agent based retrieval contains the agent based planning process", + "source_ids": [ + 83 + ] + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "retrieval", + "relation_name": "", + "weight": 9.0, + "description": "agent based retrieval contains the retrieval process", + "source_ids": [ + 83 + ] + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "generation", + "relation_name": "", + "weight": 9.0, + "description": "agent based retrieval contains the generation process", + "source_ids": [ + 83 + ] + }, + { + "src_entity_name": "figure 3", + "tgt_entity_name": "workflow", + "relation_name": "", + "weight": 10.0, + "description": "figure 3 depicts the general workflow", + "source_ids": [ + 83 + ] + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "workflow", + "relation_name": "", + "weight": 9.0, + "description": "agent based retrieval is described as a general workflow", + "source_ids": [ + 83 + ] + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "planning", + "relation_name": "", + "weight": 9.0, + "description": "agent based retrieval includes planning as a process", + "source_ids": [ + 83 + ] + }, + { + "src_entity_name": "agent based retrieval", + "tgt_entity_name": "generation processes", + "relation_name": "", + "weight": 9.0, + "description": "agent based retrieval 
includes generation processes as a component", + "source_ids": [ + 83 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "workflow", + "relation_name": "", + "weight": 9.0, + "description": "bookrag contains the general workflow of agent based retrieval", + "source_ids": [ + 83 + ] + } + ], + "node_idx": 83 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_84.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_84.json new file mode 100644 index 0000000..9adb90a --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_84.json @@ -0,0 +1,105 @@ +{ + "entities": [ + { + "entity_name": "cref='#/texts/89'", + "entity_type": "IMAGE", + "description": "A flowchart diagram illustrating a three-stage process involving planning, retrieval, and generation to answer a question.", + "source_ids": [ + 84 + ] + }, + { + "entity_name": "question", + "entity_type": "TASK_OR_PROBLEM", + "description": "The input trigger for the system, represented by an icon of a person with a question mark.", + "source_ids": [ + 84 + ] + }, + { + "entity_name": "agent-based planning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The first stage of the process, which handles classification and planning tasks.", + "source_ids": [ + 84 + ] + }, + { + "entity_name": "retrieval process", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The second stage of the process, utilizing scent or filter-based mechanisms to retrieve information.", + "source_ids": [ + 84 + ] + }, + { + "entity_name": "generation process", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "The third stage of the process, responsible for analysis and merging data to form the output.", + "source_ids": [ + 84 + ] + }, + { + "entity_name": "answer", + "entity_type": "TASK_OR_PROBLEM", + "description": "The final output of the system, symbolized by a lightbulb icon.", + "source_ids": [ + 84 + ] + } + ], + "relations": [ + { + 
"src_entity_name": "cref='#/texts/89'", + "tgt_entity_name": "question", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/89' related to Question", + "source_ids": [ + 84 + ] + }, + { + "src_entity_name": "cref='#/texts/89'", + "tgt_entity_name": "agent-based planning", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/89' related to Agent-based Planning", + "source_ids": [ + 84 + ] + }, + { + "src_entity_name": "cref='#/texts/89'", + "tgt_entity_name": "retrieval process", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/89' related to Retrieval Process", + "source_ids": [ + 84 + ] + }, + { + "src_entity_name": "cref='#/texts/89'", + "tgt_entity_name": "generation process", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/89' related to Generation Process", + "source_ids": [ + 84 + ] + }, + { + "src_entity_name": "cref='#/texts/89'", + "tgt_entity_name": "answer", + "relation_name": "", + "weight": 9.0, + "description": "Image entity cref='#/texts/89' related to Answer", + "source_ids": [ + 84 + ] + } + ], + "node_idx": 84 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_85.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_85.json new file mode 100644 index 0000000..c89e19e --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_85.json @@ -0,0 +1,215 @@ +{ + "entities": [ + { + "entity_name": "retrieval process", + "entity_type": "TASK_OR_PROBLEM", + "description": "retrieval process is a stage guided by an operator plan that executes scent filter based retrieval", + "source_ids": [ + 85 + ] + }, + { + "entity_name": "scent filter based retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "scent filter based retrieval is the specific method executed during the retrieval process to find information", + "source_ids": [ + 85 + ] + }, + { + 
"entity_name": "bookindex", + "entity_type": "DATASET_OR_CORPUS", + "description": "bookindex is a data structure represented as b t g m that is navigated during the retrieval process", + "source_ids": [ + 85 + ] + }, + { + "entity_name": "bookrag", + "entity_type": "SOFTWARE", + "description": "bookrag is the system that obtains the retrieval set of highly relevant information blocks after reasoning", + "source_ids": [ + 85 + ] + }, + { + "entity_name": "t", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "t is a component of the bookindex structure", + "source_ids": [ + 85 + ] + }, + { + "entity_name": "g", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "g is a component of the bookindex structure containing relevant entities", + "source_ids": [ + 85 + ] + }, + { + "entity_name": "m", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "m is a component of the bookindex structure", + "source_ids": [ + 85 + ] + }, + { + "entity_name": "operator plan", + "entity_type": "TASK_OR_PROBLEM", + "description": "operator plan is the guiding document or set of instructions for the retrieval process", + "source_ids": [ + 85 + ] + }, + { + "entity_name": "modal type", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "modal type is a specific filter used to refine the selection of information during retrieval", + "source_ids": [ + 85 + ] + }, + { + "entity_name": "relevant entities", + "entity_type": "DATASET_OR_CORPUS", + "description": "relevant entities are the items found in g that are followed during scent based retrieval", + "source_ids": [ + 85 + ] + }, + { + "entity_name": "information blocks", + "entity_type": "DATASET_OR_CORPUS", + "description": "information blocks are the highly relevant units of data retrieved by bookrag", + "source_ids": [ + 85 + ] + } + ], + "relations": [ + { + "src_entity_name": "retrieval process", + "tgt_entity_name": "scent filter based retrieval", + "relation_name": "", + "weight": 10.0, + 
"description": "the retrieval process executes the scent filter based retrieval method", + "source_ids": [ + 85 + ] + }, + { + "src_entity_name": "retrieval process", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "the retrieval process navigates the bookindex to find information", + "source_ids": [ + 85 + ] + }, + { + "src_entity_name": "scent filter based retrieval", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "scent filter based retrieval utilizes the bookindex to find information", + "source_ids": [ + 85 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "t", + "relation_name": "", + "weight": 8.0, + "description": "t is a component of the bookindex structure", + "source_ids": [ + 85 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "g", + "relation_name": "", + "weight": 8.0, + "description": "g is a component of the bookindex structure", + "source_ids": [ + 85 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "m", + "relation_name": "", + "weight": 8.0, + "description": "m is a component of the bookindex structure", + "source_ids": [ + 85 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 9.0, + "description": "bookrag gets the retrieval set from the bookindex", + "source_ids": [ + 85 + ] + }, + { + "src_entity_name": "retrieval process", + "tgt_entity_name": "operator plan", + "relation_name": "", + "weight": 10.0, + "description": "the retrieval process is guided by the operator plan", + "source_ids": [ + 85 + ] + }, + { + "src_entity_name": "scent filter based retrieval", + "tgt_entity_name": "modal type", + "relation_name": "", + "weight": 9.0, + "description": "scent filter based retrieval employs modal type as a filter to refine selection", + "source_ids": [ + 85 + ] + }, + { + "src_entity_name": "scent filter based retrieval", + "tgt_entity_name": 
"relevant entities", + "relation_name": "", + "weight": 9.0, + "description": "scent based retrieval follows relevant entities in g to find information", + "source_ids": [ + 85 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "information blocks", + "relation_name": "", + "weight": 10.0, + "description": "bookrag obtains the retrieval set of highly relevant information blocks", + "source_ids": [ + 85 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "relevant entities", + "relation_name": "", + "weight": 8.0, + "description": "relevant entities are contained within the g component of the bookindex", + "source_ids": [ + 85 + ] + } + ], + "node_idx": 85 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_86.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_86.json new file mode 100644 index 0000000..c641c31 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_86.json @@ -0,0 +1,97 @@ +{ + "entities": [ + { + "entity_name": "generation process", + "entity_type": "TASK_OR_PROBLEM", + "description": "generation process is the final stage where retrieved information is synthesized and analyzed to formulate a coherent response", + "source_ids": [ + 86 + ] + }, + { + "entity_name": "analysis merging", + "entity_type": "TASK_OR_PROBLEM", + "description": "analysis merging is the specific activity within the generation stage that synthesizes fragmented evidence", + "source_ids": [ + 86 + ] + }, + { + "entity_name": "retrieved information", + "entity_type": "DATASET_OR_CORPUS", + "description": "retrieved information refers to the data collected and brought into the generation stage for processing", + "source_ids": [ + 86 + ] + }, + { + "entity_name": "fragmented pieces of evidence", + "entity_type": "DATASET_OR_CORPUS", + "description": "fragmented pieces of evidence are the specific incomplete data items that are synthesized during the process", + "source_ids": [ + 86 + ] + 
}, + { + "entity_name": "coherent response", + "entity_type": "PRODUCT", + "description": "coherent response is the final output formulated by the generation stage after analysis", + "source_ids": [ + 86 + ] + } + ], + "relations": [ + { + "src_entity_name": "generation process", + "tgt_entity_name": "analysis merging", + "relation_name": "", + "weight": 9.0, + "description": "analysis merging is a sub stage or activity performed within the generation process", + "source_ids": [ + 86 + ] + }, + { + "src_entity_name": "retrieved information", + "tgt_entity_name": "generation process", + "relation_name": "", + "weight": 10.0, + "description": "retrieved information enters the generation process as its primary input", + "source_ids": [ + 86 + ] + }, + { + "src_entity_name": "fragmented pieces of evidence", + "tgt_entity_name": "analysis merging", + "relation_name": "", + "weight": 9.0, + "description": "analysis merging synthesizes the fragmented pieces of evidence", + "source_ids": [ + 86 + ] + }, + { + "src_entity_name": "generation process", + "tgt_entity_name": "coherent response", + "relation_name": "", + "weight": 10.0, + "description": "the generation process formulates the coherent response as its final output", + "source_ids": [ + 86 + ] + }, + { + "src_entity_name": "analysis merging", + "tgt_entity_name": "coherent response", + "relation_name": "", + "weight": 8.0, + "description": "analysis merging contributes to the formulation of the coherent response through final analysis", + "source_ids": [ + 86 + ] + } + ], + "node_idx": 86 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_87.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_87.json new file mode 100644 index 0000000..36bbb4c --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_87.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "5.2 agent-based planning", + "entity_type": "SECTION_TITLE", + "description": "As a 
subsection of 'Agent-Based Retrieval' (Section 5), this section details the strategy formulation mechanism within the BookRAG framework, explaining how an agent intelligently plans operations for complex document queries.", + "source_ids": [ + 87 + ] + }, + { + "entity_name": "agent-based planning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "Refers to the specific methodology described in section 5.2 where an agent formulates strategies to handle complex retrieval tasks involving modal filtering and multi-hop reasoning.", + "source_ids": [ + 87 + ] + } + ], + "relations": [ + { + "src_entity_name": "agent-based planning", + "tgt_entity_name": "5.2 agent-based planning", + "relation_name": "", + "weight": 10.0, + "description": "The concept of 'Agent-based Planning' is the primary topic and subject matter of section 5.2.", + "source_ids": [ + 87 + ] + } + ], + "node_idx": 87 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_88.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_88.json new file mode 100644 index 0000000..18f4ebc --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_88.json @@ -0,0 +1,261 @@ +{ + "entities": [ + { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a system designed to intelligently navigate a bookindex and adapt to query requirements", + "source_ids": [ + 88 + ] + }, + { + "entity_name": "bookindex", + "entity_type": "DATASET_OR_CORPUS", + "description": "bookindex is a data structure represented as t g m that bookrag navigates", + "source_ids": [ + 88 + ] + }, + { + "entity_name": "formulator", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "formulator is one of four types of operators defined to support flexible retrieval in bookrag", + "source_ids": [ + 88 + ] + }, + { + "entity_name": "selector", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "selector is one of four types of operators defined 
to support flexible retrieval in bookrag", + "source_ids": [ + 88 + ] + }, + { + "entity_name": "reasoner", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "reasoner is one of four types of operators defined to support flexible retrieval in bookrag", + "source_ids": [ + 88 + ] + }, + { + "entity_name": "synthesizer", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "synthesizer is one of four types of operators defined to support flexible retrieval in bookrag", + "source_ids": [ + 88 + ] + }, + { + "entity_name": "agent", + "entity_type": "TASK_OR_PROBLEM", + "description": "the agent is an entity that performs the first step of the sequential process in bookrag", + "source_ids": [ + 88 + ] + }, + { + "entity_name": "t", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "t is a component of the bookindex data structure b t g m", + "source_ids": [ + 88 + ] + }, + { + "entity_name": "g", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "g is a component of the bookindex data structure b t g m", + "source_ids": [ + 88 + ] + }, + { + "entity_name": "m", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "m is a component of the bookindex data structure b t g m", + "source_ids": [ + 88 + ] + }, + { + "entity_name": "query categories", + "entity_type": "TASK_OR_PROBLEM", + "description": "query categories are specific requirements that bookrag adapts to using its operators", + "source_ids": [ + 88 + ] + }, + { + "entity_name": "execution pipelines", + "entity_type": "TASK_OR_PROBLEM", + "description": "execution pipelines are formed by combining operators to support flexible retrieval", + "source_ids": [ + 88 + ] + }, + { + "entity_name": "adjustable parameters", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "adjustable parameters are attributes of the execution pipelines that can be configured", + "source_ids": [ + 88 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookrag", + "tgt_entity_name": 
"bookindex", + "relation_name": "", + "weight": 10.0, + "description": "bookrag is designed to intelligently navigate the bookindex", + "source_ids": [ + 88 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "formulator", + "relation_name": "", + "weight": 9.0, + "description": "bookrag defines the formulator as one of its four types of operators", + "source_ids": [ + 88 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "selector", + "relation_name": "", + "weight": 9.0, + "description": "bookrag defines the selector as one of its four types of operators", + "source_ids": [ + 88 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 9.0, + "description": "bookrag defines the reasoner as one of its four types of operators", + "source_ids": [ + 88 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "synthesizer", + "relation_name": "", + "weight": 9.0, + "description": "bookrag defines the synthesizer as one of its four types of operators", + "source_ids": [ + 88 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "agent", + "relation_name": "", + "weight": 8.0, + "description": "the agent performs the first step of the process within bookrag", + "source_ids": [ + 88 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "t", + "relation_name": "", + "weight": 10.0, + "description": "t is a defined component within the bookindex structure", + "source_ids": [ + 88 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "g", + "relation_name": "", + "weight": 10.0, + "description": "g is a defined component within the bookindex structure", + "source_ids": [ + 88 + ] + }, + { + "src_entity_name": "bookindex", + "tgt_entity_name": "m", + "relation_name": "", + "weight": 10.0, + "description": "m is a defined component within the bookindex structure", + "source_ids": [ + 88 + ] + }, + { + "src_entity_name": "formulator", + 
"tgt_entity_name": "execution pipelines", + "relation_name": "", + "weight": 9.0, + "description": "the formulator operator is combined to form tailored execution pipelines", + "source_ids": [ + 88 + ] + }, + { + "src_entity_name": "selector", + "tgt_entity_name": "execution pipelines", + "relation_name": "", + "weight": 9.0, + "description": "the selector operator is combined to form tailored execution pipelines", + "source_ids": [ + 88 + ] + }, + { + "src_entity_name": "reasoner", + "tgt_entity_name": "execution pipelines", + "relation_name": "", + "weight": 9.0, + "description": "the reasoner operator is combined to form tailored execution pipelines", + "source_ids": [ + 88 + ] + }, + { + "src_entity_name": "synthesizer", + "tgt_entity_name": "execution pipelines", + "relation_name": "", + "weight": 9.0, + "description": "the synthesizer operator is combined to form tailored execution pipelines", + "source_ids": [ + 88 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "query categories", + "relation_name": "", + "weight": 9.0, + "description": "bookrag dynamically configures operators to adapt to the specific requirements of different query categories", + "source_ids": [ + 88 + ] + }, + { + "src_entity_name": "execution pipelines", + "tgt_entity_name": "adjustable parameters", + "relation_name": "", + "weight": 8.0, + "description": "execution pipelines are created with adjustable parameters", + "source_ids": [ + 88 + ] + } + ], + "node_idx": 88 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_89.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_89.json new file mode 100644 index 0000000..fdb24cb --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_89.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "table 2", + "entity_type": "TABLE", + "description": "table 2 is a table that lists three common query categories addressed in bookrag", + "source_ids": [ + 89 + ] + }, 
+ { + "entity_name": "bookrag", + "entity_type": "PRODUCT", + "description": "bookrag is a product or system that addresses three common query categories", + "source_ids": [ + 89 + ] + } + ], + "relations": [ + { + "src_entity_name": "table 2", + "tgt_entity_name": "bookrag", + "relation_name": "", + "weight": 9.0, + "description": "table 2 details query categories that are addressed within the bookrag system", + "source_ids": [ + 89 + ] + } + ], + "node_idx": 89 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_9.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_9.json new file mode 100644 index 0000000..b3267d6 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_9.json @@ -0,0 +1,143 @@ +{ + "entities": [ + { + "entity_name": "large language models", + "entity_type": "TECHNOLOGY", + "description": "large language models are a type of technology that has revolutionized question answering systems", + "source_ids": [ + 9 + ] + }, + { + "entity_name": "qwen 3", + "entity_type": "PRODUCT", + "description": "qwen 3 is a specific large language model mentioned as an example", + "source_ids": [ + 9 + ] + }, + { + "entity_name": "gemini 2 5", + "entity_type": "PRODUCT", + "description": "gemini 2 5 is a specific large language model mentioned as an example", + "source_ids": [ + 9 + ] + }, + { + "entity_name": "question answering", + "entity_type": "TASK_OR_PROBLEM", + "description": "question answering is a system or task that has been revolutionized by large language models", + "source_ids": [ + 9 + ] + }, + { + "entity_name": "industry", + "entity_type": "ORGANIZATION", + "description": "the industry refers to the collective group of organizations increasingly adopting llms for qa systems", + "source_ids": [ + 9 + ] + }, + { + "entity_name": "qa system", + "entity_type": "PRODUCT", + "description": "qa system is a product built using llms to assist users and reduce manual effort", + "source_ids": [ + 9 
+ ] + }, + { + "entity_name": "users", + "entity_type": "PERSON", + "description": "users are the individuals who are assisted by the qa systems built by the industry", + "source_ids": [ + 9 + ] + } + ], + "relations": [ + { + "src_entity_name": "large language models", + "tgt_entity_name": "qwen 3", + "relation_name": "", + "weight": 10.0, + "description": "qwen 3 is identified as an example of a large language model", + "source_ids": [ + 9 + ] + }, + { + "src_entity_name": "large language models", + "tgt_entity_name": "gemini 2 5", + "relation_name": "", + "weight": 10.0, + "description": "gemini 2 5 is identified as an example of a large language model", + "source_ids": [ + 9 + ] + }, + { + "src_entity_name": "large language models", + "tgt_entity_name": "question answering", + "relation_name": "", + "weight": 9.0, + "description": "large language models have revolutionized the question answering system", + "source_ids": [ + 9 + ] + }, + { + "src_entity_name": "industry", + "tgt_entity_name": "large language models", + "relation_name": "", + "weight": 8.0, + "description": "the industry is adopting large language models to build question answering systems", + "source_ids": [ + 9 + ] + }, + { + "src_entity_name": "industry", + "tgt_entity_name": "question answering", + "relation_name": "", + "weight": 7.0, + "description": "the industry is building question answering systems to assist users and reduce manual effort", + "source_ids": [ + 9 + ] + }, + { + "src_entity_name": "industry", + "tgt_entity_name": "qa system", + "relation_name": "", + "weight": 9.0, + "description": "the industry builds qa systems to assist users and reduce manual effort", + "source_ids": [ + 9 + ] + }, + { + "src_entity_name": "qa system", + "tgt_entity_name": "users", + "relation_name": "", + "weight": 8.0, + "description": "qa systems are designed to assist users", + "source_ids": [ + 9 + ] + }, + { + "src_entity_name": "large language models", + "tgt_entity_name": "qa system", + 
"relation_name": "", + "weight": 9.0, + "description": "large language models are used to build qa systems", + "source_ids": [ + 9 + ] + } + ], + "node_idx": 9 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_90.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_90.json new file mode 100644 index 0000000..516a19a --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_90.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "table: cref='#/texts/95'...", + "entity_type": "TABLE", + "description": "A data table described as: cref='#/texts/95'", + "source_ids": [ + 90 + ] + } + ], + "relations": [], + "node_idx": 90 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_91.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_91.json new file mode 100644 index 0000000..8a3b676 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_91.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "6", + "entity_type": "MEASUREMENT", + "description": "6 is a numerical value mentioned in the text potentially representing a count or measurement", + "source_ids": [ + 91 + ] + } + ], + "relations": [], + "node_idx": 91 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_92.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_92.json new file mode 100644 index 0000000..47c34db --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_92.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "operator set", + "entity_type": "TASK_OR_PROBLEM", + "description": "operator set is a task or problem mentioned in the text likely referring to a specific set of operators in a technical context", + "source_ids": [ + 92 + ] + } + ], + "relations": [], + "node_idx": 92 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_93.json 
b/e2e_test_output/kg_extractor_res/kg_extractor_res_93.json new file mode 100644 index 0000000..0f21d14 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_93.json @@ -0,0 +1,223 @@ +{ + "entities": [ + { + "entity_name": "figure 4", + "entity_type": "IMAGE", + "description": "figure 4 is an image depicting the bookrag operator library and an execution example", + "source_ids": [ + 93 + ] + }, + { + "entity_name": "bookrag operator library", + "entity_type": "SOFTWARE", + "description": "the bookrag operator library is a software component containing four operator types", + "source_ids": [ + 93 + ] + }, + { + "entity_name": "mmlongbench dataset", + "entity_type": "DATASET_OR_CORPUS", + "description": "the mmlongbench dataset is the source of the execution example shown in the text", + "source_ids": [ + 93 + ] + }, + { + "entity_name": "formulator", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "formulator is one of the four operator types depicted in the bookrag operator library", + "source_ids": [ + 93 + ] + }, + { + "entity_name": "selector", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "selector is one of the four operator types depicted in the bookrag operator library", + "source_ids": [ + 93 + ] + }, + { + "entity_name": "reasoner", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "reasoner is one of the four operator types depicted in the bookrag operator library", + "source_ids": [ + 93 + ] + }, + { + "entity_name": "synthesizer", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "synthesizer is one of the four operator types depicted in the bookrag operator library", + "source_ids": [ + 93 + ] + }, + { + "entity_name": "single hop", + "entity_type": "TASK_OR_PROBLEM", + "description": "single hop is a type of query for which an execution trace is demonstrated", + "source_ids": [ + 93 + ] + }, + { + "entity_name": "operator", + "entity_type": "MODEL_OR_ARCHITECTURE", + "description": "operator 
is a general term for the components formulator selector reasoner synthesizer within the bookrag system", + "source_ids": [ + 93 + ] + }, + { + "entity_name": "execution trace", + "entity_type": "TASK_OR_PROBLEM", + "description": "execution trace is the step by step record of the agent based planning and operator execution shown in the text", + "source_ids": [ + 93 + ] + }, + { + "entity_name": "agent based planning", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "agent based planning is the method used for planning demonstrated in the execution trace", + "source_ids": [ + 93 + ] + }, + { + "entity_name": "step by step operator execution", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "step by step operator execution is the method of executing operators demonstrated in the text", + "source_ids": [ + 93 + ] + } + ], + "relations": [ + { + "src_entity_name": "figure 4", + "tgt_entity_name": "bookrag operator library", + "relation_name": "", + "weight": 10.0, + "description": "figure 4 visually depicts the bookrag operator library", + "source_ids": [ + 93 + ] + }, + { + "src_entity_name": "figure 4", + "tgt_entity_name": "mmlongbench dataset", + "relation_name": "", + "weight": 9.0, + "description": "figure 4 shows an execution example derived from the mmlongbench dataset", + "source_ids": [ + 93 + ] + }, + { + "src_entity_name": "bookrag operator library", + "tgt_entity_name": "formulator", + "relation_name": "", + "weight": 10.0, + "description": "the bookrag operator library contains the formulator operator type", + "source_ids": [ + 93 + ] + }, + { + "src_entity_name": "bookrag operator library", + "tgt_entity_name": "selector", + "relation_name": "", + "weight": 10.0, + "description": "the bookrag operator library contains the selector operator type", + "source_ids": [ + 93 + ] + }, + { + "src_entity_name": "bookrag operator library", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 10.0, + "description": "the bookrag 
operator library contains the reasoner operator type", + "source_ids": [ + 93 + ] + }, + { + "src_entity_name": "bookrag operator library", + "tgt_entity_name": "synthesizer", + "relation_name": "", + "weight": 10.0, + "description": "the bookrag operator library contains the synthesizer operator type", + "source_ids": [ + 93 + ] + }, + { + "src_entity_name": "figure 4", + "tgt_entity_name": "single hop", + "relation_name": "", + "weight": 9.0, + "description": "figure 4 demonstrates an execution trace for a single hop query", + "source_ids": [ + 93 + ] + }, + { + "src_entity_name": "bookrag operator library", + "tgt_entity_name": "operator", + "relation_name": "", + "weight": 8.0, + "description": "the bookrag operator library is composed of specific operators", + "source_ids": [ + 93 + ] + }, + { + "src_entity_name": "figure 4", + "tgt_entity_name": "execution trace", + "relation_name": "", + "weight": 9.0, + "description": "figure 4 contains an execution trace for a single hop query", + "source_ids": [ + 93 + ] + }, + { + "src_entity_name": "execution trace", + "tgt_entity_name": "agent based planning", + "relation_name": "", + "weight": 9.0, + "description": "the execution trace demonstrates agent based planning", + "source_ids": [ + 93 + ] + }, + { + "src_entity_name": "execution trace", + "tgt_entity_name": "step by step operator execution", + "relation_name": "", + "weight": 9.0, + "description": "the execution trace demonstrates step by step operator execution", + "source_ids": [ + 93 + ] + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "execution trace", + "relation_name": "", + "weight": 10.0, + "description": "the execution trace is specifically for a single hop query", + "source_ids": [ + 93 + ] + } + ], + "node_idx": 93 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_94.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_94.json new file mode 100644 index 0000000..8d87aa7 --- /dev/null 
+++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_94.json @@ -0,0 +1,537 @@ +{ + "entities": [ + { + "entity_name": "operator-set", + "entity_type": "IMAGE", + "description": "A diagram illustrating a framework for processing queries, divided into an 'Operators' section and an 'Execution example' section.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "extract", + "entity_type": "TASK_OR_PROBLEM", + "description": "The initial step in the operator set where questions are decomposed to identify entities.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "decompose", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A process within the Extract phase that breaks down a query into sub-queries.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "entities", + "entity_type": "DATASET_OR_CORPUS", + "description": "The output of the Extract phase, representing distinct items identified from the input text.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "sub-queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "Smaller queries generated during the decomposition process.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "formulator", + "entity_type": "SYSTEM_COMPONENT", + "description": "The component or agent responsible for the extraction and decomposition steps.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "filter", + "entity_type": "TASK_OR_PROBLEM", + "description": "An operator that processes data structures like trees to select relevant information.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "select", + "entity_type": "TASK_OR_PROBLEM", + "description": "The action performed by the Filter operator to choose specific elements.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "selector", + "entity_type": "SYSTEM_COMPONENT", + "description": "The component responsible for filtering and selecting data based on criteria.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "reason", + 
"entity_type": "TASK_OR_PROBLEM", + "description": "An operator that takes Graph and Text inputs to perform reasoning tasks.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "graph", + "entity_type": "DATA_STRUCTURE", + "description": "A visual representation of data used as input for the Reason operator.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "text", + "entity_type": "DATA_STRUCTURE", + "description": "Raw textual data used as input for the Reason operator.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "s:", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "A label indicating a score or similarity matrix with values such as 0.6, 0.5, 0.4.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "skyline", + "entity_type": "TASK_OR_PROBLEM", + "description": "An operator that processes ranked lists (S1, S2) to find optimal solutions.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "reasoner", + "entity_type": "SYSTEM_COMPONENT", + "description": "The component executing the Reason and Skyline operations.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "map", + "entity_type": "TASK_OR_PROBLEM", + "description": "An operator that transforms data using icons representing different formats.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "reduce", + "entity_type": "TASK_OR_PROBLEM", + "description": "An operator that combines multiple inputs into a single result.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "synthesizer", + "entity_type": "SYSTEM_COMPONENT", + "description": "The final component that aggregates results into a coherent answer.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "execution example", + "entity_type": "SECTION_TITLE", + "description": "A subsection of the diagram showing a concrete application of the operator set.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "q: what is the type of car in the ranking prompt example?", + "entity_type": "TASK_OR_PROBLEM", + 
"description": "The specific user question being processed in the execution example.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "planning", + "entity_type": "TASK_OR_PROBLEM", + "description": "The phase where a plan is formulated to solve the query.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "simple query...", + "entity_type": "TASK_OR_PROBLEM", + "description": "A classification of the input query.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "operator plan", + "entity_type": "TASK_OR_PROBLEM", + "description": "The sequence of operators chosen to solve the problem: Extract->Select->Reason->Skyline->Map...", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "car", + "entity_type": "PRODUCT", + "description": "An entity extracted from the question.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "ranking prompt", + "entity_type": "BOOK", + "description": "An entity mentioned in the question context.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "method", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "A node in the planning graph representing the method to be used.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "method and its descendants", + "entity_type": "SECTION_TITLE", + "description": "A grouping of nodes related to the Method in the execution flow.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "a: based on the provided information...", + "entity_type": "TASK_OR_PROBLEM", + "description": "The final answer generated by the system.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "mercedes-benz e-class sedan", + "entity_type": "VEHICLE", + "description": "The specific car type identified as the correct answer in the example.", + "source_ids": [ + 94 + ] + }, + { + "entity_name": "image cref='#/texts/98'", + "entity_type": "UNKNOWN", + "description": "", + "source_ids": [ + 94 + ] + } + ], + "relations": [ + { + "src_entity_name": "image cref='#/texts/98'", + 
"tgt_entity_name": "operator-set", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Operator-Set", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "extract", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Extract", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "decompose", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Decompose", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "entities", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Entities", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "sub-queries", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Sub-queries", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "formulator", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Formulator", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "filter", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Filter", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "select", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Select", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "selector", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image 
cref='#/texts/98' related to Selector", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "reason", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Reason", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "graph", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Graph", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "text", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Text", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "s:", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to S:", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "skyline", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Skyline", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "reasoner", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Reasoner", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "map", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Map", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "reduce", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Reduce", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "synthesizer", + 
"relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Synthesizer", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "execution example", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Execution example", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "q: what is the type of car in the ranking prompt example?", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Q: What is the type of car in the Ranking Prompt example?", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "planning", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Planning", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "simple query...", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Simple query...", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "operator plan", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Operator Plan", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "car", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Car", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "ranking prompt", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Ranking Prompt", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + 
"tgt_entity_name": "method", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Method", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "method and its descendants", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Method and its Descendants", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "a: based on the provided information...", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to A: Based on the provided information...", + "source_ids": [ + 94 + ] + }, + { + "src_entity_name": "image cref='#/texts/98'", + "tgt_entity_name": "mercedes-benz e-class sedan", + "relation_name": "", + "weight": 9.0, + "description": "Image entity Image cref='#/texts/98' related to Mercedes-Benz E-Class Sedan", + "source_ids": [ + 94 + ] + } + ], + "node_idx": 94 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_95.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_95.json new file mode 100644 index 0000000..e859666 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_95.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "entity_name": "query classification", + "entity_type": "TASK_OR_PROBLEM", + "description": "query classification is a task mentioned in the text used to determine the appropriate solution strategy", + "source_ids": [ + 95 + ] + }, + { + "entity_name": "operator plan", + "entity_type": "PRODUCT", + "description": "operator plan is a specific output generated after determining the solution strategy", + "source_ids": [ + 95 + ] + } + ], + "relations": [ + { + "src_entity_name": "query classification", + "tgt_entity_name": "operator plan", + "relation_name": "", + "weight": 9.0, + "description": "query classification is performed to 
generate a specific operator plan", + "source_ids": [ + 95 + ] + } + ], + "node_idx": 95 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_96.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_96.json new file mode 100644 index 0000000..133f856 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_96.json @@ -0,0 +1,267 @@ +{ + "entities": [ + { + "entity_name": "query classification", + "entity_type": "TASK_OR_PROBLEM", + "description": "query classification is a task focused on enabling agent strategy selection by categorizing queries based on complexity", + "source_ids": [ + 96 + ] + }, + { + "entity_name": "single hop", + "entity_type": "EVENT", + "description": "single hop is a query category requiring a single piece of information retrieved via a scent based retrieval operation", + "source_ids": [ + 96 + ] + }, + { + "entity_name": "multi hop", + "entity_type": "EVENT", + "description": "multi hop is a query category defined by its intrinsic complexity and operational demands", + "source_ids": [ + 96 + ] + }, + { + "entity_name": "global aggregation", + "entity_type": "EVENT", + "description": "global aggregation is a query category necessitating analysis under multiple filtering conditions", + "source_ids": [ + 96 + ] + }, + { + "entity_name": "table 2", + "entity_type": "TABLE", + "description": "table 2 is a reference in the text that defines the three representative query categories", + "source_ids": [ + 96 + ] + }, + { + "entity_name": "scent based retrieval", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "scent based retrieval is a method used to retrieve a single piece of information for single hop queries", + "source_ids": [ + 96 + ] + }, + { + "entity_name": "filter aggregation", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "filter aggregation is a sequence of operations used to analyze content under multiple filtering conditions for global aggregation queries", 
+ "source_ids": [ + 96 + ] + }, + { + "entity_name": "bookrag", + "entity_type": "SOFTWARE", + "description": "bookrag is a system designed to be extensible for resolving a broader range of query types", + "source_ids": [ + 96 + ] + }, + { + "entity_name": "agent strategy selection", + "entity_type": "TASK_OR_PROBLEM", + "description": "agent strategy selection is a process enabled by query classification to determine the appropriate solution strategy", + "source_ids": [ + 96 + ] + }, + { + "entity_name": "intrinsic complexity", + "entity_type": "CONCEPT", + "description": "intrinsic complexity is an attribute used to define the query categories", + "source_ids": [ + 96 + ] + }, + { + "entity_name": "operational demands", + "entity_type": "CONCEPT", + "description": "operational demands are factors used to define the query categories alongside intrinsic complexity", + "source_ids": [ + 96 + ] + }, + { + "entity_name": "solution strategy", + "entity_type": "CONCEPT", + "description": "solution strategy refers to the different approaches required for each query category", + "source_ids": [ + 96 + ] + }, + { + "entity_name": "filtering conditions", + "entity_type": "CONCEPT", + "description": "filtering conditions are multiple criteria used in the analysis of global aggregation queries", + "source_ids": [ + 96 + ] + }, + { + "entity_name": "document", + "entity_type": "OBJECT", + "description": "document refers to the source material where content is analyzed during global aggregation queries", + "source_ids": [ + 96 + ] + }, + { + "entity_name": "additional operators", + "entity_type": "SOFTWARE", + "description": "additional operators are components integrated into bookrag to extend its capabilities", + "source_ids": [ + 96 + ] + } + ], + "relations": [ + { + "src_entity_name": "query classification", + "tgt_entity_name": "single hop", + "relation_name": "", + "weight": 9.0, + "description": "query classification defines single hop as one of its three representative 
categories", + "source_ids": [ + 96 + ] + }, + { + "src_entity_name": "query classification", + "tgt_entity_name": "multi hop", + "relation_name": "", + "weight": 9.0, + "description": "query classification defines multi hop as one of its three representative categories", + "source_ids": [ + 96 + ] + }, + { + "src_entity_name": "query classification", + "tgt_entity_name": "global aggregation", + "relation_name": "", + "weight": 9.0, + "description": "query classification defines global aggregation as one of its three representative categories", + "source_ids": [ + 96 + ] + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "scent based retrieval", + "relation_name": "", + "weight": 8.0, + "description": "single hop queries typically require a scent based retrieval operation", + "source_ids": [ + 96 + ] + }, + { + "src_entity_name": "global aggregation", + "tgt_entity_name": "filter aggregation", + "relation_name": "", + "weight": 8.0, + "description": "global aggregation queries usually involve a sequence of filter aggregation operations", + "source_ids": [ + 96 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "query classification", + "relation_name": "", + "weight": 7.0, + "description": "bookrag is designed to resolve a broader range of query types including those defined by the classification", + "source_ids": [ + 96 + ] + }, + { + "src_entity_name": "query classification", + "tgt_entity_name": "agent strategy selection", + "relation_name": "", + "weight": 9.0, + "description": "query classification enables agent strategy selection", + "source_ids": [ + 96 + ] + }, + { + "src_entity_name": "single hop", + "tgt_entity_name": "intrinsic complexity", + "relation_name": "", + "weight": 7.0, + "description": "single hop is defined by its intrinsic complexity and operational demands", + "source_ids": [ + 96 + ] + }, + { + "src_entity_name": "multi hop", + "tgt_entity_name": "intrinsic complexity", + "relation_name": "", + "weight": 7.0, + 
"description": "multi hop is defined by its intrinsic complexity and operational demands", + "source_ids": [ + 96 + ] + }, + { + "src_entity_name": "global aggregation", + "tgt_entity_name": "intrinsic complexity", + "relation_name": "", + "weight": 7.0, + "description": "global aggregation is defined by its intrinsic complexity and operational demands", + "source_ids": [ + 96 + ] + }, + { + "src_entity_name": "query classification", + "tgt_entity_name": "solution strategy", + "relation_name": "", + "weight": 8.0, + "description": "each category defined by query classification requires a different solution strategy", + "source_ids": [ + 96 + ] + }, + { + "src_entity_name": "global aggregation", + "tgt_entity_name": "filtering conditions", + "relation_name": "", + "weight": 8.0, + "description": "global aggregation necessitates analyzing content under multiple filtering conditions", + "source_ids": [ + 96 + ] + }, + { + "src_entity_name": "global aggregation", + "tgt_entity_name": "document", + "relation_name": "", + "weight": 7.0, + "description": "global aggregation involves analyzing content across various parts of the document", + "source_ids": [ + 96 + ] + }, + { + "src_entity_name": "bookrag", + "tgt_entity_name": "additional operators", + "relation_name": "", + "weight": 8.0, + "description": "bookrag resolves broader query types by integrating additional operators", + "source_ids": [ + 96 + ] + } + ], + "node_idx": 96 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_97.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_97.json new file mode 100644 index 0000000..6d5b244 --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_97.json @@ -0,0 +1,159 @@ +{ + "entities": [ + { + "entity_name": "bookindex operators", + "entity_type": "TASK_OR_PROBLEM", + "description": "bookindex operators are a set of strategies designed to execute tasks identified by classification", + "source_ids": [ + 97 + ] + 
}, + { + "entity_name": "o", + "entity_type": "TASK_OR_PROBLEM", + "description": "o represents the set of operators tailored for the bookindex", + "source_ids": [ + 97 + ] + }, + { + "entity_name": "bookindex", + "entity_type": "PRODUCT", + "description": "bookindex is a system defined by the tuple t g m for which operators are designed", + "source_ids": [ + 97 + ] + }, + { + "entity_name": "figure 4 a", + "entity_type": "IMAGE", + "description": "figure 4 a is a visual depiction of the operators", + "source_ids": [ + 97 + ] + }, + { + "entity_name": "table 3", + "entity_type": "TABLE", + "description": "table 3 provides detailed information about the operators", + "source_ids": [ + 97 + ] + }, + { + "entity_name": "agent", + "entity_type": "TASK_OR_PROBLEM", + "description": "the agent is the entity that employs the operators for diverse query categories", + "source_ids": [ + 97 + ] + }, + { + "entity_name": "classification", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "classification is the method used to identify the strategies executed by the operators", + "source_ids": [ + 97 + ] + }, + { + "entity_name": "query categories", + "entity_type": "TASK_OR_PROBLEM", + "description": "query categories are the diverse groups of queries for which the agent employs the operators", + "source_ids": [ + 97 + ] + }, + { + "entity_name": "figure 4", + "entity_type": "IMAGE", + "description": "figure 4 is a visual element referenced in the text specifically part a", + "source_ids": [ + 97 + ] + } + ], + "relations": [ + { + "src_entity_name": "bookindex operators", + "tgt_entity_name": "o", + "relation_name": "", + "weight": 9.0, + "description": "bookindex operators are represented by the set o tailored for the bookindex", + "source_ids": [ + 97 + ] + }, + { + "src_entity_name": "bookindex operators", + "tgt_entity_name": "bookindex", + "relation_name": "", + "weight": 10.0, + "description": "bookindex operators are designed specifically for the bookindex 
system", + "source_ids": [ + 97 + ] + }, + { + "src_entity_name": "bookindex operators", + "tgt_entity_name": "figure 4 a", + "relation_name": "", + "weight": 8.0, + "description": "bookindex operators are visually depicted in figure 4 a", + "source_ids": [ + 97 + ] + }, + { + "src_entity_name": "bookindex operators", + "tgt_entity_name": "table 3", + "relation_name": "", + "weight": 8.0, + "description": "bookindex operators are detailed in table 3", + "source_ids": [ + 97 + ] + }, + { + "src_entity_name": "bookindex operators", + "tgt_entity_name": "agent", + "relation_name": "", + "weight": 9.0, + "description": "the agent employs the bookindex operators for diverse query categories", + "source_ids": [ + 97 + ] + }, + { + "src_entity_name": "bookindex operators", + "tgt_entity_name": "classification", + "relation_name": "", + "weight": 9.0, + "description": "bookindex operators are designed to execute strategies identified by classification", + "source_ids": [ + 97 + ] + }, + { + "src_entity_name": "agent", + "tgt_entity_name": "query categories", + "relation_name": "", + "weight": 8.0, + "description": "the agent employs operators for diverse query categories", + "source_ids": [ + 97 + ] + }, + { + "src_entity_name": "bookindex operators", + "tgt_entity_name": "figure 4", + "relation_name": "", + "weight": 7.0, + "description": "bookindex operators are visually depicted in figure 4", + "source_ids": [ + 97 + ] + } + ], + "node_idx": 97 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_98.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_98.json new file mode 100644 index 0000000..815caec --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_98.json @@ -0,0 +1,251 @@ +{ + "entities": [ + { + "entity_name": "formulator", + "entity_type": "TASK_OR_PROBLEM", + "description": "formulator is a category of llm based operators that prepare queries for execution", + "source_ids": [ + 98 + ] + }, + { + 
"entity_name": "decompose", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "decompose is a method that breaks a complex query into simpler actionable sub queries", + "source_ids": [ + 98 + ] + }, + { + "entity_name": "extract", + "entity_type": "METHOD_OR_TECHNIQUE", + "description": "extract is a method that employs an llm to identify key entities from query text and link them to a knowledge graph", + "source_ids": [ + 98 + ] + }, + { + "entity_name": "llm", + "entity_type": "TECHNOLOGY", + "description": "llm refers to large language models used as operators in the formulator category", + "source_ids": [ + 98 + ] + }, + { + "entity_name": "kg", + "entity_type": "SOFTWARE", + "description": "kg refers to the knowledge graph where entities are linked", + "source_ids": [ + 98 + ] + }, + { + "entity_name": "qs", + "entity_type": "TASK_OR_PROBLEM", + "description": "qs represents the set of simpler actionable sub queries generated by the decompose method", + "source_ids": [ + 98 + ] + }, + { + "entity_name": "eq", + "entity_type": "TASK_OR_PROBLEM", + "description": "eq represents the set of key entities identified by the extract method", + "source_ids": [ + 98 + ] + }, + { + "entity_name": "pdec", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "pdec is a parameter used in the llm function to generate sub queries", + "source_ids": [ + 98 + ] + }, + { + "entity_name": "pext", + "entity_type": "PARAMETER_OR_VARIABLE", + "description": "pext is a parameter used in the llm function to identify key entities", + "source_ids": [ + 98 + ] + }, + { + "entity_name": "complex query", + "entity_type": "TASK_OR_PROBLEM", + "description": "complex query is the type of query that the decompose method breaks down into simpler sub queries", + "source_ids": [ + 98 + ] + }, + { + "entity_name": "sub queries", + "entity_type": "TASK_OR_PROBLEM", + "description": "sub queries are the simpler actionable components resulting from breaking down a complex query", + 
"source_ids": [ + 98 + ] + }, + { + "entity_name": "query text", + "entity_type": "TASK_OR_PROBLEM", + "description": "query text is the source material from which the extract method identifies key entities", + "source_ids": [ + 98 + ] + }, + { + "entity_name": "entities", + "entity_type": "TASK_OR_PROBLEM", + "description": "entities are the key items identified in the query text and linked to the knowledge graph", + "source_ids": [ + 98 + ] + } + ], + "relations": [ + { + "src_entity_name": "formulator", + "tgt_entity_name": "decompose", + "relation_name": "", + "weight": 9.0, + "description": "decompose is included as a category within the formulator operators", + "source_ids": [ + 98 + ] + }, + { + "src_entity_name": "formulator", + "tgt_entity_name": "extract", + "relation_name": "", + "weight": 9.0, + "description": "extract is included as a category within the formulator operators", + "source_ids": [ + 98 + ] + }, + { + "src_entity_name": "decompose", + "tgt_entity_name": "qs", + "relation_name": "", + "weight": 10.0, + "description": "decompose generates the set of sub queries qs", + "source_ids": [ + 98 + ] + }, + { + "src_entity_name": "extract", + "tgt_entity_name": "eq", + "relation_name": "", + "weight": 10.0, + "description": "extract identifies the key entities eq", + "source_ids": [ + 98 + ] + }, + { + "src_entity_name": "extract", + "tgt_entity_name": "kg", + "relation_name": "", + "weight": 9.0, + "description": "extract links identified entities to the knowledge graph kg", + "source_ids": [ + 98 + ] + }, + { + "src_entity_name": "decompose", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 8.0, + "description": "decompose employs an llm to perform its function", + "source_ids": [ + 98 + ] + }, + { + "src_entity_name": "extract", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 8.0, + "description": "extract employs an llm to perform its function", + "source_ids": [ + 98 + ] + }, + { + "src_entity_name": "qs", + 
"tgt_entity_name": "pdec", + "relation_name": "", + "weight": 7.0, + "description": "qs is generated using the parameter pdec in the llm function", + "source_ids": [ + 98 + ] + }, + { + "src_entity_name": "eq", + "tgt_entity_name": "pext", + "relation_name": "", + "weight": 7.0, + "description": "eq is generated using the parameter pext in the llm function", + "source_ids": [ + 98 + ] + }, + { + "src_entity_name": "decompose", + "tgt_entity_name": "complex query", + "relation_name": "", + "weight": 10.0, + "description": "decompose takes a complex query as its input to break it down", + "source_ids": [ + 98 + ] + }, + { + "src_entity_name": "decompose", + "tgt_entity_name": "sub queries", + "relation_name": "", + "weight": 10.0, + "description": "decompose produces sub queries as its output", + "source_ids": [ + 98 + ] + }, + { + "src_entity_name": "extract", + "tgt_entity_name": "query text", + "relation_name": "", + "weight": 10.0, + "description": "extract analyzes the query text to find key entities", + "source_ids": [ + 98 + ] + }, + { + "src_entity_name": "extract", + "tgt_entity_name": "entities", + "relation_name": "", + "weight": 10.0, + "description": "extract identifies entities from the query text", + "source_ids": [ + 98 + ] + }, + { + "src_entity_name": "formulator", + "tgt_entity_name": "llm", + "relation_name": "", + "weight": 9.0, + "description": "formulators are defined as llm based operators", + "source_ids": [ + 98 + ] + } + ], + "node_idx": 98 +} \ No newline at end of file diff --git a/e2e_test_output/kg_extractor_res/kg_extractor_res_99.json b/e2e_test_output/kg_extractor_res/kg_extractor_res_99.json new file mode 100644 index 0000000..c1c367a --- /dev/null +++ b/e2e_test_output/kg_extractor_res/kg_extractor_res_99.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "entity_name": "formula (2)", + "entity_type": "EQUATION_OR_FORMULA", + "description": "An equation defining the output Q(s) as a set of query vectors generated by an LLM. 
LaTeX: 𝑄 𝑠 = LLM ( 𝑃 𝐷𝑒𝑐 , 𝑞 ) = { 𝑞 , 𝑞 1 2 , . . . , 𝑞 𝑘 } (2)", + "source_ids": [ + 99 + ] + } + ], + "relations": [], + "node_idx": 99 +} \ No newline at end of file diff --git a/e2e_test_output/kg_vdb/9a19ee1d-d66c-4220-b75b-ced6a57da5ad/data_level0.bin b/e2e_test_output/kg_vdb/9a19ee1d-d66c-4220-b75b-ced6a57da5ad/data_level0.bin new file mode 100644 index 0000000..38fff23 Binary files /dev/null and b/e2e_test_output/kg_vdb/9a19ee1d-d66c-4220-b75b-ced6a57da5ad/data_level0.bin differ diff --git a/e2e_test_output/kg_vdb/9a19ee1d-d66c-4220-b75b-ced6a57da5ad/header.bin b/e2e_test_output/kg_vdb/9a19ee1d-d66c-4220-b75b-ced6a57da5ad/header.bin new file mode 100644 index 0000000..b4a33c1 Binary files /dev/null and b/e2e_test_output/kg_vdb/9a19ee1d-d66c-4220-b75b-ced6a57da5ad/header.bin differ diff --git a/e2e_test_output/kg_vdb/9a19ee1d-d66c-4220-b75b-ced6a57da5ad/length.bin b/e2e_test_output/kg_vdb/9a19ee1d-d66c-4220-b75b-ced6a57da5ad/length.bin new file mode 100644 index 0000000..cb3e162 Binary files /dev/null and b/e2e_test_output/kg_vdb/9a19ee1d-d66c-4220-b75b-ced6a57da5ad/length.bin differ diff --git a/e2e_test_output/kg_vdb/9a19ee1d-d66c-4220-b75b-ced6a57da5ad/link_lists.bin b/e2e_test_output/kg_vdb/9a19ee1d-d66c-4220-b75b-ced6a57da5ad/link_lists.bin new file mode 100644 index 0000000..e69de29 diff --git a/e2e_test_output/kg_vdb/chroma.sqlite3 b/e2e_test_output/kg_vdb/chroma.sqlite3 new file mode 100644 index 0000000..0f0dd6c Binary files /dev/null and b/e2e_test_output/kg_vdb/chroma.sqlite3 differ diff --git a/e2e_test_output/tree.json b/e2e_test_output/tree.json new file mode 100644 index 0000000..4e3a153 --- /dev/null +++ b/e2e_test_output/tree.json @@ -0,0 +1,6882 @@ +{ + "nodes": [ + { + "index_id": 0, + "parent_id": null, + "type": "root", + "meta_info": { + "file_name": "BOOKRAG_VLDB_2026_full.pdf", + "file_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/BOOKRAG_VLDB_2026_full.pdf", + "page_idx": null, + "page_path": null, + "pdf_id": 0, + 
"pdf_para_block": null, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": null, + "title_level": -1 + }, + "summary": "" + }, + { + "index_id": 1, + "parent_id": 0, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 0, + "page_path": null, + "pdf_id": 1, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents", + "title_level": 0 + }, + "summary": "This section introduces the BookRAG system, a novel retrieval-augmented generation framework that integrates a document-native BookIndex with an agent-based retrieval mechanism to effectively handle complex, structured enterprise documents." + }, + { + "index_id": 2, + "parent_id": 1, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 0, + "page_path": null, + "pdf_id": 2, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Shu Wang The Chinese University of Hong Kong, Shenzhen shuwang3@link.cuhk.edu.cn Yingli Zhou The Chinese University of Hong Kong, Shenzhen yinglizhou@link.cuhk.edu.cn Yixiang Fang The Chinese University of Hong Kong, Shenzhen fangyixiang@cuhk.edu.cn As an effective method to boost the performance of Large Language Models (LLMs) on the question answering (QA) task, RetrievalAugmented Generation (RAG), which queries highly relevant information from external complex documents, has attracted tremendous attention from both industry and academia. 
Existing RAG approaches often focus on general documents, and they overlook the fact that many real-world documents (such as books, booklets, handbooks, etc.) have a hierarchical structure, which organizes their content from different granularity levels, leading to poor performance for the QA task. To address these limitations, we introduce BookRAG, a novel RAG approach targeted for documents with a hierarchical structure, which exploits logical hierarchies and traces entity relations to query the highly relevant information. Specifically, we build a novel index structure, called BookIndex, by extracting a hierarchical tree from the document, which serves as the role of its table of contents, using a graph to capture the intricate relationships between entities, and mapping entities to tree nodes. Leveraging the BookIndex, we then propose an agent-based query method inspired by the Information Foraging Theory, which dynamically classifies queries and employs a tailored retrieval workflow. Extensive experiments on three widely adopted benchmarks demonstrate that BookRAG achieves state-of-the-art performance, significantly outperforming baselines in both retrieval recall and QA accuracy while maintaining competitive efficiency.", + "title_level": -1 + }, + "summary": "BookRAG is a novel Retrieval-Augmented Generation (RAG) approach designed to significantly improve Large Language Model performance on question answering tasks for documents with hierarchical structures, such as books and handbooks. Unlike existing methods that treat documents as flat text, BookRAG introduces a specialized index called \"BookIndex\" that combines a hierarchical tree (acting as a table of contents) with a graph mapping entity relationships. By leveraging this structure and an agent-based query method inspired by Information Foraging Theory, BookRAG dynamically classifies queries and executes tailored retrieval workflows. 
Extensive experiments demonstrate that this approach achieves state-of-the-art results, outperforming baseline models in both retrieval recall and question answering accuracy while maintaining competitive efficiency." + }, + { + "index_id": 3, + "parent_id": 1, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 0, + "page_path": null, + "pdf_id": 3, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "ABSTRACT", + "title_level": 1 + }, + "summary": "This abstract outlines the PVLDB citation standards and reproducibility policies while introducing the BookRAG hierarchical indexing method and its associated public resources." + }, + { + "index_id": 4, + "parent_id": 3, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 0, + "page_path": null, + "pdf_id": 4, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "PVLDB Reference Format:", + "title_level": -1 + }, + "summary": "The PVLDB Reference Format is a standardized citation style specifically designed for academic papers published in the Proceedings of the Very Large Data Base (PVLDB) journal, ensuring consistent and uniform referencing of research within the database systems community." + }, + { + "index_id": 5, + "parent_id": 3, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 0, + "page_path": null, + "pdf_id": 5, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Shu Wang, Yingli Zhou, and Yixiang Fang. 
BookRAG: A Hierarchical Structure-aware Index-based Approach for Retrieval-Augmented Generation on Complex Documents. PVLDB, 19(1): XXX-XXX, 2025. doi:XX.XX/XXX.XX", + "title_level": -1 + }, + "summary": "BookRAG introduces a novel hierarchical, structure-aware indexing method designed to enhance Retrieval-Augmented Generation (RAG) for complex documents by leveraging their inherent structural organization to improve retrieval accuracy and generation quality." + }, + { + "index_id": 6, + "parent_id": 3, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 0, + "page_path": null, + "pdf_id": 6, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "PVLDB Artifact Availability:", + "title_level": -1 + }, + "summary": "The PVLDB (Proceedings of the VLDB Endowment) enforces a policy requiring authors to make their research artifacts available to ensure reproducibility and facilitate further study." + }, + { + "index_id": 7, + "parent_id": 3, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 0, + "page_path": null, + "pdf_id": 7, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "The source code, data, and/or other artifacts have been made available at https://github.com/sam234990/BookRAG.", + "title_level": -1 + }, + "summary": "The source code, data, and related artifacts for the BookRAG project are publicly available at https://github.com/sam234990/BookRAG." 
+ }, + { + "index_id": 8, + "parent_id": 1, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 0, + "page_path": null, + "pdf_id": 8, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "1 INTRODUCTION", + "title_level": 1 + }, + "summary": "This section introduces the BookRAG system, a novel retrieval-augmented generation framework that integrates a document-native BookIndex with an agent-based retrieval mechanism to effectively handle complex, structured enterprise documents." + }, + { + "index_id": 9, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 0, + "page_path": null, + "pdf_id": 9, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Large Language Models (LLMs) such as Qwen 3 [60] and Gemini 2.5 [13] have revolutionized the Question Answering (QA) system [15, 61, 65]. The industry has increasingly adopted LLMs to build QA systems that assist users and reduce manual effort in", + "title_level": -1 + }, + "summary": "Large Language Models (LLMs) like Qwen 3 and Gemini 2.5 have revolutionized Question Answering (QA) systems, driving widespread industry adoption to enhance user assistance and significantly reduce manual effort." 
+ }, + { + "index_id": 10, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 0, + "page_path": null, + "pdf_id": 10, + "pdf_para_block": { + "docling_label": "footnote" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "This work is licensed under the Creative Commons BY-NC-ND 4.0 International License. Visit https://creativecommons.org/licenses/by-nc-nd/4.0/ to view a copy of this license. For any use beyond those covered by this license, obtain permission by emailing info@vldb.org. Copyright is held by the owner/author(s). Publication rights licensed to the VLDB Endowment.", + "title_level": -1 + }, + "summary": "This work is licensed under the Creative Commons BY-NC-ND 4.0 International License, meaning it can be shared with attribution but cannot be used commercially or modified; any uses beyond these terms require permission from info@vldb.org, as copyright is held by the authors and publication rights are licensed to the VLDB Endowment." + }, + { + "index_id": 11, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 0, + "page_path": null, + "pdf_id": 11, + "pdf_para_block": { + "docling_label": "footnote" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Proceedings of the VLDB Endowment, Vol. 19, No. 1 ISSN 2150-8097. doi:XX.XX/XXX.XX", + "title_level": -1 + }, + "summary": "The provided text is a bibliographic citation for Volume 19, Issue 1 of the *Proceedings of the VLDB Endowment* (ISSN 2150-8097), rather than a research article containing substantive findings, data, or arguments to summarize. Consequently, no core conclusion regarding a specific study or topic can be extracted, as the content consists solely of publication metadata." 
+ }, + { + "index_id": 12, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 0, + "page_path": null, + "pdf_id": 12, + "pdf_para_block": { + "docling_label": "caption" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Figure 1: Comparison of existing methods and BookRAG for complex document QA.", + "title_level": -1 + }, + "summary": "BookRAG outperforms existing methods in handling complex document question-answering tasks, as demonstrated by the comparative analysis in Figure 1." + }, + { + "index_id": 13, + "parent_id": 8, + "type": "NodeType.IMAGE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 0, + "page_path": null, + "pdf_id": 13, + "pdf_para_block": { + "docling_label": "picture" + }, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-1.png", + "image_width": 0, + "image_height": 0, + "caption": "cref='#/texts/14'", + "footnote": "", + "table_body": null, + "content": "cref='#/texts/14'", + "title_level": -1 + }, + "summary": "The provided content consists solely of a placeholder indicating an image with a reference caption, but it contains no actual text, data, or visual information to summarize." + }, + { + "index_id": 14, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 0, + "page_path": null, + "pdf_id": 14, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "many applications [65, 67], such as financial auditing [29, 37], legal compliance [8], and scientific discovery [56]. However, directly relying on LLMs may lead to missing domain knowledge and generating outdated or unsupported information. 
To address these issues, Retrieval-Augmented Generation (RAG) has been widely adopted [17, 22] by retrieving relevant domain knowledge from external sources and using it to guide the LLM during response generation. On the other hand, in real-world enterprise scenarios, domain knowledge is often stored in long-form documents, such as technical handbooks, API reference manuals, and operational guidebooks [49]. A notable feature of such documents is that they follow the structure of books, characterized by intricate layouts and rigorous logical hierarchies (e.g., explicit tables of contents, nested chapters, and multi-level sections). In this paper, we aim to design an effective RAG system for QA over long and highly structured documents.", + "title_level": -1 + }, + "summary": "While Retrieval-Augmented Generation (RAG) effectively mitigates the limitations of Large Language Models (LLMs) in domains like finance and law by incorporating external knowledge, existing approaches often struggle with the unique challenges of enterprise documents. These documents, such as technical handbooks and API manuals, are characterized by long-form content and complex, book-like structures with rigorous logical hierarchies. Consequently, this paper proposes the design of a specialized RAG system optimized specifically for question answering over such long and highly structured documents." + }, + { + "index_id": 15, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 0, + "page_path": null, + "pdf_id": 15, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Prior works. The existing RAG approaches for documentlevel QA generally fall into two paradigms, as illustrated in Figure 1. 
The first paradigm relies on OCR (Optical Character Recognition) to convert the document into plain text, after which any text-based RAG method can be directly applied. Among text-based RAG methods, state-of-the-art approaches increasingly adopt graph-based RAG [6, 62, 66], where graph data serves as an external knowledge source because it captures rich semantic information and the relational structure between entities. As shown in Table 1, two representative methods are GraphRAG [16] and RAPTOR [45]. Specifically, GraphRAG first constructs a knowledge graph (KG) from the textual corpus, and then applies the Leiden community detection algorithm [51] to obtain hierarchical clusters. Summaries are generated for each community, providing a comprehensive, global overview of the entire corpus. RAPTOR builds a recursive tree structure by iteratively clustering document chunks and summarizing them at each level, enabling the model to capture both fine-grained and high-level semantic information across the corpus.", + "title_level": -1 + }, + "summary": "Existing document-level RAG approaches primarily follow two paradigms: converting documents to text via OCR for standard processing, or leveraging graph-based methods to capture rich semantic and relational structures. State-of-the-art graph-based techniques, such as GraphRAG and RAPTOR, enhance retrieval by constructing knowledge graphs or recursive tree structures; GraphRAG utilizes community detection to generate hierarchical summaries for a global corpus overview, while RAPTOR employs iterative clustering to capture both fine-grained details and high-level semantics across document chunks." 
+ }, + { + "index_id": 16, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 1, + "page_path": null, + "pdf_id": 16, + "pdf_para_block": { + "docling_label": "caption" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Table 1: Comparison of representative methods and our BookRAG.", + "title_level": -1 + }, + "summary": "BookRAG outperforms representative existing methods across the evaluated metrics, as demonstrated in the comparative analysis of Table 1." + }, + { + "index_id": 17, + "parent_id": 8, + "type": "NodeType.TABLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 1, + "page_path": null, + "pdf_id": 17, + "pdf_para_block": { + "docling_label": "table" + }, + "img_path": "", + "image_width": 0, + "image_height": 0, + "caption": "cref='#/texts/17'", + "footnote": "", + "table_body": "| Type | Representative Method | Core Feature | Multi-hop Reasoning | Document Parsing | Query Workflow |\n|------------------|---------------------------|--------------------------------------------------------------|-----------------------|--------------------|------------------|\n| Graph-based | RAPTOR [45] GraphRAG [16] | Recursive summarization | | | Static |\n| Layout segmented | MM-Vanilla | Global community detection | | | Static |\n| Layout segmented | DocETL [47] | Multi-modal retrieval LLM-based document processing pipeline | | | Static Manual |\n| Doc-Native | BookRAG (Ours) | Structure-award Index & Agent-based retrieval | | | Dynamic |", + "content": "cref='#/texts/17'", + "title_level": -1 + }, + "summary": "BookRAG emerges as the most advanced retrieval method in the comparison, distinguished by its Doc-Native architecture, structure-aware indexing, and agent-based retrieval capabilities that enable dynamic query workflows. 
In contrast, existing approaches like RAPTOR, GraphRAG, MM-Vanilla, and DocETL rely on Graph-based or Layout-segmented strategies with static workflows, lacking the adaptive processing and structural integration offered by BookRAG." + }, + { + "index_id": 18, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 1, + "page_path": null, + "pdf_id": 18, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "In contrast, the second paradigm, layout-aware segmentation [5, 52], first parses the document into structured blocks that preserve the original layout and information of the document, such as paragraphs, tables, figures, or equations. By doing so, it not only avoids the fixed chunk size used in the first paradigm, which often leads to fragmented information, but also retains document-native structural information. These blocks often exhibit multimodal characteristics, and a typical approach is to apply multimodal retrieval to obtain relevant content for answering queries. Recently, a state-ofthe-art method in this category, DocETL [47], provides a declarative interface that allows users to manually define LLM-based processing pipelines to analyze the retrieved blocks. These pipelines consist of LLM-powered operations combined with task-specific optimizations.", + "title_level": -1 + }, + "summary": "The layout-aware segmentation paradigm improves document analysis by parsing text into structured blocks (e.g., paragraphs, tables, figures) that preserve original formatting and avoid the information fragmentation caused by fixed-size chunking. 
This approach leverages the multimodal nature of these blocks for retrieval and enables advanced processing through declarative interfaces, such as DocETL, which allow users to construct custom LLM-based pipelines with task-specific optimizations." + }, + { + "index_id": 19, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 1, + "page_path": null, + "pdf_id": 19, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Limitations of existing works. However, these methods suffer from two fundamental limitations ( L for short): L1: Failure to capture the deep connection of document structure and semantics. Text-based approaches cannot capture the structural layout of the document, resulting in the loss of important relationships stored in the hierarchical blocks, such as tables nested within a specific section. While layout-segmented methods preserve document structure, they cannot capture the relationships between different blocks in the document, which limits their capability for multi-hop reasoning across these blocks and ultimately affects their overall performance. L2: Static of query workflows. In real-world QA scenarios, user queries are highly heterogeneous, ranging from simple keyword lookups to complex multi-hop questions that require synthesizing evidence scattered across different parts of the document. 
Applying a uniform strategy, such as static or manually predefined workflows, to diverse needs is inefficient; for example, complex queries often require question decomposition, whereas simple queries do not.", + "title_level": -1 + }, + "summary": "Existing document-based methods face two fundamental limitations: first, they fail to capture the deep connection between document structure and semantics, as text-only approaches lose hierarchical relationships while layout-based methods miss inter-block connections needed for multi-hop reasoning; second, they rely on static query workflows that are inefficient for heterogeneous real-world queries, as a uniform strategy cannot adapt to the varying complexity of tasks ranging from simple lookups to complex evidence synthesis." + }, + { + "index_id": 20, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 1, + "page_path": null, + "pdf_id": 20, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Our technical contributions. To bridge this gap, we introduce BookRAG , the first retrieval-augmented generation method built upon a document-native BookIndex , designed to document QA tasks. Specifically, to capture the deep connection of the relation in the document, BookIndex organizes information through two complementary structures. First, to preserve the document's native logical hierarchy, we organize the parsed content blocks into a hierarchical tree structure, which serves as the role of its table of contents. Second, to capture the intricate relations within these blocks, we construct a KG containing fine-grained entities. 
Finally, we unify these two structures by mapping the KG entities to their corresponding tree nodes.", + "title_level": -1 + }, + "summary": "BookRAG is the first retrieval-augmented generation method designed for document-based question answering, utilizing a novel document-native BookIndex to capture deep relational connections. This index unifies two complementary structures: a hierarchical tree that preserves the document's native logical organization (acting as a table of contents) and a fine-grained knowledge graph that maps intricate entity relations, with the two linked by mapping graph entities to their corresponding tree nodes." + }, + { + "index_id": 21, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 1, + "page_path": null, + "pdf_id": 21, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "However, effective multi-hop reasoning on the graph relies on a high-quality KG [62, 66], which is often compromised by entity ambiguity (e.g., distinct entities with names like 'LLM' and 'Large Language Model'). To address this, we propose a novel gradient-based entity resolution method that analyzes the similarity distribution of candidate entities. By identifying sharp drops in similarity scores, we can efficiently distinguish and merge coreferent entities, thereby ensuring graph connectivity and enhancing reasoning capabilities.", + "title_level": -1 + }, + "summary": "To overcome entity ambiguity that compromises knowledge graph quality and multi-hop reasoning, a novel gradient-based entity resolution method is proposed. 
This approach analyzes the similarity distribution of candidate entities to identify sharp drops in scores, enabling the efficient distinction and merging of coreferent entities to ensure graph connectivity and enhance reasoning capabilities." + }, + { + "index_id": 22, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 1, + "page_path": null, + "pdf_id": 22, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Building upon the BookIndex, we address the static of query workflows ( L2 ) by implementing an agent-based retrieval . Specifically, our agent first classifies user queries based on their intent and complexity, and then dynamically generates tailored retrieval workflows. Grounded in Information Foraging Theory [42], our retrieval process mimics foraging by using Selector to narrow down the search space via information scents and Reasoner to locate highly relevant evidence.", + "title_level": -1 + }, + "summary": "Building on the BookIndex, the proposed agent-based retrieval system addresses static query workflows by dynamically generating tailored processes based on query intent and complexity. Grounded in Information Foraging Theory, this approach mimics natural foraging behaviors: a Selector narrows the search space using information scents, while a Reasoner locates highly relevant evidence to ensure efficient and precise results." 
+ }, + { + "index_id": 23, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 1, + "page_path": null, + "pdf_id": 23, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "We conduct extensive experiments on three widely adopted datasets to validate the effectiveness and efficiency of our BookRAG, comparing it against several state-of-the-art baselines. The experimental results demonstrate that BookRAG consistently achieves superior performance in both retrieval recall and QA accuracy across all datasets. Furthermore, our detailed analysis validates the critical contributions of our key features, such as the high-quality KG and the agent-based retrieval mechanism.", + "title_level": -1 + }, + "summary": "BookRAG consistently outperforms state-of-the-art baselines in retrieval recall and question-answering accuracy across three widely adopted datasets, with detailed analysis confirming that its high-quality knowledge graph and agent-based retrieval mechanism are the primary drivers of this superior performance." + }, + { + "index_id": 24, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 1, + "page_path": null, + "pdf_id": 24, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "We summarize our contributions as:", + "title_level": -1 + }, + "summary": "The provided text is incomplete and contains no substantive information to summarize, as it only states an intention to summarize contributions without listing them." 
+ }, + { + "index_id": 25, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 1, + "page_path": null, + "pdf_id": 25, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· We introduce BookRAG , a novel method that constructs a document-native BookIndex by integrating a hierarchical tree of document layout blocks with a KG storing finegrained entity relations.", + "title_level": -1 + }, + "summary": "BookRAG is a novel method that constructs a document-native BookIndex by integrating a hierarchical tree of document layout blocks with a knowledge graph storing fine-grained entity relations." + }, + { + "index_id": 26, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 1, + "page_path": null, + "pdf_id": 26, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· We propose an Agent-based Retrieval approach inspired by Information Foraging Theory, which dynamically classifies queries and configures optimal retrieval workflows to locate highly relevant evidence within documents.", + "title_level": -1 + }, + "summary": "We propose an Agent-based Retrieval approach inspired by Information Foraging Theory that dynamically classifies queries and configures optimal retrieval workflows to efficiently locate highly relevant evidence within documents." 
+ }, + { + "index_id": 27, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 1, + "page_path": null, + "pdf_id": 27, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Extensive experiments on multiple benchmarks show that BookRAG significantly outperforms existing baselines, attaining state-of-the-art performance in solving complex document QA tasks while maintaining competitive efficiency.", + "title_level": -1 + }, + "summary": "BookRAG achieves state-of-the-art performance on complex document question-answering tasks across multiple benchmarks, significantly outperforming existing baselines while maintaining competitive efficiency." + }, + { + "index_id": 28, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 1, + "page_path": null, + "pdf_id": 28, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "2", + "title_level": -1 + }, + "summary": "The provided input contains only the number \"2\" and lacks sufficient context, text, or data to generate a meaningful summary." + }, + { + "index_id": 29, + "parent_id": 8, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 29, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Outline. We review related work in Section 2. Section 3 introduces the problem formulation, IFT, and RAG workflow. In Section 4, we present the structure of our BookIndex and its construction. 
Section 5 presents our agent-based retrieval, elaborating on the query classification and operators used in the structured execution of BookRAG. We present the experimental results and detailed analysis in Section 6, and conclude the paper in Section 7.", + "title_level": -1 + }, + "summary": "This paper presents a structured BookRAG system that integrates In-Context Fine-Tuning (IFT) and Retrieval-Augmented Generation (RAG) to enhance book-based information retrieval. The proposed approach features a novel BookIndex structure and an agent-based retrieval mechanism that utilizes query classification and structured operators for efficient execution. The study validates the system's effectiveness through comprehensive experimental results and analysis, following a logical progression from related work and problem formulation to detailed methodology and conclusions." + }, + { + "index_id": 30, + "parent_id": 1, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 30, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "2 RELATED WORK", + "title_level": 1 + }, + "summary": "This section reviews recent advancements in Large Language Models for document analysis and Retrieval-Augmented Generation approaches, highlighting their applications in converting unstructured data, enhancing semantic querying, and mitigating hallucinations through graph integration and agentic frameworks." 
+ }, + { + "index_id": 31, + "parent_id": 30, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 31, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "In this section, we review the related works, including LLM in document analysis and the modern representative RAG approaches.", + "title_level": -1 + }, + "summary": "This section reviews related works focusing on the application of Large Language Models (LLMs) in document analysis and highlights modern representative Retrieval-Augmented Generation (RAG) approaches." + }, + { + "index_id": 32, + "parent_id": 30, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 32, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· LLM in document analysis. Recent advances in LLMs have offered opportunities to leverage LLMs in document data analysis. Due to the robust semantic reasoning capabilities of LLMs, there is an increasing number of works focusing on transferring unstructured documents (e.g., HTML, PDFs, and raw text) into structured formats, such as relational tables [1, 7, 25, 38]. For example, Evaporate [1] utilizes LLMs to synthesize extraction code, enabling cost-effective conversion of semi-structured web documents into structured databases without heavy manual annotation. In addition, several LLM-based document analysis systems have been proposed to equip standard data pipelines with semantic understanding [28, 40, 47, 53]. 
For instance, LOTUS [40] extends the relational model with semantic operators, allowing users to execute SQL-like queries with LLM-powered predicates (e.g., filter, join) over unstructured text corpora. Similarly, DocETL [47] introduces an agentic framework to optimize complex information extraction tasks. Furthermore, another line of research proposes to directly analyze or parse documents by viewing the document pages as images, thereby preserving critical layout and visual information [26, 31, 54].", + "title_level": -1 + }, + "summary": "Recent advances in Large Language Models (LLMs) are transforming document analysis by enabling the conversion of unstructured data (such as HTML, PDFs, and raw text) into structured formats and enhancing standard data pipelines with semantic understanding. Key applications include synthesizing extraction code for cost-effective database conversion (e.g., Evaporate), extending relational models with LLM-powered semantic operators for SQL-like querying over text (e.g., LOTUS), and utilizing agentic frameworks to optimize complex information extraction tasks (e.g., DocETL). Additionally, emerging research treats document pages as images to preserve critical layout and visual information during parsing." + }, + { + "index_id": 33, + "parent_id": 30, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 33, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· RAG approaches. RAG has been proven to excel in many tasks, including open-ended question answering [24, 48], programming context [9, 10], SQL rewrite [30, 50], and data cleaning [35, 36, 43]. The naive RAG technique relies on retrieving query-relevant contexts from external knowledge bases to mitigate the 'hallucination' of LLMs. 
Recently, many RAG approaches [16, 18, 19, 21, 27, 32, 32, 45, 55, 58, 66] have adopted graph structures to organize the information and relationships within documents, achieving improved overall retrieval performance. For more details, please refer to the recent survey of graph-based RAG methods [41]. Besides, the Agentic RAG paradigm has been widely studied, employing autonomous agents to dynamically orchestrate and refine the RAG pipeline, thus significantly boosting the reasoning robustness and generation fidelity [2, 23, 59].", + "title_level": -1 + }, + "summary": "Retrieval-Augmented Generation (RAG) effectively mitigates LLM hallucinations and excels in diverse tasks like question answering, programming, and data cleaning. Recent advancements have significantly enhanced RAG by integrating graph structures to better organize document relationships and by adopting the Agentic RAG paradigm, which uses autonomous agents to dynamically refine the retrieval pipeline, thereby improving both reasoning robustness and generation fidelity." + }, + { + "index_id": 34, + "parent_id": 1, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 34, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "3 PRELIMINARIES", + "title_level": 1 + }, + "summary": "This section establishes the formal research problem for complex document question answering by integrating Information Foraging Theory and outlining a Retrieval-Augmented Generation workflow that preserves document hierarchy." 
+ }, + { + "index_id": 35, + "parent_id": 34, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 35, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "This section formalizes the research problem of complex document QA, introduces the foundational Information Foraging Theory (IFT), and briefly reviews the general workflow of RAG systems", + "title_level": -1 + }, + "summary": "This section establishes the formal research problem for complex document question answering by integrating foundational Information Foraging Theory (IFT) and outlining the standard workflow of Retrieval-Augmented Generation (RAG) systems." + }, + { + "index_id": 36, + "parent_id": 34, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 36, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "3.1 Problem Formulation", + "title_level": 2 + }, + "summary": "This section formally defines the Question Answering task over complex, long-form documents as a mapping problem that requires simultaneously navigating sequential content and logical document hierarchies to retrieve evidence-based answers." 
+ }, + { + "index_id": 37, + "parent_id": 36, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 37, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "We study the problem of Question Answering (QA) over complex documents, which aims to answer user queries based on long-form documents [5, 11, 33]. Formally, a document 𝐷 is represented as a sequence of 𝑁 pages, 𝐷 = { 𝑃 𝑖 } 𝑁 𝑖 = 1 . These pages collectively contain a sequence of content blocks B = { 𝑏 𝑗 } 𝑀 𝑗 = 1 , where each block 𝑏 𝑗 represents a distinct element (e.g., text segment, section header, table, or image) organized within a logical chapter hierarchy. Given a user query 𝑞 , the goal is to generate an accurate answer 𝐴 , ideally grounded in a specific set of evidence blocks 𝐸 ⊂ B . The task is formulated as developing a method S that maps the structured document and the query to the final answer: 𝐴 = S( 𝐷,𝑞 ) (1)", + "title_level": -1 + }, + "summary": "The study addresses Question Answering (QA) over complex, long-form documents by defining the task as mapping a structured document and a user query to an accurate answer grounded in specific evidence blocks. Formally, a document is modeled as a sequence of pages containing distinct content blocks (such as text, tables, or images) organized within a logical hierarchy, with the objective being to develop a method that effectively retrieves relevant evidence from this structure to answer user queries." 
+ }, + { + "index_id": 38, + "parent_id": 36, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 38, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "3", + "title_level": -1 + }, + "summary": "The provided input consists solely of the number \"3\" and lacks sufficient context, text, or data to generate a meaningful summary or identify a core conclusion." + }, + { + "index_id": 39, + "parent_id": 36, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 39, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "𝐴 = S( 𝐷,𝑞 ) (1)", + "title_level": -1 + }, + "summary": "The equation $A = S(D, q)$ defines a variable $A$ as a function $S$ dependent on two parameters, $D$ and $q$." + }, + { + "index_id": 40, + "parent_id": 36, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 40, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "where S should navigate both the sequential page content and the logical hierarchy of 𝐷 to synthesize the response.", + "title_level": -1 + }, + "summary": "To synthesize an effective response, the system must simultaneously navigate the sequential flow of page content and the logical hierarchy of the document structure." 
+ }, + { + "index_id": 41, + "parent_id": 34, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 41, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "3.2 Information Foraging Theory", + "title_level": 2 + }, + "summary": "This section introduces Information Foraging Theory as a framework explaining how users navigate digital environments by following \"information scent\" to maximize valuable information gain while minimizing effort, a process mirrored by experts solving problems in technical handbooks." + }, + { + "index_id": 42, + "parent_id": 41, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 42, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Information Foraging Theory (IFT) [42] provides a framework for understanding information access as a process analogous to animal foraging. It suggests that users follow cues, known as information scent (e.g., keywords or icons), to navigate between clusters of content, known as information patches (e.g., sections in handbooks). 
The goal is to maximize the rate of valuable information gain while minimizing effort, guiding the decision to either stay within a patch or seek a new one.", + "title_level": -1 + }, + "summary": "Information Foraging Theory posits that users navigate digital environments like animals foraging for food, using cues called \"information scent\" to move between content clusters known as \"information patches.\" The primary objective of this behavior is to maximize the rate of valuable information gain while minimizing effort, which drives the critical decision of whether to remain in the current patch or search for a new one." + }, + { + "index_id": 43, + "parent_id": 41, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 43, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Consider experts seeking a solution to a specific problem within a large technical handbook. They first extract key terms related to the problem, which act as information scent. This scent guides them to navigate towards one or more promising sections (the information patches). Within these patches, they analyze the diverse content to extract the precise knowledge required to formulate a final answer", + "title_level": -1 + }, + "summary": "Experts solve specific problems in large technical handbooks by first extracting key terms as \"information scent\" to navigate to relevant sections, where they then analyze diverse content to extract the precise knowledge needed to formulate a final answer." 
+ }, + { + "index_id": 44, + "parent_id": 34, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 44, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "3.3 RAG workflow", + "title_level": 2 + }, + "summary": "This section details a Retrieval-Augmented Generation workflow that integrates retrieval structures with a document's native tree topology to preserve its logical hierarchy." + }, + { + "index_id": 45, + "parent_id": 44, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 45, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Retrieval-Augmented Generation (RAG) systems typically operate in a two-phase framework [6, 16, 41]. In the Offline Indexing phase, unstructured corpus data is organized into a structured index, which can take various forms such as vector databases or KG [66]. Subsequently, in the Online Retrieval phase, the system retrieves relevant components (e.g., text chunks or subgraphs) based on the user query 𝑞 to inform the LLM's generation. However, these general workflows often treat the index as a structure derived purely from content, potentially detaching it from the document's original logical hierarchy. 
In contrast, our approach seeks to deeply integrate these retrieval structures with the document's native tree topology.", + "title_level": -1 + }, + "summary": "While standard Retrieval-Augmented Generation (RAG) systems separate content organization from document structure by using generic indexes like vector databases or knowledge graphs, the proposed approach fundamentally improves this workflow by deeply integrating retrieval structures with the document's native tree topology, thereby preserving the original logical hierarchy rather than treating the index as a content-derived abstraction." + }, + { + "index_id": 46, + "parent_id": 1, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 46, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "4 BOOKINDEX", + "title_level": 1 + }, + "summary": "Section 4 formally defines the BookIndex architecture and details its two-stage construction process, which integrates hierarchical layout parsing with gradient-based entity resolution to build a unified structure-aware index for complex documents." + }, + { + "index_id": 47, + "parent_id": 46, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 2, + "page_path": null, + "pdf_id": 47, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "This section introduces our proposed BookIndex , a hierarchical structure-aware index designed to capture both the explicit logical hierarchy and the intricate entity relations within complex documents. We first formally define the structure of the BookIndex ( 𝐵 ). 
Subsequently, we elaborate on the sequential, two-stage construction process: (1) Tree Construction , which parses the document's layout to establish a hierarchical nodes, each categorized by type; and (2) Graph Construction , which extracts fine-grained entity knowledge from the tree nodes and refines it through a novel gradient-based entity resolution method.", + "title_level": -1 + }, + "summary": "The proposed BookIndex is a hierarchical structure-aware index that captures both the explicit logical hierarchy and intricate entity relations within complex documents through a two-stage construction process: first, a Tree Construction phase parses document layout to establish categorized hierarchical nodes, followed by a Graph Construction phase that extracts fine-grained entity knowledge from these nodes and refines it using a novel gradient-based entity resolution method." + }, + { + "index_id": 48, + "parent_id": 46, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 3, + "page_path": null, + "pdf_id": 48, + "pdf_para_block": { + "docling_label": "caption" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Figure 2: The BookIndex Construction process. This phase includes Tree Construction, derived from Layout Parsing and Section Filtering, and Graph Construction, which involves KG Construction and Gradient-based Entity Resolution.", + "title_level": -1 + }, + "summary": "Figure 2 illustrates the BookIndex Construction process, which comprises two main phases: Tree Construction, derived from layout parsing and section filtering, and Graph Construction, involving knowledge graph creation and gradient-based entity resolution." 
+ }, + { + "index_id": 49, + "parent_id": 46, + "type": "NodeType.IMAGE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 3, + "page_path": null, + "pdf_id": 49, + "pdf_para_block": { + "docling_label": "picture" + }, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-2.png", + "image_width": 0, + "image_height": 0, + "caption": "cref='#/texts/52'", + "footnote": "", + "table_body": null, + "content": "cref='#/texts/52'", + "title_level": -1 + }, + "summary": "The provided content does not contain any substantive information to summarize, as it consists solely of a placeholder indicating an image with a reference link and lacks the actual visual data, text, or table necessary to extract key points." + }, + { + "index_id": 50, + "parent_id": 46, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 3, + "page_path": null, + "pdf_id": 50, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "4.1 Overview of BookIndex", + "title_level": 2 + }, + "summary": "This section formally defines the BookIndex architecture as a triplet integrating a hierarchical document tree, a semantic knowledge graph, and their linking mechanism to combine structured context with fine-grained entity relations for guided navigation." + }, + { + "index_id": 51, + "parent_id": 50, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 3, + "page_path": null, + "pdf_id": 51, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "We formally define our BookIndex as a triplet 𝐵 = ( 𝑇,𝐺, 𝑀 ) . 
Here, 𝑇 = ( 𝑁, 𝐸 𝑇 ) represents a Tree structure where 𝑁 is the set of nodes derived from the document's explicit logical hierarchy (e.g., titles, sections, tables), and 𝐸 𝑇 denotes their nesting relationships. 𝐺 = ( 𝑉, 𝐸 𝐺 ) is a Knowledge Graph that captures fine-grained entities ( 𝑉 ) and their relations ( 𝐸 𝐺 ) scattered throughout the document. Finally, 𝑀 𝑉 : →P( 𝑁 ) is the Graph-Tree Link (GT-Link) , which links each entity in 𝑉 to the set of specific tree nodes in 𝑁 from which it was extracted. These links are crucial for capturing the intricate, cross-sectional relations within the document. The hierarchical tree nodes in 𝑇 serve as the document's native information patches , providing structured contexts for information seeking. Meanwhile, the entities and relations in 𝐺 , connected via 𝑀 , act as the rich information scent that guides navigation between and within these patches.", + "title_level": -1 + }, + "summary": "The BookIndex is formally defined as a triplet $B = (T, G, M)$ that integrates a document's logical structure with its semantic content: $T$ represents the document's hierarchical tree of nodes (e.g., titles, sections) serving as structured information patches; $G$ is a knowledge graph capturing fine-grained entities and their relations scattered throughout the text; and $M$ (the Graph-Tree Link) connects each entity in the graph to its specific source nodes in the tree. This architecture leverages the tree for structured context while using the linked graph as an \"information scent\" to guide navigation across and within the document's sections." 
+ }, + { + "index_id": 52, + "parent_id": 50, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 3, + "page_path": null, + "pdf_id": 52, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Figure 2 provides an example of our BookIndex. The Tree component, positioned at the top, organizes the document into a hierarchical structure, where content blocks such as text, tables, and images serve as leaf nodes nested within section nodes. The Graph component is composed of entities and relations extracted from these nodes. The GT-Link, illustrated by the blue dotted lines, explicitly connects these entities back to their corresponding tree nodes, thereby grounding the semantic entities within the document's logical hierarchy.", + "title_level": -1 + }, + "summary": "Figure 2 illustrates a BookIndex system that integrates a hierarchical Tree component with a semantic Graph component; the Tree organizes document content (text, tables, images) into nested sections, while the Graph extracts entities and relations from these nodes. Crucially, GT-Link connections explicitly ground these semantic entities back to their specific locations within the document's logical hierarchy, ensuring a direct link between extracted meaning and structural context." 
+ }, + { + "index_id": 53, + "parent_id": 46, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 3, + "page_path": null, + "pdf_id": 53, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "4.2 Tree Construction", + "title_level": 2 + }, + "summary": "Section 4.2 details the initial tree construction process that converts raw documents into a structured hierarchical format through robust layout parsing and intelligent section filtering." + }, + { + "index_id": 54, + "parent_id": 53, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 3, + "page_path": null, + "pdf_id": 54, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "The first stage transforms the raw document into a structured hierarchical tree 𝑇 . This involves two key steps: robust layout parsing and intelligent section filtering.", + "title_level": -1 + }, + "summary": "The initial processing stage converts raw documents into a structured hierarchical tree through robust layout parsing and intelligent section filtering." + }, + { + "index_id": 55, + "parent_id": 53, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 3, + "page_path": null, + "pdf_id": 55, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "4.2.1 Layout Parsing. The Layout Parsing phase processes the input document 𝐷 (a collection of pages) using layout analysis and recognition models. 
This step identifies, extracts, and organizes diverse blocks (e.g., text, tables, images) from the document pages.", + "title_level": 3 + }, + "summary": "Section 4.2.1 details the layout parsing phase, which employs analysis and recognition models to identify, extract, and organize diverse document blocks such as text, tables, and images." + }, + { + "index_id": 56, + "parent_id": 55, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 3, + "page_path": null, + "pdf_id": 56, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "The output is a sequence of primitive", + "title_level": -1 + }, + "summary": "The output consists of a sequence of primitive data elements." + }, + { + "index_id": 57, + "parent_id": 55, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 3, + "page_path": null, + "pdf_id": 57, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "4.2.2 Section Filtering. Next, the Section Filtering phase processes this initial sequence to identify the document's logically hierarchical structure. Layout Parsing identifies blocks as Title but does not assign their hierarchical level. Therefore, we select the candidate subset B title ⊂ B (where 𝜏 𝑖 = Title ) for an LLM-based analysis. To handle extremely long documents, this analysis is performed in batches, where each batch retains a contextual window of high-level section information (with 𝑙 = 1 as the root). The LLM analyzes the content 𝑐 𝑖 and layout features 𝑓 𝑖 of the candidates to determine two key properties: their actual hierarchical level 𝑙 𝑖 ∈ { 1 2 , , ... 
} and final node type 𝜏 ' 𝑖 (e.g., re-classifying an erroneous Title as Text if its level is 'None'). This step is crucial for preserving the document's logical hierarchy by correcting blocks erroneously parsed as Title , such as descriptive text within images or borderless table headers.", + "title_level": -1 + }, + "summary": "The Section Filtering phase utilizes an LLM to refine document hierarchy by analyzing candidate title blocks in batches, correcting erroneous classifications (such as descriptive text or table headers misidentified as titles) and assigning accurate hierarchical levels to preserve the document's logical structure." + }, + { + "index_id": 58, + "parent_id": 55, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 3, + "page_path": null, + "pdf_id": 58, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Finally, the definitive tree 𝑇 = ( 𝑁, 𝐸 𝑇 ) is constructed. The node set 𝑁 is composed of all blocks from the filtering and re-classification process, where each node 𝑛 ∈ 𝑁 retains its content ( 𝑐 𝑖 ) and its final node type ( 𝜏 ' 𝑖 ) (e.g., Text , Section , Table , and Image ). The edge set 𝐸 𝑇 , representing the parent-child nesting relationships, is then established. 
Parent-child relationships are inferred by sequentially traversing the nodes, using both the determined hierarchical levels ( 𝑙 𝑖 ) of Section nodes and the overall document order to assemble the complete tree structure.", + "title_level": -1 + }, + "summary": "The definitive tree structure is finalized by combining all filtered and re-classified blocks as nodes—preserving their content and type (e.g., Text, Section, Table, Image)—and establishing parent-child relationships through sequential traversal that utilizes Section hierarchical levels and document order to define the complete nesting hierarchy." + }, + { + "index_id": 59, + "parent_id": 55, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 3, + "page_path": null, + "pdf_id": 59, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "As an example shown in Figure 2, the Layout Parsing phase identifies diverse blocks, typing them as Title Text Table , , , and Image . During the Section Filtering phase, the Title candidates (e.g., \"Method\", \"Experiment\", and \"MOE Layer\") are analyzed by the LLM. The blocks 'Method' and 'Experiment' (both with 'FontSize: 14') are correctly identified as Section nodes at 'Level: 2'. Conversely, the 'MOE Layer' block ('FontSize: 20'), which was erroneously tagged as Title by the parser, is re-classified by the LLM as a Text node with 'Level: None'. This correction is crucial for preserving the document's logical hierarchy. 
Following this process, all filtered and classified nodes are assembled into the final tree structure based on their determined levels and document order.", + "title_level": -1 + }, + "summary": "The document structure is accurately reconstructed by combining layout parsing with LLM-based correction: while the initial parsing phase identifies diverse blocks like titles and images, the subsequent section filtering phase uses an LLM to re-evaluate candidates based on context and font size, correcting errors such as misclassifying a large \"MOE Layer\" block as a section title. This ensures the final hierarchical tree correctly preserves the document's logical flow by assigning appropriate levels to nodes like \"Method\" and \"Experiment\" while demoting erroneous titles to standard text." + }, + { + "index_id": 60, + "parent_id": 55, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 3, + "page_path": null, + "pdf_id": 60, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "4", + "title_level": -1 + }, + "summary": "The provided content consists solely of the number \"4\" and lacks sufficient context, text, or data to form a meaningful summary or draw any conclusions." 
+ }, + { + "index_id": 61, + "parent_id": 46, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 4, + "page_path": null, + "pdf_id": 61, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "4.3 Graph Construction", + "title_level": 2 + }, + "summary": "This section details the modality-dependent methodology for constructing a knowledge graph by extracting entities and relations from tree nodes using LLMs or VLMs, while introducing a gradient-based entity resolution approach to efficiently merge fragmented conceptual entities." + }, + { + "index_id": 62, + "parent_id": 61, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 4, + "page_path": null, + "pdf_id": 62, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Once the tree 𝑇 is established, we proceed to populate the Knowledge Graph 𝐺 by extracting and refining entities from the tree nodes.", + "title_level": -1 + }, + "summary": "After establishing the tree 𝑇, the Knowledge Graph 𝐺 is populated by extracting and refining entities from the tree's nodes." + }, + { + "index_id": 63, + "parent_id": 61, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 4, + "page_path": null, + "pdf_id": 63, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "4.3.1 KG Construction. We iterate each node 𝑛 𝑖 ∈ 𝑁 from the previously constructed tree 𝑇 . For each node 𝑛 𝑖 , we extract a subgraph 𝑔 𝑖 = ( 𝑉 , 𝐸 𝑖 𝑅𝑖 ) based on its content 𝑐 𝑖 and final node type 𝜏 ' 𝑖 . 
This extraction is modality-dependent: if the node is text-only, an LLM is prompted to extract entities and relations, while for nodes containing visual elements (e.g., 𝜏 ' 𝑖 = Image ), a Vision Language Model (VLM) is employed to extract visual knowledge. Crucially, for every entity 𝑣 ∈ 𝑉 𝑖 extracted, its origin tree node 𝑛 𝑖 is recorded, which is vital for constructing the final mapping 𝑀 .", + "title_level": 3 + }, + "summary": "This section details the modality-dependent methodology for constructing a knowledge graph by extracting entities and relations from tree nodes using LLMs or VLMs while preserving structural semantics through typed entities and explicit containment relationships." + }, + { + "index_id": 64, + "parent_id": 63, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 4, + "page_path": null, + "pdf_id": 64, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Furthermore, to preserve structural semantics for specific logical types (e.g., Table , Formula ), our process first creates a distinct, typed entity (e.g., 𝑣 table representing the table itself). The other extracted entities from the specific node's content are linked to this primary vertex. For Table nodes specifically, row and column headers are also explicitly extracted as distinct entities and linked to 𝑣 table via a 'ContainedIn' relationship.", + "title_level": -1 + }, + "summary": "To preserve structural semantics for logical types like tables and formulas, the process creates a primary typed entity (e.g., 𝑣 table) that serves as a central hub; all extracted content from the node is linked to this entity, while specific structural elements such as row and column headers are explicitly defined as distinct entities connected via a 'ContainedIn' relationship." 
+ }, + { + "index_id": 65, + "parent_id": 61, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 4, + "page_path": null, + "pdf_id": 65, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "4.3.2 Gradient-based Entity Resolution. As shown in the literature [62, 66], a well-constructed KG is essential for document question answering. A common challenge in the extraction process is that the same conceptual entity is often fragmented into multiple distinct entities due to abbreviations, co-references, or its varied occurrences across different document sections. This necessitates a robust Entity Resolution (ER) process, which identifies and merges these fragmented entities to refine the raw KG.", + "title_level": 3 + }, + "summary": "This section introduces a gradient-based entity resolution methodology that efficiently merges fragmented conceptual entities in a knowledge graph by leveraging incremental vector searches, reranking, and gradient thresholds to avoid the computational overhead of quadratic comparisons." + }, + { + "index_id": 66, + "parent_id": 65, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 4, + "page_path": null, + "pdf_id": 66, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "However, conventional ER methods are computationally expensive. They are often designed for batch processing across multiple data sources (commonly referred to as dirty ER), aiming to ensure accurate entity resolution by finding all possible matching pairs [12]. This process typically requires finding the transitive closure of all detected matches. 
That is, to definitively merge multiple entities (e.g., A, B, and C) as the same concept, the system must ideally compare all possible pairs ('A-B', 'A-C', and 'B-C') to confirm their equivalence. This can lead to a quadratic ( 𝑂 𝑛 ( 2 ) ) number of pairwise comparisons, a process that becomes prohibitively slow and computationally expensive when relying on LLMs for high-accuracy judgments.", + "title_level": -1 + }, + "summary": "Conventional entity resolution (ER) methods are computationally expensive and inefficient for large-scale applications, particularly when utilizing large language models (LLMs). Designed for batch processing of dirty data, these methods require finding the transitive closure of matches by comparing all possible pairs of entities (e.g., A-B, A-C, B-C) to ensure accuracy. This approach results in a quadratic $O(n^2)$ number of comparisons, making the process prohibitively slow as data volume increases." + }, + { + "index_id": 67, + "parent_id": 65, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 4, + "page_path": null, + "pdf_id": 67, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "To address this, we employ a gradient-based ER method, operating on a single document (simplified as the clean ER), which performs ER incrementally as each new entity 𝑣 𝑛 is extracted. This transforms the quadratic batch problem into a simpler, repeated lookup task: determining where the single new entity 𝑣 𝑛 fits among the already-processed entities in the database. 
This incremental process yields two distinct, observable scoring patterns when 𝑣 𝑛 is reranked against its 𝑡𝑜𝑝 _ 𝑘 most relevant candidates:", + "title_level": -1 + }, + "summary": "The proposed gradient-based Entity Resolution (ER) method processes documents incrementally by treating each new entity as a single lookup against existing database entries, thereby converting a computationally expensive quadratic batch problem into a series of simpler repeated lookups that generate two distinct scoring patterns when the new entity is reranked against its top-k candidates." + }, + { + "index_id": 68, + "parent_id": 65, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 4, + "page_path": null, + "pdf_id": 68, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "5", + "title_level": -1 + }, + "summary": "The provided content consists solely of the number \"5\" and lacks sufficient context, text, or data to form a meaningful summary or draw any conclusions." + }, + { + "index_id": 69, + "parent_id": 65, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 4, + "page_path": null, + "pdf_id": 69, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Algorithm 1: Gradient-based entity resolution", + "title_level": -1 + }, + "summary": "Algorithm 1 outlines a gradient-based approach for entity resolution, utilizing gradient optimization techniques to iteratively refine the matching of entity records and improve resolution accuracy." 
+ }, + { + "index_id": 70, + "parent_id": 65, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 4, + "page_path": null, + "pdf_id": 70, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Input: KG 𝐺 , New entity 𝑣 𝑛 , Rerank model R , Entity vector database 𝐷𝐵 , Vector search number 𝑡𝑜𝑝 _ 𝑘 , threshold of gradient 𝑔", + "title_level": -1 + }, + "summary": "The process leverages a Knowledge Graph (KG), a new entity, a reranking model, and a vector database to identify the most relevant existing entities by performing a top-k vector search and applying a gradient threshold to filter results." + }, + { + "index_id": 71, + "parent_id": 65, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 4, + "page_path": null, + "pdf_id": 71, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "// Vector Search 𝑡𝑜𝑝 _ 𝑘 relevant entities in 𝐷𝐵 . 1 𝐸 𝑐 ← Search( 𝐷𝐵, 𝑣 𝑛 , 𝑡𝑜𝑝 _ 𝑘 ); 2 S ← R( 𝐸 , 𝑣 𝑐 𝑛 ) ; // Sort all candidate entities by rerank scores. 3 Sort( 𝐸 , 𝑐 S ); 4 𝑠𝑐𝑜𝑟𝑒 ← S[ ] 0 , 𝑆𝑒𝑙 ← 𝐸 𝑐 [ 0 ; ] // Gradient select similar entities. 5 for each remain entity", + "title_level": -1 + }, + "summary": "The algorithm identifies the top-k most relevant entities from a database using vector search, then refines these candidates by sorting them based on rerank scores to select the most similar entities, ultimately using a gradient-based selection process to filter the remaining results." 
+ }, + { + "index_id": 72, + "parent_id": 65, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 4, + "page_path": null, + "pdf_id": 72, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Case A: New Entity. If 𝑣 𝑛 is a new conceptual entity, its relevance scores against all existing entities will be uniformly low, showing no significant gradient or discriminative pattern.", + "title_level": -1 + }, + "summary": "When a new conceptual entity ($v_n$) is introduced, its relevance scores against all existing entities remain uniformly low, exhibiting no significant gradient or discriminative pattern." + }, + { + "index_id": 73, + "parent_id": 65, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 4, + "page_path": null, + "pdf_id": 73, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Case B: Existing Entity. If 𝑣 𝑛 is an alias of an existing entity, its scores will show a high relevance to the true match (or a small set of equivalent aliases). Due to the reranker's inherent discriminative limitations, this initial high-relevance set might occasionally contain multiple similar entities. This high-relevance set is then typically followed by a sharp decline (a large 'gradient') before transitioning to a gradual slope of irrelevant entities.", + "title_level": -1 + }, + "summary": "When a query matches an existing entity alias, the reranker initially identifies a high-relevance cluster of equivalent aliases, which is typically followed by a sharp drop in scores before transitioning to a gradual decline of irrelevant results." 
+ }, + { + "index_id": 74, + "parent_id": 65, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 4, + "page_path": null, + "pdf_id": 74, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Our Gradient-based ER algorithm is designed precisely to detect this sharp decline (characteristic of Case B), allowing us to efficiently isolate the high-relevance set. Subsequently, an LLM is utilized for finer-grained distinction when multiple similar entities are identified within this set, differentiating it from the 'no gradient' scenario (Case A) without quadratic comparisons.", + "title_level": -1 + }, + "summary": "The proposed Gradient-based ER algorithm efficiently isolates high-relevance entities by detecting sharp declines (Case B) and employs an LLM for fine-grained distinction among similar entities, thereby avoiding the computational overhead of quadratic comparisons required in 'no gradient' scenarios (Case A)." + }, + { + "index_id": 75, + "parent_id": 65, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 4, + "page_path": null, + "pdf_id": 75, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Algorithm 1 shows the above entity resolution process. For a new entity 𝑣 𝑛 , we first retrieve its 𝑡𝑜𝑝 _ 𝑘 candidates 𝐸 𝑐 from the vector database 𝐷𝐵 , which are then reranked by R against 𝑣 𝑛 and sorted based on their scores S (Lines 1-3). We initialize the selection set 𝑆𝑒𝑙 with the top-scoring candidate 𝐸 𝑐 [ 0 ] and set the initial score to S[ ] 0 (Line 4). We then iterate through the remaining sorted candidates (Lines 5-8). 
The core logic checks if the current score S[ 𝑣 𝑐 ] is still within the gradient threshold 𝑔 of the previous score (i.e., S[ 𝑣 𝑐 ] > score / 𝑔 ). If the score drop is gentle (passes the check), the candidate 𝑣 𝑐 is added to 𝑆𝑒𝑙 , and score is updated (Lines 7-8); otherwise, the loop breaks (Line 8) as soon as a sharp score drop is detected. Finally, the algorithm makes its decision (Lines 9-14). If the selection set 𝑆𝑒𝑙 is identical to 𝐸 𝑐 , this indicates that all candidates passed the gradient check. This corresponds to Case A , where the scores lacked discriminative power (i.e., 𝑣 𝑛 is equally dissimilar to all candidates), so 𝑣 𝑛 is added as a new entity (Line 9-10). Conversely, if a gradient was found (i.e., 𝑙𝑒𝑛𝑔𝑡ℎ 𝑆𝑒𝑙 ( ) < 𝑙𝑒𝑛𝑔𝑡ℎ ( 𝐸 𝑐 ) ), this signals Case B . We then select the canonical entity 𝑣 𝑠𝑒𝑙 from 𝑆𝑒𝑙 , using an LLM (Line 13) if the reranker identifies multiple aliases, and merge 𝑣 𝑛 with it (Lines 12-14). The updated 𝐺 and 𝐷𝐵 are then returned (Line 15).", + "title_level": -1 + }, + "summary": "Algorithm 1 resolves new entities by retrieving top-k candidates from a vector database, reranking them, and applying a gradient threshold to determine the outcome. It iterates through sorted candidates, adding them to a selection set only if the score drop from the previous candidate remains within a defined gradient threshold; the process stops immediately upon detecting a sharp score drop. If all candidates pass this check (Case A), the new entity is treated as unique and added to the database. Conversely, if a sharp drop occurs (Case B), the algorithm selects a canonical entity from the valid selection set—potentially using an LLM to resolve multiple aliases—and merges the new entity with it, updating the database accordingly." 
+ }, + { + "index_id": 76, + "parent_id": 65, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 5, + "page_path": null, + "pdf_id": 76, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "For instance, considering the example in Figure 2, when the new entity 𝑒 9 is processed, it is first compared with existing entities in the KG. As depicted in the similarity curve (orange line), 𝑒 9 shows high similarity with 𝑒 7, followed by a sharp decline in similarity with other entities like 𝑒 6, 𝑒 8, and 𝑒 5. Our gradient-based selection process identifies 𝑒 7 as the unique, high-confidence match for 𝑒 9. Consequently, 𝑒 9 is merged with 𝑒 7, enriching the KG with consolidated information as shown in the final merged entity 𝑒 ' 7 .", + "title_level": -1 + }, + "summary": "When processing a new entity like 𝑒 9, a gradient-based selection process identifies a unique, high-confidence match (𝑒 7) by analyzing similarity curves, where 𝑒 9 exhibits high similarity with 𝑒 7 followed by a sharp decline with other entities. This match triggers a merge operation, consolidating 𝑒 9 and 𝑒 7 into a single enriched entity (𝑒 ' 7) to enhance the knowledge graph." + }, + { + "index_id": 77, + "parent_id": 65, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 5, + "page_path": null, + "pdf_id": 77, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Graph-Tree Link (GT-Link). The GT-Link 𝑀 is formalized to complete the BookIndex 𝐵 = ( 𝑇,𝐺, 𝑀 ) . As described in the KG Construction phase, the origin tree node 𝑛 𝑖 is recorded for every newly extracted entity 𝑣 𝑖 . 
GT-Link is then refined during entity resolution: when an entity 𝑣 𝑛 is merged into a canonical entity 𝑣 𝑠𝑒𝑙 , the origin node set of 𝑣 𝑠𝑒𝑙 is updated to include all origin nodes previously associated with 𝑣 𝑛 . This aggregation process creates the final mapping 𝑀 : 𝑉 → P( 𝑁 ) , which bi-directionally links the entities in 𝐺 to the set of their structural locations (nodes) in 𝑇 .", + "title_level": -1 + }, + "summary": "The Graph-Tree Link (GT-Link) is a bi-directional mapping ($M: V \\to \\mathcal{P}(N)$) that connects entities in a knowledge graph to their specific structural locations within an origin tree. This link is established by recording the source tree node for each extracted entity and is refined during entity resolution: when multiple entities are merged into a canonical entity, their respective origin nodes are aggregated. This process ensures the final mapping accurately reflects all structural positions associated with each canonical entity, completing the BookIndex structure." + }, + { + "index_id": 78, + "parent_id": 1, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 5, + "page_path": null, + "pdf_id": 78, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "5 AGENT-BASED RETRIEVAL", + "title_level": 1 + }, + "summary": "Section 5 details BookRAG's agent-based retrieval framework, which integrates dynamic planning and structured execution to intelligently orchestrate multi-hop reasoning and information foraging for complex document queries." 
+ }, + { + "index_id": 79, + "parent_id": 78, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 5, + "page_path": null, + "pdf_id": 79, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Real-world document queries are often complex, necessitating operations like modal type filtering, semantic selection, and multi-hop reasoning. To address this, we propose an agent-based approach in BookRAG, which intelligently plans and executes operations on the BookIndex. We first introduce the overall workflow and present two core mechanisms: Agent-based Planning , which formulates the strategy, and the Structured Execution , which includes the retrieval process under the principles of IFT and generation.", + "title_level": -1 + }, + "summary": "BookRAG addresses the complexity of real-world document queries by introducing an agent-based approach that intelligently plans and executes operations on a BookIndex. This system relies on two core mechanisms: Agent-based Planning, which formulates the necessary strategies for tasks like modal filtering and multi-hop reasoning, and Structured Execution, which carries out retrieval and generation processes based on IFT principles." 
+ }, + { + "index_id": 80, + "parent_id": 78, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 5, + "page_path": null, + "pdf_id": 80, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "5.1 Overall Workflow", + "title_level": 2 + }, + "summary": "This section outlines BookRAG's three-stage agent-based retrieval workflow, which integrates planning, scent-guided retrieval, and synthesis to effectively address complex user queries." + }, + { + "index_id": 81, + "parent_id": 80, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 5, + "page_path": null, + "pdf_id": 81, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "The overall workflow of agent-based retrieval, illustrated in Figure 3, follows a three-stage pipeline designed to address users' queries systematically.", + "title_level": -1 + }, + "summary": "Agent-based retrieval operates through a systematic three-stage pipeline designed to address user queries effectively." + }, + { + "index_id": 82, + "parent_id": 80, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 5, + "page_path": null, + "pdf_id": 82, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "1. Agent-based Planning. BookRAG first performs Classification & Plan . This stage aims to distinguish simple keyword-based queries from reasoning questions that require decomposition and analysis. 
For instance, a query like 'How does Transformer differ from RNNs in handling long-range dependencies?' cannot be solved by retrieving from a single keyword. Therefore, the planning stage first performs query classification . Based on this classification and a predefined set of operators designed for the BookIndex, it generates a specific operators plan that effectively guides the retrieval and generation strategies.", + "title_level": -1 + }, + "summary": "BookRAG's Agent-based Planning initiates with a Classification & Plan stage that distinguishes simple keyword queries from complex reasoning questions requiring decomposition. For intricate queries, such as comparing Transformer and RNN architectures, the system classifies the input and generates a tailored operator plan based on predefined BookIndex operators, thereby guiding effective retrieval and generation strategies rather than relying on basic keyword matching." + }, + { + "index_id": 83, + "parent_id": 80, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 5, + "page_path": null, + "pdf_id": 83, + "pdf_para_block": { + "docling_label": "caption" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Figure 3: The general workflow of agent-based retrieval in BookRAG, which contains agent-based planning, retrieval, and generation processes.", + "title_level": -1 + }, + "summary": "BookRAG's agent-based retrieval operates through a comprehensive workflow integrating three core processes: planning, retrieval, and generation, as illustrated in Figure 3." 
+ }, + { + "index_id": 84, + "parent_id": 80, + "type": "NodeType.IMAGE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 5, + "page_path": null, + "pdf_id": 84, + "pdf_para_block": { + "docling_label": "picture" + }, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-3.png", + "image_width": 0, + "image_height": 0, + "caption": "cref='#/texts/89'", + "footnote": "", + "table_body": null, + "content": "cref='#/texts/89'", + "title_level": -1 + }, + "summary": "The provided content does not contain any substantive information to summarize, as it consists solely of a placeholder indicating an image with a reference caption code and lacks any actual text, data, or descriptive details." + }, + { + "index_id": 85, + "parent_id": 80, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 5, + "page_path": null, + "pdf_id": 85, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "2. Retrieval Process. Guided by the operator plan, the retrieval process executes Scent/Filter-based Retrieval . This stage navigates the BookIndex 𝐵 = ( 𝑇,𝐺, 𝑀 ) , either utilizing a scent-based retrieval principle (e.g., following relevant entities in 𝐺 ) to find information, or employing various filters (e.g., modal type) to refine the selection. After reasoning, BookRAG gets the retrieval set of highly relevant information blocks from the BookIndex.", + "title_level": -1 + }, + "summary": "The retrieval process executes Scent/Filter-based Retrieval guided by an operator plan to navigate the BookIndex, utilizing scent-based principles to follow relevant entities or applying filters like modal type to refine selections, ultimately yielding a set of highly relevant information blocks for reasoning." 
+ }, + { + "index_id": 86, + "parent_id": 80, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 5, + "page_path": null, + "pdf_id": 86, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "3. Generation Process. Finally, all retrieved information enters the generation stage for Analysis & Merging . This stage synthesizes these (often fragmented) pieces of evidence, performs final analysis, and formulates a coherent response.", + "title_level": -1 + }, + "summary": "The generation process synthesizes retrieved, often fragmented information to perform final analysis and formulate a coherent response." + }, + { + "index_id": 87, + "parent_id": 78, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 5, + "page_path": null, + "pdf_id": 87, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "5.2 Agent-based Planning", + "title_level": 2 + }, + "summary": "Section 5.2 details BookRAG's agent-based planning mechanism, which dynamically constructs tailored retrieval pipelines by classifying queries into single-hop, multi-hop, or global aggregation categories and orchestrating a flexible sequence of Formulator, Selector, Reasoner, and Synthesizer operators to execute the optimal strategy." 
+ }, + { + "index_id": 88, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 5, + "page_path": null, + "pdf_id": 88, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "The planning stage is the core of BookRAG, designed to intelligently navigate our BookIndex 𝐵 = ( 𝑇,𝐺, 𝑀 ) . To support flexible retrieval, we define four types of operators: Formulator, Selector, Reasoner, and Synthesizer. These operators can be arbitrarily combined to form tailored execution pipelines, each with adjustable parameters. BookRAG dynamically configures and assembles these operators to adapt to the specific requirements of different query categories. This process involves two sequential steps: first, the agent performs", + "title_level": -1 + }, + "summary": "BookRAG's planning stage serves as its core mechanism for intelligently navigating the BookIndex by dynamically assembling flexible retrieval pipelines. It achieves this through four adaptable operators—Formulator, Selector, Reasoner, and Synthesizer—which can be arbitrarily combined and parameterized to tailor execution strategies for specific query categories." + }, + { + "index_id": 89, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 5, + "page_path": null, + "pdf_id": 89, + "pdf_para_block": { + "docling_label": "caption" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Table 2: Three common query categories addressed in BookRAG.", + "title_level": -1 + }, + "summary": "BookRAG addresses three common query categories, as detailed in Table 2." 
+ }, + { + "index_id": 90, + "parent_id": 87, + "type": "NodeType.TABLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 5, + "page_path": null, + "pdf_id": 90, + "pdf_para_block": { + "docling_label": "table" + }, + "img_path": "", + "image_width": 0, + "image_height": 0, + "caption": "cref='#/texts/95'", + "footnote": "", + "table_body": "| Query Category | Description | Core Task | Example Query |\n|--------------------|-------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------|\n| Single-hop | Queries with a single, distinct information target. | Scent-based Retrieval : Retrieve content related to a specific entity or section. | What is the definition of Information Scent? |\n| Multi-hop | Queries that require synthesizing information from multiple blocks, often by decomposing into sub-problems. | Decomposing & Merging : Decompose into sub-problems, retrieve for each, and synthesize the final answer. | How does Transformer differ from RNNs in handling long-range dependencies? |\n| Global Aggregation | Queries that require filtering across the entire document and performing calculations. | Filter & Aggregation : Apply filters across the document & perform aggregation operations (e.g., Count, Sum). | How many figures related to IFT are in Section 4? 
|", + "content": "cref='#/texts/95'", + "title_level": -1 + }, + "summary": "The document categorizes information retrieval queries into three distinct types based on their complexity and required processing: **Single-hop** queries target a specific entity using scent-based retrieval; **Multi-hop** queries demand decomposing complex questions into sub-problems, retrieving data for each, and synthesizing the final answer; and **Global Aggregation** queries involve filtering the entire document to perform calculations like counting or summing." + }, + { + "index_id": 91, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 5, + "page_path": null, + "pdf_id": 91, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "6", + "title_level": -1 + }, + "summary": "The provided input consists solely of the number \"6\" and contains no substantive information, context, or data to summarize." + }, + { + "index_id": 92, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 92, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "(a) Operator Set", + "title_level": -1 + }, + "summary": "The provided content is limited to the heading \"Operator Set\" and contains no substantive information, data, or context to summarize; therefore, no core conclusion or key points can be extracted." 
+ }, + { + "index_id": 93, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 93, + "pdf_para_block": { + "docling_label": "caption" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Figure 4: The BookRAG Operator Library and an Execution Example from MMLongBench dataset: (a) a visual depiction of the four operator types (Formulator, Selector, Reasoner, and Synthesizer) and (b) an execution trace for a 'Single-hop' query, demonstrating the agent-based planning and step-by-step operator execution.", + "title_level": -1 + }, + "summary": "Figure 4 illustrates the BookRAG Operator Library, which utilizes four distinct agent types—Formulator, Selector, Reasoner, and Synthesizer—to execute complex queries. Through an execution trace of a 'Single-hop' query from the MMLongBench dataset, the figure demonstrates how these operators collaborate in a step-by-step, agent-based planning process to retrieve and synthesize information." + }, + { + "index_id": 94, + "parent_id": 87, + "type": "NodeType.IMAGE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 94, + "pdf_para_block": { + "docling_label": "picture" + }, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-4.png", + "image_width": 0, + "image_height": 0, + "caption": "cref='#/texts/98'", + "footnote": "", + "table_body": null, + "content": "cref='#/texts/98'", + "title_level": -1 + }, + "summary": "The provided content consists solely of an image with a reference caption and lacks any descriptive text, data, or context necessary to extract a meaningful summary or core conclusion." 
+ }, + { + "index_id": 95, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 95, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Query Classification to determine the appropriate solution strategy, then generates a specific Operator Plan .", + "title_level": -1 + }, + "summary": "The process begins by classifying the query to determine the optimal solution strategy, which then guides the generation of a specific operator plan." + }, + { + "index_id": 96, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 96, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Query Classification . To enable agent strategy selection, we focus on three representative query categories defined by their intrinsic complexity and operational demands (Table 2): Single-hop , Multi-hop , and Global Aggregation . This classification is crucial because each category requires a different solution strategy. For instance, a Single-hop query typically requires a single piece of information retrieved via a Scent-based Retrieval operation. In contrast, a Global Aggregation query often necessitates analyzing content under multiple filtering conditions, usually involving a sequence of Filter & Aggregation operations across various parts of the document. 
Furthermore, BookRAG is designed to be extensible, allowing for the resolution of a broader range of query types by integrating additional operators.", + "title_level": -1 + }, + "summary": "BookRAG employs a query classification system—categorizing requests as Single-hop, Multi-hop, or Global Aggregation based on their complexity—to dynamically select the most effective retrieval and processing strategies, such as Scent-based Retrieval or Filter & Aggregation sequences, while maintaining extensibility to support additional query types through new operators." + }, + { + "index_id": 97, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 97, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· BookIndex Operators . To execute the strategies identified by classification, we designed a set of operators ( O ) tailored for the BookIndex 𝐵 = ( 𝑇,𝐺, 𝑀 ) . These operators, visually depicted in Figure 4(a) and detailed in Table 3, define the set of operations the agent can employ for diverse query categories. We group them into four types, which we describe in sequence:", + "title_level": -1 + }, + "summary": "To execute strategies identified by classification, a set of tailored operators (O) was designed for the BookIndex (B = T, G, M). These operators, categorized into four distinct types and detailed in Table 3, define the specific operations an agent can perform across diverse query categories." 
+ }, + { + "index_id": 98, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 98, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "❶ Formulator. These are LLM-based operators that prepare the query for execution. This category includes Decompose , which breaks a Complex query into a set of simpler, actionable sub-queries 𝑄 𝑠 . It also includes Extract , which employs an LLM to identify key entities 𝐸 𝑞 from the query text and link them to corresponding entities in the KG, 𝐺 : 𝑄 𝑠 = LLM ( 𝑃 𝐷𝑒𝑐 , 𝑞 ) = { 𝑞 , 𝑞 1 2 , . . . , 𝑞 𝑘 } (2) 𝐸 𝑞 = LLM ( 𝑃 𝐸𝑥𝑡 , 𝑞 ) = { 𝑒 1 , 𝑒 2 , . . . , 𝑒 𝑚 } (3)", + "title_level": -1 + }, + "summary": "Formulators are LLM-based operators designed to prepare complex queries for execution by transforming them into actionable components. This process primarily involves two functions: Decompose, which breaks a complex query into a set of simpler sub-queries, and Extract, which identifies key entities within the query text and links them to corresponding entities in the knowledge graph." + }, + { + "index_id": 99, + "parent_id": 87, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 99, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "𝑄 𝑠 = LLM ( 𝑃 𝐷𝑒𝑐 , 𝑞 ) = { 𝑞 , 𝑞 1 2 , . . . 
, 𝑞 𝑘 } (2)", + "title_level": -1 + }, + "summary": "The equation defines a process where a Large Language Model (LLM) takes a prompt ($P_{Dec}$) and an initial query ($q$) as inputs to generate a set of $k$ related queries ($Q_s$), starting with the original query and followed by $k-1$ derived variations." + }, + { + "index_id": 100, + "parent_id": 87, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 100, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "𝐸 𝑞 = LLM ( 𝑃 𝐸𝑥𝑡 , 𝑞 ) = { 𝑒 1 , 𝑒 2 , . . . , 𝑒 𝑚 } (3)", + "title_level": -1 + }, + "summary": "The equation defines a process where a Large Language Model (LLM) takes an external prompt ($P_{Ext}$) and a query ($q$) as inputs to generate a set of $m$ distinct embeddings ($e_1$ through $e_m$), representing the model's encoded response to that specific query." + }, + { + "index_id": 101, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 101, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Here, 𝑞 is the original user query, while 𝑃 𝐷𝑒𝑐 and 𝑃 𝐸𝑥𝑡 represent the prompts used to guide the LLM for the decomposition and extraction tasks, respectively.", + "title_level": -1 + }, + "summary": "In this framework, the original user query ($q$) is processed using two distinct prompts: $P_{Dec}$ to guide the Large Language Model (LLM) in decomposing the task, and $P_{Ext}$ to direct it in extracting the necessary information." 
+ }, + { + "index_id": 102, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 102, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "❷ Selector. These operators filter or select specific content ranges from the BookIndex. Filter_Modal and Filter_Range directly apply the explicit constraints 𝐶 (e.g., modal types, page ranges) generated during the plan. Operating on the Tree 𝑇 = ( 𝑁, 𝐸 𝑇 ) , these operators produce a filtered subset 𝑁 𝑓 where the predicate 𝐶 𝑛 ( ) holds true for each node: 𝑁 𝑓 = { 𝑛 ∈ 𝑁 | 𝐶 𝑛 ( )} (4)", + "title_level": -1 + }, + "summary": "The Selector operators (Filter_Modal and Filter_Range) filter the BookIndex by applying explicit constraints to a tree structure, generating a subset of nodes that satisfy specific criteria such as modal types or page ranges." + }, + { + "index_id": 103, + "parent_id": 87, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 103, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "𝑁 𝑓 = { 𝑛 ∈ 𝑁 | 𝐶 𝑛 ( )} (4)", + "title_level": -1 + }, + "summary": "Equation (4) defines the set $N_f$ as the collection of all natural numbers $n$ that satisfy the condition $C_n$." 
+ }, + { + "index_id": 104, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 104, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "In contrast, Select_by_Entity and Select_by_Section target contiguous document segments by retrieving subtrees rooted at specific section nodes. This process first identifies a set of target section nodes 𝑆 target ⊂ 𝑁 at a specified depth, where 𝑆 target consists of sections either linked to entities 𝐸 𝑞 via the GT-Link 𝑀 or selected by the LLM. It then retrieves all descendants of these targets to form the selected node set 𝑁 𝑠 : 𝑁 𝑠 = GLYPH<216> 𝑠 ∈ 𝑆 target Subtree ( 𝑠 ) (5)", + "title_level": -1 + }, + "summary": "Select_by_Entity and Select_by_Section condense documents by retrieving contiguous segments rooted at specific section nodes; this process first identifies target sections (either linked to entities via GT-Link or selected by an LLM) at a defined depth and then aggregates all their descendants to form the final selected content set." + }, + { + "index_id": 105, + "parent_id": 87, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 105, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "𝑁 𝑠 = GLYPH<216> 𝑠 ∈ 𝑆 target Subtree ( 𝑠 ) (5)", + "title_level": -1 + }, + "summary": "The equation $N_s = \\sum_{s \\in S} \\text{target Subtree}(s)$ defines the total count $N_s$ as the sum of target subtrees across all elements $s$ within the set $S$." 
+ }, + { + "index_id": 106, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 106, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "❸ Reasoner. These operators analyze and refine selected tree nodes. Graph_Reasoning performs multi-hop inference on a subgraph 𝐺 ' ( 𝑉 , 𝐸 ' ' ) (extracted from selected nodes 𝑁 𝑠 ) starting from entity 𝑒 . Starting from the retrieved entities, it computes an entity importance vector 𝐼 𝐺 ∈ R | 𝑉 ' | over the subgraph 𝐺 ' using the PageRank algorithm [20]. These entity scores are then mapped to the tree nodes via the GT-Link matrix 𝑀 to derive the final tree node importance scores vector 𝑆 𝐺 ∈ R | 𝑁 𝑠 | : 𝐼 𝐺 = PageRank ( 𝐺 , 𝑒 ' ) (6) 𝑆 𝐺 = 𝐼 𝐺 × 𝑀 (7)", + "title_level": -1 + }, + "summary": "The Reasoner operator refines selected tree nodes by performing multi-hop inference on a subgraph extracted from those nodes. It begins at a specific entity, calculates an entity importance vector using the PageRank algorithm, and then maps these scores to the tree nodes via the GT-Link matrix to generate a final vector of tree node importance scores." + }, + { + "index_id": 107, + "parent_id": 87, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 107, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "𝐼 𝐺 = PageRank ( 𝐺 , 𝑒 ' ) (6)", + "title_level": -1 + }, + "summary": "Equation (6) defines the PageRank of a graph $G$ as a function dependent on the graph structure and a specific parameter $e'$, denoted as $I_G = \\text{PageRank}(G, e')$." 
+ }, + { + "index_id": 108, + "parent_id": 87, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 108, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "𝑆 𝐺 = 𝐼 𝐺 × 𝑀 (7)", + "title_level": -1 + }, + "summary": "The total system gain ($S_G$) is calculated by multiplying the individual gain ($I_G$) by the multiplier ($M$), as defined by the equation $S_G = I_G \\times M$." + }, + { + "index_id": 109, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 109, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Text_Ranker evaluates the semantic relevance of the tree node's content to the query 𝑞 , assigning a relevance score 𝑆 𝑇 to each node. Skyline_Ranker employs the Skyline operator to filter nodes based on these multiple criteria (e.g., 𝑆 𝐺 and 𝑆 𝑇 ), retaining only those nodes that are not dominated by any others in terms of the specified scoring dimensions.", + "title_level": -1 + }, + "summary": "The system combines Text_Ranker, which assigns semantic relevance scores to tree nodes based on a query, with Skyline_Ranker, which filters these nodes by retaining only those that are not dominated by others across multiple scoring dimensions." 
+ }, + { + "index_id": 110, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 6, + "page_path": null, + "pdf_id": 110, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "7", + "title_level": -1 + }, + "summary": "The provided input contains only the number \"7\" and lacks sufficient context, text, or data to generate a meaningful summary." + }, + { + "index_id": 111, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 111, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "❹ Synthesizer. These operators are responsible for content generation. Map performs analysis on specific retrieved information segments to generate partial responses. Reduce synthesizes a final coherent answer by aggregating information from multiple sources, such as partial answers or a collection of retrieved evidence.", + "title_level": -1 + }, + "summary": "Synthesizer operators drive content generation by analyzing retrieved information to produce partial responses (Map) and aggregating these partial answers or evidence from multiple sources to construct a final, coherent response (Reduce)." + }, + { + "index_id": 112, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 112, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Operator Plan . 
After classifying the query ( 𝑞 ) into its category ( 𝑐 ), the agent's final task is to generate an executable plan 𝑃 . This plan is a specific sequence of operators ⟨ 𝑜 , . . . , 𝑜 1 𝑛 ⟩ selected from our library O with parameters dynamically instantiated based on 𝑞 . This process is formulated as: 𝑃 = Agent Plan ( 𝑞, 𝑐, O) (8)", + "title_level": -1 + }, + "summary": "The agent's final task is to generate an executable plan by selecting a specific sequence of operators from a library and dynamically instantiating their parameters based on the classified query and its category." + }, + { + "index_id": 113, + "parent_id": 87, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 113, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "𝑃 = Agent Plan ( 𝑞, 𝑐, O) (8)", + "title_level": -1 + }, + "summary": "The agent's plan ($P$) is a function determined by the query ($q$), context ($c$), and observations ($O$)." + }, + { + "index_id": 114, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 114, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "The plan follows a structured workflow tailored to each category:", + "title_level": -1 + }, + "summary": "The plan utilizes a structured workflow that is specifically tailored to each category." 
+ }, + { + "index_id": 115, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 115, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Single-hop : The agent first attempts to Extract an entity. If successful, it executes a 'scent-based' selection; otherwise, it falls back to a section-based strategy. Both paths then proceed to standard reasoning and generation, denoted as 𝑃 std .", + "title_level": -1 + }, + "summary": "The single-hop strategy initiates with entity extraction, employing a scent-based selection upon success or a section-based fallback if extraction fails, before both paths converge on standard reasoning and generation." + }, + { + "index_id": 116, + "parent_id": 87, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 116, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "𝑃 s = ( Extract success - - - - -→ Select_by_Entity → 𝑃 std Extract fail - -→ Select_by_Section → 𝑃 std (9)", + "title_level": -1 + }, + "summary": "The success rate ($P_s$) is determined by a conditional selection process: if extraction succeeds, the probability is calculated by selecting based on the entity ($P_{std}$); if extraction fails, the probability is calculated by selecting based on the section ($P_{std}$)." 
+ }, + { + "index_id": 117, + "parent_id": 87, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 117, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "𝑃 std = ( Graph ∥ Text ) → Skyline → Reduce (10)", + "title_level": -1 + }, + "summary": "The process condenses a combined graph and text input into a final result of 10 items by first generating a skyline and then applying a reduction step." + }, + { + "index_id": 118, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 118, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Complex : The agent first decomposes the problem, applies the Single-hop workflow 𝑃 s to each sub-problem, and finally synthesizes the results.", + "title_level": -1 + }, + "summary": "The Complex workflow operates by decomposing a problem into sub-problems, applying the Single-hop workflow to each individually, and then synthesizing the results to form the final solution." 
+ }, + { + "index_id": 119, + "parent_id": 87, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 119, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "𝑃 complex = Decompose → 𝑃 s → Map → Reduce (11)", + "title_level": -1 + }, + "summary": "The complex probability $P_{complex}$ is derived through a four-step process: decomposition into $P_s$, followed by mapping and reduction operations." + }, + { + "index_id": 120, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 120, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Global Aggregation : The workflow involves applying a sequence of filters followed by synthesis.", + "title_level": -1 + }, + "summary": "The workflow operates through global aggregation, which applies a sequence of filters followed by synthesis." + }, + { + "index_id": 121, + "parent_id": 87, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 121, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "𝑃 global = GLYPH<214> ( Filter_Modal | Filter_Range ) → Map → Reduce (12)", + "title_level": -1 + }, + "summary": "The global process `P` executes a two-stage pipeline where a combined filter (either modal or range-based) is applied to data, followed by a Map operation and a Reduce operation with a parallelism factor of 12." 
+ }, + { + "index_id": 122, + "parent_id": 87, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 122, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Here, the symbol ˛ denotes the nested composition of filters, applying either a modal or range filter at each step.", + "title_level": -1 + }, + "summary": "The symbol ˛ represents the nested composition of filters, where a modal or range filter is applied sequentially at each step." + }, + { + "index_id": 123, + "parent_id": 78, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 123, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "5.3 Structured Execution", + "title_level": 2 + }, + "summary": "This section details BookRAG's structured execution methodology, which applies Information Foraging Theory through a three-stage pipeline of Selector-based retrieval, Reasoner-driven sensemaking, and Synthesizer-based answer generation to efficiently minimize computational costs." + }, + { + "index_id": 124, + "parent_id": 123, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 124, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Following the planning stage, BookRAG executes the generated workflow 𝑃 . 
This execution phase embodies the cognitive principles of Information Foraging Theory (IFT), effectively translating abstract textual queries into concrete operations. Specifically, the Selector operators mirror the act of 'navigating to information patches,' narrowing the vast document space down to relevant scopes. Subsequently, the Reasoner operators perform 'sensemaking within patches,' where they analyze and refine the information within these focused scopes. Finally, the Synthesizer generates the answer based on the processed evidence. This design minimizes the cost of attention by ensuring computational resources are focused solely on high-value data patches.", + "title_level": -1 + }, + "summary": "BookRAG's execution phase applies Information Foraging Theory to efficiently answer queries by minimizing computational costs through a three-step process: the Selector navigates to relevant document patches, the Reasoner performs sensemaking within those focused scopes, and the Synthesizer generates the final answer based on the processed evidence." + }, + { + "index_id": 125, + "parent_id": 123, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 125, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Scent/Filter-based Retrieval. The execution begins by narrowing the scope. Aligning with IFT, Selector operators identify relevant 'patches' by following 'information scents' (e.g., key entities in question) or applying explicit filter constraints. 
This process reduces the full node set 𝑁 to a focused node subset 𝑁 𝑠 : 𝑁 𝑠 = Selector ( 𝑁, params sel ) (13)", + "title_level": -1 + }, + "summary": "Scent and filter-based retrieval initiates the process by narrowing the scope of a full node set to a focused subset through the use of Selector operators. These operators identify relevant data \"patches\" by following information scents, such as key entities within a question, or by applying explicit filter constraints, thereby reducing the search space before further processing." + }, + { + "index_id": 126, + "parent_id": 123, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 126, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "𝑁 𝑠 = Selector ( 𝑁, params sel ) (13)", + "title_level": -1 + }, + "summary": "The notation $N_s = \\text{Selector}(N, \\text{params}_{\\text{sel}})$ defines a selector operation that generates a specific subset $N_s$ from a dataset $N$ by applying a selection function configured with a set of parameters." + }, + { + "index_id": 127, + "parent_id": 123, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 127, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "This pre-selection minimizes noise and ensures that subsequent reasoning is applied only to highly relevant contexts, optimizing the foraging cost. Subsequently, within this focused scope, Reasoner operators evaluate nodes using multiple dimensions, such as graph topology and semantic relevance. We then employ the Skyline_Ranker to get the final retrieval set. 
Unlike fixed top𝑘 retrieval, the Skyline operator retains the Pareto frontier of nodes, retaining nodes that are valuable in at least one dimension while discarding dominated ones: 𝑁 𝑅 = Skyline_Ranker ({ 𝑆 𝐺 ( 𝑛 , 𝑆 ) 𝑇 ( 𝑛 ) | 𝑛 ∈ 𝑁 𝑠 }) (14)", + "title_level": -1 + }, + "summary": "The proposed retrieval method optimizes efficiency by first pre-selecting highly relevant contexts to minimize noise, then applying multi-dimensional evaluation (graph topology and semantic relevance) via Reasoner operators. Finally, it utilizes a Skyline_Ranker to generate the final retrieval set by preserving the Pareto frontier of nodes—keeping those valuable in at least one dimension while discarding dominated ones—thereby outperforming fixed top-k retrieval." + }, + { + "index_id": 128, + "parent_id": 123, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 128, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "𝑁 𝑅 = Skyline_Ranker ({ 𝑆 𝐺 ( 𝑛 , 𝑆 ) 𝑇 ( 𝑛 ) | 𝑛 ∈ 𝑁 𝑠 }) (14)", + "title_level": -1 + }, + "summary": "The Skyline Ranker ($N_R$) is defined as a ranking function that processes a set of candidate nodes ($n \\in N_s$) by evaluating their skyline score $SG(n, S)$ and time cost $T(n)$ to determine the optimal selection order." + }, + { + "index_id": 129, + "parent_id": 123, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 129, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Analysis & Merging Generation. 
In the final stage, the Synthesizer operator generates the coherent answer by aggregating the refined evidence: 𝐴 = Synthesizer ( 𝑞, 𝑁 𝑅 ) (15)", + "title_level": -1 + }, + "summary": "In the final stage of the process, the Synthesizer operator generates a coherent answer by aggregating refined evidence, mathematically represented as $A = \\text{Synthesizer}(q, N_R)$." + }, + { + "index_id": 130, + "parent_id": 123, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 130, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "𝐴 = Synthesizer ( 𝑞, 𝑁 𝑅 ) (15)", + "title_level": -1 + }, + "summary": "The equation $A = \\text{Synthesizer}(q, N_R)$ defines $A$ as the output generated by a Synthesizer function that takes a query $q$ and a set of $N_R$ elements as inputs." + }, + { + "index_id": 131, + "parent_id": 123, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 131, + "pdf_para_block": { + "docling_label": "caption" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Table 3: Operators utilized in our BookRAG, categorized by their function.", + "title_level": -1 + }, + "summary": "BookRAG employs a diverse set of operators categorized by their specific functional roles to facilitate its operations." 
+ }, + { + "index_id": 132, + "parent_id": 123, + "type": "NodeType.TABLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 132, + "pdf_para_block": { + "docling_label": "table" + }, + "img_path": "", + "image_width": 0, + "image_height": 0, + "caption": "cref='#/texts/136'", + "footnote": "", + "table_body": "| Operator | Type | Description | Parameters |\n|-------------------------------|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------|\n| Decompose | Formulator Formulator | Decompose a complex query into simpler, actionable sub-queries. Identify and extract key entities from the query (links to 𝐺 ). | (Self-contained) (Self-contained) |\n| Extract Filter_Modal | Selector | Filter retrieved nodes by their modal type (e.g., Table, Figure). | modal_type: str |\n| Filter_Range | Selector | Filter nodes based on a specified range (e.g., pages, section). | range: (start, end) |\n| Select_by_Entity | Selector | Selects all tree nodes ( 𝑁 ) in sections linked to a given entity ( 𝑉 ). | entity_name: str |\n| Select_by_Section | Selector | Uses an LLM to select relevant sections and selects all tree nodes ( 𝑁 ) within them. | query: str, sections: List[str] |\n| Graph_Reasoning | Reasoner | Performs multi-hop reasoning on subgraph ( 𝐺 ' ) and score tree nodes ( 𝑁 ) using graph importance and GT-links. Rerank retrieved tree nodes ( 𝑁 ) based on the relevance. | start_entity: str, subgraph: 𝐺 ' query: str |\n| Text_Reasoning Skyline_Ranker | Reasoner Reasoner | | criteria: List[str] |\n| | | Rerank nodes based on multiple criteria. | (Input: List[str]) |\n| Map | Synthesizer | Uses partially retrieved information to generate a partial answer. 
| |\n| Reduce | Synthesizer | Synthesizes the final answer from partial information or all sub-problem answers. | (Input: List[str]) |", + "content": "cref='#/texts/136'", + "title_level": -1 + }, + "summary": "The provided table defines a structured query processing framework comprising six functional categories: **Formulators** decompose complex queries into actionable sub-queries; **Selectors** filter or retrieve specific nodes based on modal types, page ranges, entities, or LLM-assisted section selection; **Reasoners** perform multi-hop graph analysis and multi-criteria ranking to score and reorder results; and **Synthesizers** generate partial answers and combine them into a final response. This pipeline systematically transforms raw queries into refined, synthesized answers through entity extraction, targeted filtering, logical reasoning, and information aggregation." + }, + { + "index_id": 133, + "parent_id": 123, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 7, + "page_path": null, + "pdf_id": 133, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "8", + "title_level": -1 + }, + "summary": "The provided content consists solely of the number \"8\" and lacks sufficient context, narrative, or data to form a meaningful summary or draw any conclusions." 
+ }, + { + "index_id": 134, + "parent_id": 123, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 134, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "The Map operator performs fine-grained analysis on individual evidence blocks or sub-problems (from Decompose ) to generate intermediate insights. The Reduce operator then aggregates these partial results, such as answers to decomposed sub-queries or statistical counts from a global filter, to construct the final response. This separation ensures that the system can handle both detailed content extraction and high-level reasoning synthesis effectively.", + "title_level": -1 + }, + "summary": "The Map and Reduce operators function as a complementary pair to ensure effective system performance: the Map operator conducts fine-grained analysis on individual evidence blocks to generate intermediate insights, while the Reduce operator aggregates these partial results to synthesize the final response, thereby enabling the system to seamlessly handle both detailed content extraction and high-level reasoning." + }, + { + "index_id": 135, + "parent_id": 123, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 135, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "To illustrate this end-to-end process, Figure 4(b) presents an execution trace for a 'Single-hop' query: 'What is the type of car in the Ranking Prompt example?'. In the planning phase, the agent classifies the query and generates a specific workflow. 
Subsequently, it identifies key entities (e.g., 'car') via Extract , retrieves relevant nodes via Select_by_Entity , refines them through reasoning and Skyline filtering, and finally synthesizes the answer using Reduce .", + "title_level": -1 + }, + "summary": "Figure 4(b) illustrates an end-to-end execution trace for a single-hop query regarding car types, where an agent classifies the request to generate a workflow that extracts key entities, retrieves and refines relevant nodes through reasoning and Skyline filtering, and finally synthesizes the answer." + }, + { + "index_id": 136, + "parent_id": 1, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 136, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "6 EXPERIMENTS", + "title_level": 1 + }, + "summary": "Section 6 presents a comprehensive experimental evaluation of BookRAG, detailing the setup, benchmark results, and ablation studies that demonstrate its state-of-the-art performance in document question-answering tasks." + }, + { + "index_id": 137, + "parent_id": 136, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 137, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "In our experiments, we evaluate BookRAG against several strong baseline methods, with an in-depth comparison of their efficiency and accuracy on document QA tasks.", + "title_level": -1 + }, + "summary": "BookRAG was evaluated against strong baseline methods, demonstrating a detailed comparison of its efficiency and accuracy in document question-answering tasks." 
+ }, + { + "index_id": 138, + "parent_id": 136, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 138, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "6.1 Setup", + "title_level": 2 + }, + "summary": "Section 6.1 outlines the experimental setup by defining the benchmark datasets and evaluation metrics, describing the baseline RAG architectures, and specifying the unified model configurations and implementation details used to ensure fair performance comparison." + }, + { + "index_id": 139, + "parent_id": 138, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 139, + "pdf_para_block": { + "docling_label": "caption" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Table 4: Datasets used in our experiments. EM and F1 denote Exact Match and F1-score, respectively.", + "title_level": -1 + }, + "summary": "Table 4 lists the datasets employed in the experiments, utilizing Exact Match (EM) and F1-score as the primary evaluation metrics." + }, + { + "index_id": 140, + "parent_id": 138, + "type": "NodeType.TABLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 140, + "pdf_para_block": { + "docling_label": "table" + }, + "img_path": "", + "image_width": 0, + "image_height": 0, + "caption": "cref='#/texts/143'", + "footnote": "", + "table_body": "| Dataset | MMLongBench | M3DocVQA | Qasper |\n|-------------|---------------|------------|--------------|\n| Questions | 669 | 633 | 640 |\n| Documents | 85 | 500 | 192 |\n| Avg. Pages | 42.16 | 8.52 | 10.95 |\n| Avg. 
Images | 25.92 | 3.51 | 3.43 |\n| Tokens | 2,816,155 | 3,553,774 | 2,265,349 |\n| Metrics | EM, F1 | EM, F1 | Accuracy, F1 |", + "content": "cref='#/texts/143'", + "title_level": -1 + }, + "summary": "The table summarizes three long-document benchmark datasets—MMLongBench, M3DocVQA, and Qasper—highlighting their distinct characteristics in scale and evaluation metrics. M3DocVQA contains the highest number of documents (500) and tokens (3.55 million) but the fewest pages per document (8.52), while MMLongBench features the longest average document length (42.16 pages) and the highest image density (25.92 images per document). All three datasets contain over 600 questions each, with MMLongBench and M3DocVQA evaluated using Exact Match (EM) and F1 scores, whereas Qasper utilizes Accuracy and F1." + }, + { + "index_id": 141, + "parent_id": 138, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 141, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Datasets & Question Synthesis. We use three widely adopted benchmarking datasets for complex document QA tasks: MMLongBench [33], M3DocVQA [11], and Qasper [14]. MMLongBench is a comprehensive benchmark designed to evaluate QA capabilities on long-form documents, covering diverse categories such as guidebooks, financial reports, and industry files. M3DocVQA is an open-domain benchmark designed to test RAG systems on a diverse collection of HTML-type documents sourced from Wikipedia pages 1 . Qasper is a QA dataset focused on scientific papers, where questions require retrieving evidence from the entire document. We filtered the datasets to remove documents with low clarity or incoherent structures. 
To address the scarcity of global-level questions in the original benchmarks, we synthesize additional QA pairs by having an LLM generate global questions from selected document elements (e.g., tables or figures). These questions are then answered and meticulously refined by human annotators via an outsourcing process, with this additional QA pairs constituting less than 20% of our final QA pairs. The statistics of these datasets are presented in Table 4.", + "title_level": -1 + }, + "summary": "To evaluate complex document question answering, the study utilizes three benchmark datasets—MMLongBench for long-form documents, M3DocVQA for open-domain HTML content, and Qasper for scientific papers—after filtering for clarity and synthesizing additional global-level questions via LLMs and human refinement to address data scarcity." + }, + { + "index_id": 142, + "parent_id": 138, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 142, + "pdf_para_block": { + "docling_label": "footnote" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "1 https://www.wikipedia.org/", + "title_level": -1 + }, + "summary": "Wikipedia is a free, open-source online encyclopedia that allows anyone to edit its content, serving as a vast, collaboratively maintained repository of knowledge across countless topics." 
+ }, + { + "index_id": 143, + "parent_id": 138, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 143, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "9", + "title_level": -1 + }, + "summary": "The provided input contains only the number \"9\" and lacks sufficient context, data, or descriptive text to generate a meaningful summary or identify a core conclusion." + }, + { + "index_id": 144, + "parent_id": 138, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 144, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Metrics. Weadheretotheofficial metrics specified by each dataset for QA. Our primary evaluation relies on Exact Match (EM), accuracy, and token-based F1-score. To assess efficiency, we also measure time cost and token usage during the response phase. Additionally, for methods including PDF parsing, we also evaluate retrieval recall. To establish the ground truth for this, we manually label the specific PDF blocks (e.g., texts, titles, tables, images, and formulas) required to answer each question. This labeling process is guided by the metadata of ground-truth evidence provided in each dataset; we filter candidate blocks using the given modality (all datasets), page numbers (MMLongBench), and evidence statements (Qasper). Any blocks that remained non-unique after this filtering process are manually annotated. 
In cases where a PDF parsing error made the ground-truth item unavailable, the retrieval recall for that query is recorded as 0.", + "title_level": -1 + }, + "summary": "Evaluation relies on standard QA metrics (Exact Match, accuracy, and token-based F1-score) alongside efficiency measures (time cost and token usage), with retrieval recall specifically assessed for PDF parsing methods. Ground truth for recall is established by manually labeling essential PDF blocks (text, tables, images, etc.) using dataset metadata and filtering criteria, with manual annotation applied to non-unique cases; any retrieval failure due to parsing errors is recorded as a recall score of zero." + }, + { + "index_id": 145, + "parent_id": 138, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 145, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Baselines. Our experiments consider three model configurations:", + "title_level": -1 + }, + "summary": "The experiments evaluate three distinct model configurations to establish performance baselines." + }, + { + "index_id": 146, + "parent_id": 138, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 146, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Conventional RAG: These methods are the most common pipeline for document analysis, where the raw text is first extracted and then chunked into segments of a specified size. We select strong and widely used retrieval models: BM25 [44] and Vanilla RAG. 
We also implement Layout+Vanilla, a variant that uses document layout analysis for semantic chunking.", + "title_level": -1 + }, + "summary": "Conventional Retrieval-Augmented Generation (RAG) pipelines typically extract raw text, segment it into fixed-size chunks, and employ standard retrieval models like BM25 or Vanilla RAG, with a specialized variant known as Layout+Vanilla enhancing this process by utilizing document layout analysis for more effective semantic chunking." + }, + { + "index_id": 147, + "parent_id": 138, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 147, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Graph-based RAG: These methods first extract textual content from documents and then leverage graph data during retrieval. We select RAPTOR [45] and GraphRAG [16]. Specifically, GraphRAG has two versions: GraphRAG-Global and GraphRAG-Local, which employ global and local search methods, respectively.", + "title_level": -1 + }, + "summary": "Graph-based Retrieval-Augmented Generation (RAG) methods enhance document retrieval by first extracting text and then utilizing graph structures, with prominent examples including RAPTOR and two variants of GraphRAG: GraphRAG-Global, which employs global search, and GraphRAG-Local, which utilizes local search." 
+ }, + { + "index_id": 148, + "parent_id": 138, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 148, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· LayoutsegmentedRAG: This category encompasses methods that utilize layout analysis to segment document content into discrete structural units. We include: MM-Vanilla, which utilizes multi-modal embeddings for visual and textual content; a tree-based method inspired by PageIndex [39], denoted as TreeTraverse, where an LLM navigates the document's tree structure; DocETL [47], a declarative system for complex document processing; and GraphRanker, a graphbased method extended from HippoRAG [19] that applies Personalized PageRank [20] to rank the relevant nodes.", + "title_level": -1 + }, + "summary": "LayoutsegmentedRAG is a category of retrieval-augmented generation methods that leverage layout analysis to divide documents into discrete structural units for processing. Key approaches within this category include MM-Vanilla, which uses multi-modal embeddings for visual and textual data; TreeTraverse, an LLM-driven method navigating document tree structures; DocETL, a declarative system for complex processing; and GraphRanker, which extends HippoRAG by applying Personalized PageRank to rank relevant nodes." + }, + { + "index_id": 149, + "parent_id": 138, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 149, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Implementation details. 
For a fair comparison, both BookRAG and all baseline methods are powered by a unified set of state-of-theart (SOTA) and widely adopted backbone models from the Qwen family [4, 60, 63, 64]. We employ MinerU [52] for robust document layout parsing. We set the threshold of gradient 𝑔 as 0 6, and more . details are provided in the appendix of our technical report [57]. Our source code, prompts, and detailed configurations are available at github.com/sam234990/BookRAG.", + "title_level": -1 + }, + "summary": "To ensure a fair comparison, BookRAG and all baseline methods utilize unified Qwen family backbone models and MinerU for document layout parsing, with a gradient threshold set at 0.6; full implementation details, source code, and prompts are publicly available on GitHub." + }, + { + "index_id": 150, + "parent_id": 136, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 150, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "6.2 Overall results", + "title_level": 2 + }, + "summary": "Section 6.2 presents the overall results demonstrating that BookRAG achieves state-of-the-art performance in complex question answering, retrieval effectiveness, and query efficiency across multiple benchmarks by synergizing its unified Tree-Graph BookIndex with agent-based planning." 
+ }, + { + "index_id": 151, + "parent_id": 150, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 151, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "In this section, we present a comprehensive evaluation of BookRAG, analyzing its complex QA performance, retrieval effectiveness, and query efficiency compared to state-of-the-art baselines.", + "title_level": -1 + }, + "summary": "BookRAG demonstrates superior performance over state-of-the-art baselines across complex question answering, retrieval effectiveness, and query efficiency." + }, + { + "index_id": 152, + "parent_id": 150, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 8, + "page_path": null, + "pdf_id": 152, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· QA Performance of BookRAG . We compare the QA performance of BookRAG against three categories of baselines, as shown in Table 5. The results indicate that BookRAG achieves state-of-the-art performance across all datasets, substantially outperforming the top-performing baseline by 18.0% in Exact Match on M3DocVQA. Layout + Vanilla consistently outperforms Vanilla RAG, confirming that layout parsing preserves essential structural information for better retrieval. Besides, the suboptimal results of Tree-Traverse and GraphRanker highlight the limitations of relying solely on hierarchical navigation or graph-based reasoning, which often miss cross-sectional context or drift into irrelevant scopes. In contrast, BookRAG's superiority stems from the synergy of its unified Tree-Graph BookIndex and Agent-based Planning. 
By effectively classifying queries and configuring optimal workflows, our BookRAG overcomes limitations of context fragmentation and static query workflow within existing baselines, ensuring precise evidence retrieval and accurate generation.", + "title_level": -1 + }, + "summary": "BookRAG achieves state-of-the-art QA performance across all tested datasets, outperforming the top baseline by 18.0% in Exact Match on M3DocVQA. This superiority is driven by its unified Tree-Graph BookIndex and Agent-based Planning, which synergize to classify queries and configure optimal workflows, thereby overcoming the context fragmentation and static limitations of existing methods. While layout parsing proves essential for preserving structural information compared to vanilla RAG, approaches relying solely on hierarchical navigation or graph-based reasoning fall short due to missed cross-sectional context. Ultimately, BookRAG ensures precise evidence retrieval and accurate generation by effectively integrating these advanced indexing and planning capabilities." + }, + { + "index_id": 153, + "parent_id": 150, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 9, + "page_path": null, + "pdf_id": 153, + "pdf_para_block": { + "docling_label": "caption" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Table 5: Performance comparison of different methods across various datasets for solving complex document QA tasks. The best and second-best results are marked in bold and underlined, respectively.", + "title_level": -1 + }, + "summary": "Table 5 presents a performance comparison of various methods on complex document QA tasks across multiple datasets, highlighting the top two performers in each category with bold and underlined formatting to indicate the best and second-best results, respectively." 
+ }, + { + "index_id": 154, + "parent_id": 150, + "type": "NodeType.TABLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 9, + "page_path": null, + "pdf_id": 154, + "pdf_para_block": { + "docling_label": "table" + }, + "img_path": "", + "image_width": 0, + "image_height": 0, + "caption": "cref='#/texts/156'", + "footnote": "", + "table_body": "| Baseline Type | Method | MMLongBench | MMLongBench | M3DocVQA | M3DocVQA | Qasper | Qasper |\n|----------------------|------------------|---------------|---------------|---------------|------------|------------|------------|\n| | Method | (Exact Match) | (F1-score) | (Exact Match) | (F1-score) | (Accuracy) | (F1-score) |\n| Conventional RAG | BM25 | 18.3 | 20.2 | 34.6 | 37.8 | 38.1 | 42.5 |\n| Conventional RAG | Vanilla RAG | 16.5 | 18.0 | 36.5 | 40.2 | 40.6 | 44.4 |\n| | Layout + Vanilla | 18.1 | 19.8 | 36.9 | 40.2 | 40.7 | 44.6 |\n| Graph-based RAG | RAPTOR | 21.3 | 21.8 | 34.3 | 37.3 | 39.4 | 44.1 |\n| Graph-based RAG | GraphRAG-Local | 7.7 | 8.5 | 23.7 | 25.6 | 35.9 | 39.2 |\n| Graph-based RAG | GraphRAG-Global | 5.3 | 5.6 | 20.2 | 22.0 | 24.0 | 24.1 |\n| Layout segmented RAG | MM-Vanilla | 6.8 | 8.4 | 25.1 | 27.7 | 27.9 | 29.3 |\n| Layout segmented RAG | Tree-Traverse | 12.7 | 14.4 | 33.3 | 36.2 | 27.3 | 32.1 |\n| Layout segmented RAG | GraphRanker | 21.2 | 22.7 | 43.0 | 47.8 | 32.9 | 37.6 |\n| Layout segmented RAG | DocETL | 27.5 | 28.6 | 40.9 | 43.3 | 42.3 | 50.4 |\n| Our proposed | BookRAG | 43.8 | 44.9 | 61.0 | 66.2 | 55.2 | 61.1 |", + "content": "cref='#/texts/156'", + "title_level": -1 + }, + "summary": "BookRAG, the proposed method, significantly outperforms all conventional, graph-based, and layout-segmented RAG baselines across three long-document benchmarks (MMLongBench, M3DocVQA, and Qasper). 
While traditional approaches like BM25 and Vanilla RAG achieve modest scores (e.g., 18.3–40.6 accuracy/F1), and graph-based methods often underperform, BookRAG demonstrates superior capability in handling complex document retrieval and reasoning. Specifically, BookRAG achieves a 43.8 Exact Match and 44.9 F1-score on MMLongBench, 61.0 Exact Match and 66.2 F1-score on M3DocVQA, and 55.2 Accuracy and 61.1 F1-score on Qasper, marking a substantial improvement over the next best performer, DocETL, which trails by roughly 16–20 percentage points in most metrics." + }, + { + "index_id": 155, + "parent_id": 150, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 9, + "page_path": null, + "pdf_id": 155, + "pdf_para_block": { + "docling_label": "caption" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Table 6: Retrieval recall comparison among layout-based methods. The best and second-best results are marked in bold and underlined, respectively.", + "title_level": -1 + }, + "summary": "Layout-based retrieval methods exhibit varying performance levels, with specific approaches achieving the highest recall rates (marked in bold) and the next best results (marked in underlined) as detailed in the comparative analysis." 
+ }, + { + "index_id": 156, + "parent_id": 150, + "type": "NodeType.TABLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 9, + "page_path": null, + "pdf_id": 156, + "pdf_para_block": { + "docling_label": "table" + }, + "img_path": "", + "image_width": 0, + "image_height": 0, + "caption": "cref='#/texts/158'", + "footnote": "", + "table_body": "| Method | MMLongBench | M3DocVQA | Qasper |\n|------------------|---------------|------------|----------|\n| Layout + Vanilla | 26.3 | 33.8 | 33.5 |\n| MM-Vanilla | 7.5 | 19.7 | 14.9 |\n| Tree-Traverse | 11.2 | 19.5 | 14.5 |\n| GraphRanker | 26.4 | 44.5 | 28.6 |\n| BookRAG | 57.6 | 71.2 | 63.5 |", + "content": "cref='#/texts/158'", + "title_level": -1 + }, + "summary": "BookRAG significantly outperforms all other evaluated methods across the MMLongBench, M3DocVQA, and Qasper benchmarks, achieving the highest scores of 57.6, 71.2, and 63.5 respectively. In contrast, baseline approaches like MM-Vanilla and Tree-Traverse yield substantially lower performance, while the Layout + Vanilla and GraphRanker methods show moderate results that remain well below BookRAG's capabilities." + }, + { + "index_id": 157, + "parent_id": 150, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 9, + "page_path": null, + "pdf_id": 157, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Retrieval performance of BookRAG. To validate our retrieval design, we evaluate the retrieval recall of BookRAG against other layout-based baselines on the ground-truth layout blocks. The experimental results demonstrate that BookRAG achieves the highest recall across all datasets, notably reaching 71.2% on M3DocVQA and significantly outperforming the next best baseline (GraphRanker, max44.5%). 
This performance advantage stems from our IFT-inspired Selector → Reasoner workflow: the Agent-based Planning first classifies the query, enabling the Selector to narrow the search to a precise information patch , followed by the Reasoner's analysis. Crucially, after the Skyline_Ranker process, the average number of retained nodes is 9.87, 6.86, and 8.6 across the three datasets, which is comparable to the standard top𝑘 ( 𝑘 = 10) setting, ensuring high-quality retrieval without inflating the candidate size.", + "title_level": -1 + }, + "summary": "BookRAG achieves superior retrieval performance compared to layout-based baselines, reaching a record 71.2% recall on the M3DocVQA dataset—significantly outperforming the next best method (GraphRanker) by over 26 percentage points. This success is driven by an IFT-inspired workflow where an Agent-based Planning module classifies queries to guide a Selector in pinpointing precise information patches, followed by analysis from a Reasoner. Furthermore, the system maintains high-quality retrieval efficiency by retaining an average of fewer than 10 nodes per query (9.87, 6.86, and 8.6 across datasets), matching standard top-10 settings without inflating candidate sizes." + }, + { + "index_id": 158, + "parent_id": 150, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 9, + "page_path": null, + "pdf_id": 158, + "pdf_para_block": { + "docling_label": "caption" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Figure 5: Comparison of query efficiency.", + "title_level": -1 + }, + "summary": "Figure 5 illustrates a comparative analysis of query efficiency, highlighting performance differences across the evaluated methods." 
+ }, + { + "index_id": 159, + "parent_id": 150, + "type": "NodeType.IMAGE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 9, + "page_path": null, + "pdf_id": 159, + "pdf_para_block": { + "docling_label": "picture" + }, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-5.png", + "image_width": 0, + "image_height": 0, + "caption": "cref='#/texts/161'", + "footnote": "", + "table_body": null, + "content": "cref='#/texts/161'", + "title_level": -1 + }, + "summary": "The provided content does not contain any substantive information to summarize, as it consists solely of a placeholder indicating an image with a technical reference code and lacks any descriptive text, data, or context." + }, + { + "index_id": 160, + "parent_id": 150, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 9, + "page_path": null, + "pdf_id": 160, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Efficiency of BookRAG. Wefurther evaluate the efficiency in terms of query time and token consumption, as illustrated in Figure 5. Overall, BookRAG maintains time and token costs comparable to existing Graph-based RAG methods. While purely text-based RAG approaches generally exhibit lower latency and token usage due to the absence of VLM processing for images, BookRAG maintains a balanced efficiency among multi-modal methods. In terms of token usage, BookRAG reduces consumption by an order of magnitude compared to the strongest baseline, DocETL. Notably, on the MMLongBench dataset, DocETL consumes over 53 million tokens, whereas BookRAG requires less than 5 million. 
Regarding the query latency, our method also achieves a speedup of up to 2 × compared to DocETL.", + "title_level": -1 + }, + "summary": "BookRAG achieves a balanced efficiency among multi-modal retrieval methods, maintaining query time and token costs comparable to existing Graph-based RAG approaches while significantly outperforming the strongest baseline, DocETL. Specifically, BookRAG reduces token consumption by an order of magnitude (using less than 5 million tokens versus over 53 million on the MMLongBench dataset) and accelerates query latency by up to 2×, despite the inherent overhead of processing images with Vision-Language Models." + }, + { + "index_id": 161, + "parent_id": 150, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 9, + "page_path": null, + "pdf_id": 161, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "10", + "title_level": -1 + }, + "summary": "The provided input consists solely of the number \"10\" and lacks sufficient context, text, or data to generate a meaningful summary or identify a core subject." + }, + { + "index_id": 162, + "parent_id": 136, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 162, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "6.3 Detailed Analysis", + "title_level": 2 + }, + "summary": "Section 6.3 presents a comprehensive evaluation of the BookRAG system through ablation studies, gradient-based entity resolution analysis, query-type performance comparisons, and error analysis to validate the necessity of each component and demonstrate the architecture's superiority over its variants." 
+ }, + { + "index_id": 163, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 163, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "In this section, we provide a more in-depth examination of our BookRAG. We first conduct an ablation study to validate the contribution of each component, followed by an experiment on the impact of gradient-based ER and QA performance across different query types. Furthermore, we perform a comprehensive error analysis, compare the effectiveness of our entity resolution method, and present a case study.", + "title_level": -1 + }, + "summary": "The section presents a comprehensive evaluation of BookRAG, validating its individual components through an ablation study, analyzing the effects of gradient-based entity resolution and question answering performance across various query types, and supporting these findings with a detailed error analysis, a comparison of entity resolution methods, and a case study." + }, + { + "index_id": 164, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 164, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Ablation study. To evaluate the contribution of each core component in BookRAG, we design several variants by removing specific components:", + "title_level": -1 + }, + "summary": "An ablation study was conducted to evaluate the individual contribution of each core component within the BookRAG system by systematically removing specific elements to assess their impact." 
+ }, + { + "index_id": 165, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 165, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· w/o Gradient ER: Replaces the gradient-based entity resolution with a Basic ER by merging the same-name entities.", + "title_level": -1 + }, + "summary": "The \"w/o Gradient ER\" variant simplifies entity resolution by replacing the gradient-based approach with a basic method that merges entities solely based on identical names." + }, + { + "index_id": 166, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 166, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· w/o Planning: Removes the Agent-based Planning, defaulting to a static, standard workflow for all queries.", + "title_level": -1 + }, + "summary": "Disabling planning removes the Agent-based approach, causing the system to revert to a static, standard workflow for all queries." 
+ }, + { + "index_id": 167, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 167, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· w/o Selector : Removes the Selector operators, forcing Reasoners to score all candidate nodes.", + "title_level": -1 + }, + "summary": "Removing Selector operators forces Reasoners to score all candidate nodes rather than filtering them first." + }, + { + "index_id": 168, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 168, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· w/o Graph_Reasoning : Removes the Graph_Reasoning operator. Consequently, the Skyline_Ranker is also disabled as scoring becomes single-dimensional.", + "title_level": -1 + }, + "summary": "Removing the Graph_Reasoning operator disables the Skyline_Ranker, forcing scoring to become single-dimensional." + }, + { + "index_id": 169, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 169, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· w/o Text_Reasoning : Removes the Text_Reasoning operator. 
Similarly, the Skyline_Ranker is disabled, relying solely on graph-based scores.", + "title_level": -1 + }, + "summary": "Disabling the Text_Reasoning operator and the Skyline_Ranker forces the system to rely exclusively on graph-based scores for ranking." + }, + { + "index_id": 170, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 170, + "pdf_para_block": { + "docling_label": "caption" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Table 7: Comparing the QA performance of different variants of BookRAG. EM and F1 denote Exact Match and F1-score, respectively.", + "title_level": -1 + }, + "summary": "BookRAG variants demonstrate varying QA performance, with results evaluated using Exact Match (EM) and F1-score metrics as detailed in Table 7." + }, + { + "index_id": 171, + "parent_id": 162, + "type": "NodeType.TABLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 171, + "pdf_para_block": { + "docling_label": "table" + }, + "img_path": "", + "image_width": 0, + "image_height": 0, + "caption": "cref='#/texts/220'", + "footnote": "", + "table_body": "| Method variants | MMLongBench | MMLongBench | Qasper | Qasper |\n|---------------------|---------------|---------------|----------|----------|\n| | EM | F1 | Accuracy | F1 |\n| BookRAG (Full) | 43.8 | 44.9 | 55.2 | 61.1 |\n| w/o gradient ER | 40.1 | 42.8 | 48.9 | 57.3 |\n| w/o Planning | 30.8 | 33.2 | 40.9 | 48.5 |\n| w/o Selector | 42.5 | 43.1 | 52.5 | 59.1 |\n| w/o Graph_Reasoning | 39.8 | 41.5 | 51.4 | 58.4 |\n| w/o Text_Reasoning | 39.0 | 40.3 | 47.2 | 52.5 |", + "content": "cref='#/texts/220'", + "title_level": -1 + }, + "summary": "The BookRAG (Full) method achieves the highest performance across all evaluated metrics on the MMLongBench and Qasper 
datasets, demonstrating that its complete architecture is superior to any of its component variants. Specifically, removing the \"Planning\" module causes the most significant performance degradation (dropping MMLongBench EM to 30.8 and Qasper Accuracy to 40.9), indicating it is the most critical component for the system's effectiveness. While other modules like Gradient ER, Selector, Graph Reasoning, and Text Reasoning also contribute positively to the model's accuracy and F1 scores, their individual removal results in less severe performance drops compared to the absence of planning." + }, + { + "index_id": 172, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 172, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "The first variant evaluates the impact of KG quality on retrieval performance. The second and third variants assess the necessity of our Agent-based Planning and IFT-inspired selection mechanism, respectively. Finally, the last two variants validate the effectiveness of our multi-dimensional reasoning and dynamic Skyline filtering strategy. As shown in Table 7, the performance degradation across all variants confirms the essential role of each module in BookRAG. Specifically, the performance drop in the w/o Gradient ER variant highlights the critical role of a high-quality, connectivity-rich KG in supporting effective reasoning. Removing the Planning mechanism results in the most significant performance loss, confirming that a static workflow is insufficient for handling diverse types of queries. 
The w/o Selector variant, while maintaining competitive accuracy, incurs a prohibitive computational cost ( > 2 × tokens on Qasper), validating the efficiency of our IFT-inspired \"narrow-then-reason\" strategy.", + "title_level": -1 + }, + "summary": "The ablation study confirms that every module in BookRAG is essential for optimal performance, as removing any component leads to significant degradation. Specifically, a high-quality Knowledge Graph is critical for effective reasoning, while the Agent-based Planning mechanism is indispensable for handling diverse queries, as static workflows cause the most severe performance drops. Additionally, the IFT-inspired selection strategy proves vital for efficiency, maintaining competitive accuracy while avoiding the prohibitive computational costs associated with its absence." + }, + { + "index_id": 173, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 173, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "11", + "title_level": -1 + }, + "summary": "The provided input consists solely of the number \"11\" and contains no substantive content, context, or data to summarize." + }, + { + "index_id": 174, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 174, + "pdf_para_block": { + "docling_label": "caption" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Figure 6: Comparison of graph statistics. Values are normalized to the Basic setting (Baseline=1.0). Absolute values for Basic are annotated. Note that density values are abbreviated (e.g., 3.6E-3 denotes 3 6 . 
× 10 -3 ).", + "title_level": -1 + }, + "summary": "Figure 6 presents a comparative analysis of graph statistics across different settings, using the \"Basic\" configuration as a normalized baseline of 1.0. The data highlights relative performance changes, with absolute values for the Basic setting provided for reference, while density metrics are expressed in scientific notation (e.g., 3.6E-3) to accommodate small magnitudes." + }, + { + "index_id": 175, + "parent_id": 162, + "type": "NodeType.IMAGE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 175, + "pdf_para_block": { + "docling_label": "picture" + }, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-6.png", + "image_width": 0, + "image_height": 0, + "caption": "cref='#/texts/224'", + "footnote": "", + "table_body": null, + "content": "cref='#/texts/224'", + "title_level": -1 + }, + "summary": "The provided content does not contain sufficient information to generate a summary, as it consists only of a placeholder indicating an image is present along with a reference code, without any actual text, data, or descriptive caption to analyze." + }, + { + "index_id": 176, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 176, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Impact of Gradient-based Entity Resolution. To evaluate the quality of our constructed KG, we compare the graph statistics of our Gradient-based ER against a Basic KG construction. The Basic setting employs simple exact name matching for entity merging, which is standard practice in many graph-based methods. 
Figure 6 presents the comparative results, normalizing the metrics (Entity count, Density, Diameter of the Largest Connected Component, and Number of Connected Components) against the Basic baseline. The results demonstrate that our Gradient-based ER significantly optimizes KG. Specifically, it reduces the number of entities (by 12%) while substantially boosting graph density (by over 20% across datasets). This structural shift indicates that our ER module effectively identifies the same conceptual entities that possess different names. Consequently, the resulting graphs are more compact and cohesive, as evidenced by the reduced diameter and fewer connected components, which mitigates graph fragmentation and facilitates better connectivity for graph reasoning.", + "title_level": -1 + }, + "summary": "Gradient-based Entity Resolution significantly optimizes Knowledge Graph construction by reducing entity counts by 12% and increasing graph density by over 20% compared to standard exact name matching. This approach effectively merges conceptually identical entities with different names, resulting in more compact and cohesive graphs with reduced diameter and fewer disconnected components, thereby mitigating fragmentation and enhancing connectivity for graph reasoning." + }, + { + "index_id": 177, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 177, + "pdf_para_block": { + "docling_label": "caption" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Figure 7: QA performance breakdown by different query types (Single-hop, Multi-hop, and Global). 
The blue bars represent Exact Match (EM) for MMLongBench and Accuracy for Qasper, while the red bars represent the F1-score.", + "title_level": -1 + }, + "summary": "Figure 7 illustrates the performance breakdown of question-answering systems across three query types—Single-hop, Multi-hop, and Global—using MMLongBench and Qasper datasets. The analysis employs Exact Match (EM) and Accuracy metrics, represented by blue bars, alongside F1-score metrics, represented by red bars, to evaluate model effectiveness on varying levels of query complexity." + }, + { + "index_id": 178, + "parent_id": 162, + "type": "NodeType.IMAGE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 178, + "pdf_para_block": { + "docling_label": "picture" + }, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-7.png", + "image_width": 0, + "image_height": 0, + "caption": "cref='#/texts/259'", + "footnote": "", + "table_body": null, + "content": "cref='#/texts/259'", + "title_level": -1 + }, + "summary": "The provided content does not contain any substantive information to summarize, as it consists solely of a placeholder indicating an image with a broken or empty caption reference." + }, + { + "index_id": 179, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 179, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· QA performance under different query types. Figure 7 breaks down the performance of BookRAG across Single-hop, Multihop, and Global aggregation query types. We observe that Multihop queries generally present a greater challenge compared to Single-hop ones, resulting in a slight performance decrease. 
This trend reflects the inherent difficulty of retrieving and reasoning over disjoint pieces of evidence. It further validates our agent-based planning strategy, which handles different query types separately.", + "title_level": -1 + }, + "summary": "BookRAG demonstrates that multihop queries are inherently more challenging than single-hop queries due to the difficulty of retrieving and reasoning over disjoint evidence, leading to a slight performance decrease; this trend validates the effectiveness of its agent-based planning strategy, which is designed to handle different query types separately." + }, + { + "index_id": 180, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 10, + "page_path": null, + "pdf_id": 180, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Error Response analysis. To diagnose the performance bottlenecks of BookRAG, we conduct a fine-grained error analysis on 200 sampled queries from each dataset, tracing the error propagation as shown in Figure 9. We categorize failures into four types:", + "title_level": -1 + }, + "summary": "To diagnose BookRAG's performance bottlenecks, a fine-grained error analysis was conducted on 200 sampled queries per dataset, tracing error propagation and categorizing failures into four distinct types." + }, + { + "index_id": 181, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 11, + "page_path": null, + "pdf_id": 181, + "pdf_para_block": { + "docling_label": "caption" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Figure 8: Case study of responses across different query types from MMLongBench and Qasper. 
CYAN TEXT highlights correct content generated by BookRAG. GRAY TEXT describes the internal process, and marks omitted irrelevant parts.", + "title_level": -1 + }, + "summary": "BookRAG demonstrates superior performance in generating accurate responses for diverse query types on the MMLongBench and Qasper benchmarks, as evidenced by its ability to correctly identify and output relevant content (highlighted in cyan) while effectively filtering out irrelevant internal processing details." + }, + { + "index_id": 182, + "parent_id": 162, + "type": "NodeType.IMAGE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 11, + "page_path": null, + "pdf_id": 182, + "pdf_para_block": { + "docling_label": "picture" + }, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-8.png", + "image_width": 0, + "image_height": 0, + "caption": "cref='#/texts/282'", + "footnote": "", + "table_body": null, + "content": "cref='#/texts/282'", + "title_level": -1 + }, + "summary": "The provided content does not contain any substantive information to summarize, as it consists solely of a placeholder indicating an image with a technical reference code and lacks any descriptive text, data, or context." + }, + { + "index_id": 183, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 11, + "page_path": null, + "pdf_id": 183, + "pdf_para_block": { + "docling_label": "caption" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Figure 9: Error analysis on 200 sampled queries from MMLongBench and Qasper datasets.", + "title_level": -1 + }, + "summary": "Error analysis of 200 sampled queries from the MMLongBench and Qasper datasets reveals specific performance patterns and failure modes, as detailed in Figure 9." 
+ }, + { + "index_id": 184, + "parent_id": 162, + "type": "NodeType.IMAGE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 11, + "page_path": null, + "pdf_id": 184, + "pdf_para_block": { + "docling_label": "picture" + }, + "img_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output/docling/images/BOOKRAG_VLDB_2026_full-picture-9.png", + "image_width": 0, + "image_height": 0, + "caption": "cref='#/texts/348'", + "footnote": "", + "table_body": null, + "content": "cref='#/texts/348'", + "title_level": -1 + }, + "summary": "The provided content consists solely of a placeholder indicating an image with a reference caption, but it contains no actual text, data, or visual information to summarize." + }, + { + "index_id": 185, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 11, + "page_path": null, + "pdf_id": 185, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "PDF Parsing, Plan, Retrieval, and Generation errors. The results identify Retrieval Error as the dominant failure mode, followed by Generation Error, reflecting the persistent challenge of locating and synthesizing multimodal evidence. Regarding Plan Error, our qualitative analysis reveals a specific failure pattern: the planner tends to over-decompose detailed single-hop queries into unnecessary multi-hop sub-tasks. This fragmentation leads to disjointed retrieval paths, effectively preventing the model from synthesizing a cohesive final answer from the scattered sub-responses.", + "title_level": -1 + }, + "summary": "Retrieval Error is the dominant failure mode in PDF parsing, planning, retrieval, and generation tasks, followed by Generation Error, highlighting the ongoing difficulty in locating and synthesizing multimodal evidence. 
Additionally, Plan Errors frequently stem from the planner's tendency to over-decompose simple single-hop queries into unnecessary multi-hop sub-tasks; this fragmentation creates disjointed retrieval paths that prevent the model from effectively synthesizing a cohesive final answer." + }, + { + "index_id": 186, + "parent_id": 162, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 11, + "page_path": null, + "pdf_id": 186, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "· Case study. Figure 8 illustrates BookRAG's answering workflow across Single-hop, Multi-hop, and Global queries. The results demonstrate that by leveraging specific operators ( Select , Decompose , and Filter ), BookRAG effectively prunes search spaces. For example, in the Single-hop case, the reasoning space is significantly reduced from 134 to 24 nodes. This capability allows the system to efficiently isolate relevant evidence from noise, ensuring precise answer generation.", + "title_level": -1 + }, + "summary": "BookRAG effectively prunes search spaces and ensures precise answer generation by leveraging specific operators (Select, Decompose, and Filter) to isolate relevant evidence from noise. As demonstrated in a case study, this approach significantly reduces reasoning complexity—for instance, shrinking the search space from 134 to 24 nodes in single-hop queries—thereby enabling efficient handling of single-hop, multi-hop, and global queries." 
+ }, + { + "index_id": 187, + "parent_id": 1, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 11, + "page_path": null, + "pdf_id": 187, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "7 CONCLUSION", + "title_level": 1 + }, + "summary": "This section concludes the study by summarizing BookRAG's superior performance and outlining future directions for an integrated database system." + }, + { + "index_id": 188, + "parent_id": 187, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 11, + "page_path": null, + "pdf_id": 188, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "In this paper, we propose BookRAG, a novel method built upon Book Index, a document-native, structured Tree-Graph index specifically designed to capture the intricate relations of structural documents. By employing an agent-based method to dynamically configure retrieval and reasoning operators, our approach achieves state-ofthe-art performance on multiple benchmarks, demonstrating significant superiority over existing baselines in both retrieval precision and answer accuracy. 
In the future, we will explore an integrated document-native database system that supports data formatting, knowledge extraction, and intelligent querying.", + "title_level": -1 + }, + "summary": "BookRAG is a novel, agent-based method that leverages a document-native Tree-Graph index called Book Index to capture complex structural relationships, achieving state-of-the-art performance with superior retrieval precision and answer accuracy compared to existing baselines; future work aims to develop an integrated database system supporting data formatting, knowledge extraction, and intelligent querying." + }, + { + "index_id": 189, + "parent_id": 187, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 11, + "page_path": null, + "pdf_id": 189, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "12", + "title_level": -1 + }, + "summary": "The provided content consists solely of the number \"12\" and lacks sufficient context, text, or data to form a meaningful summary or draw any conclusions." + }, + { + "index_id": 190, + "parent_id": 1, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 190, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "REFERENCES", + "title_level": 1 + }, + "summary": "This section compiles a comprehensive bibliography of recent academic and industry research (2020–2025) that advances Large Language Models through innovations in Retrieval-Augmented Generation, graph-based reasoning, multimodal understanding, and domain-specific data processing." 
+ }, + { + "index_id": 191, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 191, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[1] Simran Arora, Brandon Yang, Sabri Eyuboglu, Avanika Narayan, Andrew Hojel, Immanuel Trummer, and Christopher Ré. 2023. Language Models Enable Simple Systems for Generating Structured Views of Heterogeneous Data Lakes. Proceedings of the VLDB Endowment 17, 2 (2023), 92-105.", + "title_level": -1 + }, + "summary": "The 2023 study by Arora et al. demonstrates that large language models (LLMs) can power simple, efficient systems to automatically generate structured views from heterogeneous data lakes, effectively bridging the gap between unstructured data sources and user-friendly analytical interfaces without requiring complex, custom-built pipelines." + }, + { + "index_id": 192, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 192, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[2] Akari Asai, Zeqiu Wu, Yizhong Wang, et al. 2024. Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection. In International Conference on Learning Representations (ICLR) .", + "title_level": -1 + }, + "summary": "The paper \"Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection\" (Asai et al., 2024) introduces a novel framework that enables language models to autonomously decide when to retrieve external information, how to generate responses, and how to critique their own outputs. 
By integrating retrieval, generation, and self-reflection into a single end-to-end trainable system, Self-RAG significantly improves the quality, relevance, and factual accuracy of generated text while reducing hallucinations and unnecessary retrieval, outperforming existing methods that treat these components as separate stages." + }, + { + "index_id": 193, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 193, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[3] Akari Asai, Zeqiu Wu, Yizhong Wang, Avirup Sil, and Hannaneh Hajishirzi. 2023. Self-rag: Learning to retrieve, generate, and critique through self-reflection. arXiv preprint arXiv:2310.11511 (2023).", + "title_level": -1 + }, + "summary": "Self-RAG is a framework that enhances retrieval-augmented generation by enabling models to dynamically retrieve information, generate responses, and critically evaluate their own output through self-reflection, thereby improving accuracy and relevance without requiring external supervision." + }, + { + "index_id": 194, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 194, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[4] Shuai Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Sibo Song, Kai Dang, Peng Wang, Shijie Wang, Jun Tang, et al. 2025. Qwen2.5-vl technical report. 
arXiv preprint arXiv:2502.13923 (2025).", + "title_level": -1 + }, + "summary": "The Qwen2.5-VL technical report (arXiv:2502.13923, 2025) introduces an advanced vision-language model developed by a team led by Shuai Bai and Keqin Chen, detailing its architecture, capabilities, and performance benchmarks as a significant evolution in multimodal AI systems." + }, + { + "index_id": 195, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 195, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[5] Camille Barboule, Benjamin Piwowarski, and Yoan Chabot. 2025. Survey on Question Answering over Visually Rich Documents: Methods, Challenges, and Trends. arXiv preprint arXiv:2501.02235 (2025).", + "title_level": -1 + }, + "summary": "The 2025 survey by Barboule, Piwowarski, and Chot provides a comprehensive overview of Question Answering (QA) over Visually Rich Documents (VRDs), systematically analyzing current methodologies, identifying persistent challenges in processing complex layouts and multimodal data, and outlining emerging trends that are shaping the future of this field." + }, + { + "index_id": 196, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 196, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[6] Yukun Cao, Zengyi Gao, Zhiyang Li, Xike Xie, S. Kevin Zhou, and Jianliang Xu. 2025. LEGO-GraphRAG: Modularizing Graph-Based Retrieval-Augmented Generation for Design Space Exploration. Proc. VLDB Endow. 18, 10 (June 2025), 3269-3283. 
https://doi.org/10.14778/3748191.3748194", + "title_level": -1 + }, + "summary": "LEGO-GraphRAG is a novel framework introduced in a 2025 *Proceedings of the VLDB Endowment* paper that modularizes graph-based retrieval-augmented generation (RAG) to facilitate efficient design space exploration. By decomposing the RAG pipeline into interchangeable components, the approach enables researchers to systematically evaluate and optimize various architectural configurations, thereby accelerating the development of more effective graph-enhanced language models." + }, + { + "index_id": 197, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 197, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[7] Chengliang Chai, Jiajun Li, Yuhao Deng, Yuanhao Zhong, Ye Yuan, Guoren Wang, and Lei Cao. 2025. Doctopus: Budget-aware structural table extraction from unstructured documents. Proceedings of the VLDB Endowment 18, 11 (2025), 3695-3707.", + "title_level": -1 + }, + "summary": "Doctopus is a novel framework introduced in 2025 that enables budget-aware structural table extraction from unstructured documents, addressing the challenge of balancing extraction accuracy with computational resource constraints. By optimizing the trade-off between the cost of processing and the quality of the resulting table structures, the system allows for efficient and scalable table recovery in large-scale document analysis, as demonstrated in their research published in the Proceedings of the VLDB Endowment." 
+ }, + { + "index_id": 198, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 198, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[8] Ilias Chalkidis, Manos Fergadiotis, Prodromos Malakasiotis, Nikolaos Aletras, and Ion Androutsopoulos. 2020. LEGAL-BERT: The muppets straight out of law school. arXiv preprint arXiv:2010.02559 (2020).", + "title_level": -1 + }, + "summary": "The 2020 arXiv preprint \"LEGAL-BERT: The muppets straight out of law school\" by Chalkidis et al. introduces LEGAL-BERT, a domain-specific language model adapted from BERT to better understand legal texts, thereby enhancing performance on legal natural language processing tasks." + }, + { + "index_id": 199, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 199, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[9] Sibei Chen, Yeye He, Weiwei Cui, Ju Fan, Song Ge, Haidong Zhang, Dongmei Zhang, and Surajit Chaudhuri. 2024. Auto-Formula: Recommend Formulas in Spreadsheets using Contrastive Learning for Table Representations. Proceedings of the ACM on Management of Data 2, 3 (2024), 1-27.", + "title_level": -1 + }, + "summary": "The 2024 paper \"Auto-Formula\" introduces a novel approach for recommending spreadsheet formulas by employing contrastive learning to generate robust table representations, thereby significantly improving the accuracy of formula suggestions in data management tasks." 
+ }, + { + "index_id": 200, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 200, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[10] Sibei Chen, Nan Tang, Ju Fan, Xuemi Yan, Chengliang Chai, Guoliang Li, and Xiaoyong Du. 2023. Haipipe: Combining human-generated and machine-generated pipelines for data preparation. Proceedings of the ACM on Management of Data 1, 1 (2023), 1-26.", + "title_level": -1 + }, + "summary": "The 2023 paper \"Haipipe\" introduces a novel data preparation framework that synergistically combines human expertise with machine-generated pipelines to enhance efficiency and accuracy. By integrating human-guided strategies with automated machine learning techniques, Haipipe addresses the limitations of purely manual or fully automated approaches, offering a robust solution for complex data preparation tasks as demonstrated in the Proceedings of the ACM on Management of Data." + }, + { + "index_id": 201, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 201, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[11] Jaemin Cho, Debanjan Mahata, Ozan Irsoy, Yujie He, and Mohit Bansal. 2024. M3docrag: Multi-modal retrieval is what you need for multi-page multidocument understanding. arXiv preprint arXiv:2411.04952 (2024).", + "title_level": -1 + }, + "summary": "The 2024 paper *M3DocRAG* by Cho et al. 
establishes that multi-modal retrieval is essential for effectively understanding complex, multi-page, multi-document scenarios, proposing a framework that integrates visual and textual information to overcome the limitations of text-only approaches in document comprehension." + }, + { + "index_id": 202, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 202, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[12] Vassilis Christophides, Vasilis Efthymiou, Themis Palpanas, George Papadakis, and Kostas Stefanidis. 2020. An overview of end-to-end entity resolution for big data. ACM Computing Surveys (CSUR) 53, 6 (2020), 1-42.", + "title_level": -1 + }, + "summary": "End-to-end entity resolution for big data requires a unified, scalable framework that integrates data cleaning, matching, and merging into a single pipeline to overcome the limitations of traditional multi-stage approaches. This paradigm addresses the unique challenges of massive, heterogeneous datasets by leveraging distributed computing, advanced machine learning techniques, and efficient indexing strategies, ultimately enabling accurate identification of real-world entities across diverse sources while maintaining high performance and scalability." 
+ }, + { + "index_id": 203, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 203, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[13] Gheorghe Comanici, Eric Bieber, Mike Schaekermann, Ice Pasupat, Noveen Sachdeva, Inderjit Dhillon, Marcel Blistein, Ori Ram, Dan Zhang, Evan Rosen, et al. 2025. Gemini 2.5: Pushing the frontier with advanced reasoning, multimodality, long context, and next generation agentic capabilities. arXiv preprint arXiv:2507.06261 (2025).", + "title_level": -1 + }, + "summary": "The 2025 arXiv preprint by Comanici et al. introduces Gemini 2.5, a next-generation AI model that significantly advances the field through enhanced reasoning capabilities, robust multimodality, support for extended context windows, and sophisticated agentic functionalities." + }, + { + "index_id": 204, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 204, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[14] Pradeep Dasigi, Kyle Lo, Iz Beltagy, Arman Cohan, Noah A Smith, and Matt Gardner. 2021. A dataset of information-seeking questions and answers anchored in research papers. arXiv preprint arXiv:2105.03011 (2021).", + "title_level": -1 + }, + "summary": "Dasigi et al. (2021) introduced a novel dataset comprising information-seeking questions and answers that are explicitly anchored in research papers, designed to advance research on question answering within the scientific literature." 
+ }, + { + "index_id": 205, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 205, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[15] Xavier Daull, Patrice Bellot, Emmanuel Bruno, Vincent Martin, and Elisabeth Murisasco. 2023. Complex QA and language models hybrid architectures, Survey. arXiv preprint arXiv:2302.09051 (2023).", + "title_level": -1 + }, + "summary": "The 2023 survey by Daull et al. establishes that hybrid architectures, which integrate Large Language Models (LLMs) with specialized components like retrieval systems, knowledge graphs, or symbolic reasoning engines, represent the most effective approach for solving complex question-answering tasks. While standalone LLMs excel at fluency and general knowledge, they often struggle with multi-hop reasoning, factual accuracy, and handling domain-specific data; consequently, the authors conclude that combining LLMs with external tools or structured methods is essential to overcome these limitations and achieve robust performance in complex QA scenarios." + }, + { + "index_id": 206, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 206, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[16] Darren Edge, Ha Trinh, Newman Cheng, Joshua Bradley, Alex Chao, Apurva Mody, Steven Truitt, and Jonathan Larson. 2024. From local to global: A graph rag approach to query-focused summarization. 
arXiv preprint arXiv:2404.16130 (2024).", + "title_level": -1 + }, + "summary": "The paper \"From local to global: A graph rag approach to query-focused summarization\" (Edge et al., 2024) introduces a novel Graph RAG (Retrieval-Augmented Generation) framework that enhances query-focused summarization by leveraging graph structures to capture both local context and global relationships within data, thereby improving the coherence and accuracy of generated summaries compared to traditional methods." + }, + { + "index_id": 207, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 207, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[17] Yunfan Gao, Yun Xiong, Xinyu Gao, Kangxiang Jia, Jinliu Pan, Yuxi Bi, Yi Dai, Jiawei Sun, and Haofen Wang. 2023. Retrieval-augmented generation for large language models: A survey. arXiv preprint arXiv:2312.10997 (2023).", + "title_level": -1 + }, + "summary": "The 2023 survey by Gao et al. establishes that Retrieval-Augmented Generation (RAG) is a critical paradigm for enhancing Large Language Models (LLMs) by integrating external knowledge retrieval, effectively addressing limitations such as hallucinations, outdated information, and lack of domain-specific expertise while improving factual accuracy and interpretability." + }, + { + "index_id": 208, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 208, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[18] Zirui Guo, Lianghao Xia, Yanhua Yu, Tu Ao, and Chao Huang. 2024. 
LightRAG: Simple and Fast Retrieval-Augmented Generation. arXiv e-prints (2024), arXiv2410.", + "title_level": -1 + }, + "summary": "LightRAG is a 2024 arXiv paper proposing a retrieval-augmented generation (RAG) framework that achieves simplicity and speed by utilizing a lightweight graph-based retrieval mechanism, offering an efficient alternative to traditional RAG systems for enhancing large language model performance." + }, + { + "index_id": 209, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 209, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[19] Bernal Jiménez Gutiérrez, Yiheng Shu, Yu Gu, Michihiro Yasunaga, and Yu Su. 2024. HippoRAG: Neurobiologically Inspired Long-Term Memory for Large Language Models. arXiv preprint arXiv:2405.14831 (2024).", + "title_level": -1 + }, + "summary": "HippoRAG is a novel retrieval-augmented generation framework that enhances large language models' long-term memory by mimicking the hippocampus's neurobiological mechanisms, specifically utilizing pattern separation and completion to efficiently store, retrieve, and integrate vast amounts of information without catastrophic forgetting." + }, + { + "index_id": 210, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 210, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[20] Taher H Haveliwala. 2002. Topic-sensitive pagerank. In Proceedings of the 11th international conference on World Wide Web . 
517-526.", + "title_level": -1 + }, + "summary": "Taher Haveliwala's 2002 paper introduces Topic-Sensitive PageRank, an enhancement to the standard PageRank algorithm that assigns different importance scores to web pages based on the specific topic of the user's query, thereby improving search relevance by tailoring rankings to distinct subject areas rather than relying on a single, global page authority score." + }, + { + "index_id": 211, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 211, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[21] Xiaoxin He, Yijun Tian, Yifei Sun, Nitesh V Chawla, Thomas Laurent, Yann LeCun, Xavier Bresson, and Bryan Hooi. 2024. G-retriever: Retrieval-augmented generation for textual graph understanding and question answering. arXiv preprint arXiv:2402.07630 (2024).", + "title_level": -1 + }, + "summary": "G-retriever is a 2024 framework that enhances textual graph understanding and question answering by integrating retrieval-augmented generation (RAG) with graph data, allowing models to dynamically retrieve and reason over relevant graph structures alongside text." + }, + { + "index_id": 212, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 212, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[22] Yucheng Hu and Yuxing Lu. 2024. Rag and rau: A survey on retrieval-augmented language model in natural language processing. 
arXiv preprint arXiv:2404.19543 (2024).", + "title_level": -1 + }, + "summary": "Hu and Lu (2024) present a comprehensive survey on Retrieval-Augmented Generation (RAG) and Retrieval-Augmented Understanding (RAU), analyzing their transformative impact on Natural Language Processing by integrating external knowledge retrieval with language models to enhance accuracy, reduce hallucinations, and improve performance on complex tasks." + }, + { + "index_id": 213, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 213, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[23] Soyeong Jeong, Jinheon Baek, et al. 2024. Adaptive-RAG: Learning to Adapt Retrieval-Augmented Large Language Models through Question Complexity. arXiv preprint arXiv:2403.14403 (2024).", + "title_level": -1 + }, + "summary": "The paper \"Adaptive-RAG\" (2024) by Soyeong Jeong, Jinheon Baek, et al. introduces a framework that dynamically adjusts the retrieval strategy of Large Language Models based on the complexity of the input question. By learning to distinguish between simple queries, which can be answered with minimal or no external information, and complex queries requiring extensive context, the method optimizes the trade-off between retrieval cost and answer accuracy, outperforming static retrieval-augmented generation approaches." 
+ }, + { + "index_id": 214, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 214, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "13", + "title_level": -1 + }, + "summary": "The provided content consists solely of the number \"13\" and lacks sufficient context, text, or data to form a meaningful summary or draw any conclusions." + }, + { + "index_id": 215, + "parent_id": 190, + "type": "NodeType.TABLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 215, + "pdf_para_block": { + "docling_label": "table" + }, + "img_path": "", + "image_width": 0, + "image_height": 0, + "caption": "", + "footnote": "", + "table_body": "| [24] | Soyeong Jeong, Jinheon Baek, Sukmin Cho, Sung Ju Hwang, and Jong C Park. 2024. Adaptive-rag: Learning to adapt retrieval-augmented large language mod- |\n|--------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| [25] | els through question complexity. arXiv preprint arXiv:2403.14403 (2024). Tengjun Jin, Yuxuan Zhu, and Daniel Kang. 2025. ELT-Bench: An End-to- End Benchmark for Evaluating AI Agents on ELT Pipelines. arXiv preprint |\n| [26] | arXiv:2504.04808 (2025). Geewook Kim, Teakgyu Hong, Moonbin Yim, JeongYeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, and Seunghyun Park. 2022. Ocr-free document understanding transformer. In European Confer- |\n| [27] | ence on Computer Vision . Springer, 498-517. 
Dawei Li, Shu Yang, Zhen Tan, Jae Young Baik, Sukwon Yun, Joseph Lee, Aaron Chacko, Bojian Hou, Duy Duong-Tran, Ying Ding, et al. 2024. DALK: Dynamic Co-Augmentation of LLMs and KG to answer Alzheimer's Disease Questions with Scientific Literature. arXiv preprint arXiv:2405.04819 (2024). |\n| [28] | Guoliang Li, Jiayi Wang, Chenyang Zhang, and Jiannan Wang. 2025. Data+ AI: LLM4Data and Data4LLM. In Companion of the 2025 International Conference on |\n| [29] | Management of Data . 837-843. Yinheng Li, Shaofei Wang, Han Ding, and Hang Chen. 2023. Large language models in finance: A survey. In Proceedings of the fourth ACM international conference on AI in finance . 374-382. |\n| [30] | Zhaodonghui Li, Haitao Yuan, Huiming Wang, Gao Cong, and Lidong Bing. 2025. LLM-R2: A Large Language Model Enhanced Rule-based Rewrite System for Boosting Query Efficiency. Proceedings of the VLDB Endowment 1, 18 (2025), 53-65. |\n| [31] | Haoyu Lu, Wen Liu, Bo Zhang, et al. 2024. DeepSeek-VL: Towards Real-World Vision-Language Understanding. arXiv preprint arXiv:2403.05525 (2024). |\n| [32] | Shengjie Ma, Chengjin Xu, Xuhui Jiang, Muzhi Li, Huaren Qu, Cehao Yang, Jiaxin Mao, and Jian Guo. 2024. Think-on-Graph 2.0: Deep and Faithful Large Language Model Reasoning with Knowledge-guided Retrieval Augmented Generation. arXiv preprint arXiv:2407.10805 (2024). |\n| [33] | Yubo Ma, Yuhang Zang, Liangyu Chen, Meiqi Chen, Yizhu Jiao, Xinze Li, Xinyuan Lu, Ziyu Liu, Yan Ma, Xiaoyi Dong, et al. 2024. Mmlongbench-doc: Benchmarking long-context document understanding with visualizations. Advances in Neural Information Processing Systems 37 (2024), 95963-96010. |\n| [34] | Alex Mallen, Akari Asai, Victor Zhong, Rajarshi Das, Daniel Khashabi, and Hannaneh Hajishirzi. 2022. When not to trust language models: Investigat- ing effectiveness of parametric and non-parametric memories. arXiv preprint arXiv:2212.10511 (2022). 
|\n| [35] | Zan Ahmad Naeem, Mohammad Shahmeer Ahmad, Mohamed Eltabakh, Mourad Ouzzani, and Nan Tang. 2024. RetClean: Retrieval-Based Data Cleaning Using LLMs and Data Lakes. Proceedings of the VLDB Endowment 17, 12 (2024), 4421- 4424. |\n| [36] | Avanika Narayan, Ines Chami, Laurel Orr, and Christopher Ré. 2022. Can Foun- dation Models Wrangle Your Data? Proceedings of the VLDB Endowment 16, 4 (2022), 738-746. |\n| [37] | Yuqi Nie, Yaxuan Kong, Xiaowen Dong, John M Mulvey, H Vincent Poor, Qing- song Wen, and Stefan Zohren. 2024. A Survey of Large Language Models for Financial Applications: Progress, Prospects and Challenges. arXiv preprint |\n| [38] | arXiv:2406.11903 (2024). Arash Dargahi Nobari and Davood Rafiei. 2024. TabulaX: Leveraging Large Language Models for Multi-Class Table Transformations. arXiv preprint arXiv:2411.17110 (2024). |\n| [39] | PageIndex. 2025. PageIndex: Next-Generation Reasoning-based RAG. https: //pageindex.ai/. |\n| [40] | Liana Patel, Siddharth Jha, Melissa Pan, Harshit Gupta, Parth Asawa, Carlos Guestrin, and Matei Zaharia. 2025. Semantic Operators and Their Optimization: Enabling LLM-Based Data Processing with Accuracy Guarantees in LOTUS. |\n| [41] | Proceedings of the VLDB Endowment 18, 11 (2025), 4171-4184. Boci Peng, Yun Zhu, Yongchao Liu, Xiaohe Bo, Haizhou Shi, Chuntao Hong, Yan Zhang, and Siliang Tang. 2024. Graph retrieval-augmented generation: A survey. |\n| [42] | arXiv preprint arXiv:2408.08921 (2024). Peter Pirolli and Stuart Card. 1995. Information foraging in information access environments. In Proceedings of the SIGCHI conference on Human factors in computing systems . 51-58. |\n| [43] | Yichen Qian, Yongyi He, Rong Zhu, Jintao Huang, Zhijian Ma, Haibin Wang, Framework for Data Manipulation with Large Language Models. |\n| | Yaohua Wang, Xiuyu Sun, Defu Lian, Bolin Ding, et al. 2024. UniDM: A Unified Proceedings of Machine Learning and Systems 6 (2024), 465-482. |\n| [44] | Stephen E Robertson and Steve Walker. 1994. 
Some simple effective approxi- mations to the 2-poisson model for probabilistic weighted retrieval. In SIGIR'94: Proceedings of the Seventeenth Annual International ACM-SIGIR Conference on Research and Development in Information Retrieval, organised by Dublin City |\n| [45] | University . Springer, 232-241. Parth Sarthi, Salman Abdullah, Aditi Tuli, Shubh Khanna, Anna Goldie, and Christopher D Manning. 2024. Raptor: Recursive abstractive processing for |\n| | tree-organized retrieval. arXiv preprint arXiv:2401.18059 (2024). |", + "content": "", + "title_level": -1 + }, + "summary": "The provided content is a bibliography of recent academic research (2022–2025) focusing on the integration of Large Language Models (LLMs) with data management, retrieval systems, and domain-specific applications. Key themes include the evolution of Retrieval-Augmented Generation (RAG) through adaptive mechanisms, graph-based reasoning, and recursive processing to handle complex queries and long-context documents. Significant attention is given to data-centric AI, covering LLM-driven data cleaning, transformation, and pipeline evaluation (ELT), as well as specialized applications in finance and healthcare (e.g., Alzheimer's research). Additionally, the list highlights advancements in vision-language understanding, the optimization of semantic operators for accuracy, and foundational studies on information foraging and the reliability of parametric versus non-parametric memory in language models." 
+ }, + { + "index_id": 216, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 12, + "page_path": null, + "pdf_id": 216, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "[46] Timo Schick, Jane Dwivedi-Yu, Roberto Dessì, Roberta Raileanu, Maria Lomeli, Eric Hambro, Luke Zettlemoyer, Nicola Cancedda, and Thomas Scialom. 2024.", + "title_level": -1 + }, + "summary": "The 2024 work by Timo Schick, Jane Dwivedi-Yu, Roberto Dessì, Roberta Raileanu, Maria Lomeli, Eric Hambro, Luke Zettlemoyer, Nicola Cancedda, and Thomas Scialom represents a collaborative research contribution by these authors, though the specific title, methodology, or findings of the study are not provided in the excerpt." + }, + { + "index_id": 217, + "parent_id": 190, + "type": "NodeType.TABLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 13, + "page_path": null, + "pdf_id": 217, + "pdf_para_block": { + "docling_label": "table" + }, + "img_path": "", + "image_width": 0, + "image_height": 0, + "caption": "", + "footnote": "", + "table_body": "| | Toolformer: Language models can teach themselves to use tools. Advances in Neural Information Processing Systems 36 (2024). |\n|------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| [47] | Shreya Shankar, Tristan Chambers, Tarak Shah, Aditya G Parameswaran, and Eugene Wu. 2024. Docetl: Agentic query rewriting and evaluation for complex document processing. arXiv preprint arXiv:2410.12189 (2024). 
|\n| [48] | Shamane Siriwardhana, Rivindu Weerasekera, Elliott Wen, Tharindu Kalu- arachchi, Rajib Rana, and Suranga Nanayakkara. 2023. Improving the domain adaptation of retrieval augmented generation (RAG) models for open domain question answering. Transactions of the Association for Computational Linguistics 11 (2023), 1-17. |\n| [49] | Solutions Review Editors. 2019. 80 Percent of Your Data Will Be Unstructured in Five Years. https://solutionsreview.com/data-management/80-percent-of-your- datawill-be-unstructured-in-five-years/. Accessed: 2023-10-27. |\n| [50] | Zhaoyan Sun, Xuanhe Zhou, and Guoliang Li. 2024. R-Bot: An LLM-based Query |\n| [51] | Rewrite System. arXiv preprint arXiv:2412.01661 (2024). Vincent A Traag, Ludo Waltman, and Nees Jan Van Eck. 2019. From Louvain to Leiden: guaranteeing well-connected communities. Scientific reports 9, 1 (2019), 1-12. |\n| [52] | Bin Wang, Chao Xu, Xiaomeng Zhao, Linke Ouyang, Fan Wu, Zhiyuan Zhao, Rui Xu, Kaiwen Liu, Yuan Qu, Fukai Shang, et al. 2024. Mineru: An open-source solution for precise document content extraction. arXiv preprint arXiv:2409.18839 (2024). |\n| [53] | Jiayi Wang and Guoliang Li. 2025. Aop: Automated and interactive llm pipeline orchestration for answering complex queries. CIDR. |\n| [54] | Peng Wang, Shuai Bai, Sinan Tan, Shijie Wang, Zhihao Fan, Jinze Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, et al. 2024. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024). |\n| [55] | Shu Wang, Yixiang Fang, Yingli Zhou, Xilin Liu, and Yuchi Ma. 2025. ArchRAG: Attributed Community-based Hierarchical Retrieval-Augmented Generation. arXiv preprint arXiv:2502.09891 (2025). |\n| [56] | Shen Wang, Tianlong Xu, Hang Li, Chaoli Zhang, Joleen Liang, Jiliang Tang, Philip S Yu, and Qingsong Wen. 2024. Large language models for education: A survey and outlook. arXiv preprint arXiv:2403.18105 (2024). 
|", + "content": "", + "title_level": -1 + }, + "summary": "The provided content is a bibliography of recent academic and industry research (2019–2025) focused on advancing Large Language Models (LLMs) through tool integration, document processing, and retrieval-augmented generation (RAG). Key themes include enabling models to autonomously use tools (Toolformer), developing agentic systems for complex query rewriting and document extraction (Docetl, R-Bot, Mineru), and improving domain adaptation for open-domain question answering. The collection also highlights advancements in vision-language models (Qwen2-vl), hierarchical RAG architectures (ArchRAG), automated pipeline orchestration (Aop), and the growing prevalence of unstructured data, alongside foundational work in community detection and educational applications of LLMs." + }, + { + "index_id": 218, + "parent_id": 190, + "type": "NodeType.TABLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 13, + "page_path": null, + "pdf_id": 218, + "pdf_para_block": { + "docling_label": "table" + }, + "img_path": "", + "image_width": 0, + "image_height": 0, + "caption": "", + "footnote": "", + "table_body": "| [57] | Shu Wang, Yingli Zhou, and Yixiang Fang. [n. d.]. BookRAG: A Hierarchical Structure-aware Index-based Approach for Complex Document Question An- swering. https://github.com/sam234990/BookRAG. |\n|--------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| [58] | Yu Wang, Nedim Lipka, Ryan A Rossi, Alexa Siu, Ruiyi Zhang, and Tyler Derr. 2024. Knowledge graph prompting for multi-document question answering. In Proceedings of the AAAI Conference on Artificial Intelligence , Vol. 38. 19206-19214. |\n| [59] | Shi-Qi Yan, Jia-Chen Gu, Yun Zhu, and Zhen-Hua Ling. 
2024. Corrective Retrieval Augmented Generation. arXiv preprint arXiv:2401.15884 (2024). |\n| [60] | An Yang, Anfeng Li, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Gao, Chengen Huang, Chenxu Lv, et al. 2025. Qwen3 technical report. arXiv preprint arXiv:2505.09388 (2025). |\n| [61] | Murong Yue. 2025. A survey of large language model agents for question an- swering. arXiv preprint arXiv:2503.19213 (2025). |\n| [62] | Qinggang Zhang, Shengyuan Chen, Yuanchen Bei, Zheng Yuan, Huachi Zhou, Zijin Hong, Hao Chen, Yilin Xiao, Chuang Zhou, Junnan Dong, et al. 2025. A survey of graph retrieval-augmented generation for customized large language models. arXiv preprint arXiv:2501.13958 (2025). |\n| [63] | Xin Zhang, Yanzhao Zhang, Wen Xie, Mingxin Li, Ziqi Dai, Dingkun Long, Pengjun Xie, Meishan Zhang, Wenjie Li, and Min Zhang. 2024. GME: Im- proving Universal Multimodal Retrieval by Multimodal LLMs. arXiv preprint |\n| [64] | Yanzhao Zhang, Mingxin Li, Dingkun Long, Xin Zhang, Huan Lin, Baosong Yang, Pengjun Xie, An Yang, Dayiheng Liu, Junyang Lin, et al. 2025. Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models. arXiv preprint arXiv:2506.05176 (2025). |\n| [65] | Wayne Xin Zhao, Kun Zhou, Junyi Li, Tianyi Tang, Xiaolei Wang, Yupeng Hou, Yingqian Min, Beichen Zhang, Junjie Zhang, Zican Dong, et al. 2023. A survey of large language models. arXiv preprint arXiv:2303.18223 1, 2 (2023). |\n| [66] | Yingli Zhou, Yaodong Su, Youran Sun, Shu Wang, Taotao Wang, Runyuan He, Yongwei Zhang, Sicong Liang, Xilin Liu, Yuchi Ma, et al. 2025. In-depth Analysis of Graph-based RAG in a Unified Framework. arXiv preprint arXiv:2503.04338 (2025). |\n| [67] | Yutao Zhu, Huaying Yuan, Shuting Wang, Jiongnan Liu, Wenhan Liu, Chenlong Deng, Haonan Chen, Zheng Liu, Zhicheng Dou, and Ji-Rong Wen. 2023. Large language models for information retrieval: A survey. ACM Transactions on Information Systems (2023). 
|", + "content": "", + "title_level": -1 + }, + "summary": "The provided bibliography highlights a rapidly evolving research landscape focused on enhancing Large Language Models (LLMs) for complex question answering through advanced retrieval and structural techniques. Key themes include the integration of **Graph Retrieval-Augmented Generation (Graph RAG)** to leverage hierarchical document structures and knowledge graphs for multi-document reasoning, as well as the development of specialized **embedding models** (e.g., Qwen3 Embedding) to improve multimodal and universal retrieval capabilities. The collection also features comprehensive surveys on LLM agents, corrective retrieval mechanisms, and the broader application of LLMs in information retrieval, alongside technical reports on next-generation models like Qwen3, indicating a strong industry and academic shift toward more structured, accurate, and context-aware AI systems." + }, + { + "index_id": 219, + "parent_id": 190, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 13, + "page_path": null, + "pdf_id": 219, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "14", + "title_level": -1 + }, + "summary": "The provided content consists solely of the number \"14\" and lacks any descriptive text, context, or data necessary to form an informative summary." 
+ }, + { + "index_id": 220, + "parent_id": 1, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 220, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "A EXPERIMENTAL DETAILS", + "title_level": 1 + }, + "summary": "This section details the experimental framework of BookRAG by defining evaluation metrics, outlining answer extraction and normalization procedures, specifying system prompts for query processing, and documenting the hardware and software configurations used to ensure reproducible RAG model assessment." + }, + { + "index_id": 221, + "parent_id": 220, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 221, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "A.1 Evaluation Metrics", + "title_level": 2 + }, + "summary": "This section defines the evaluation metrics and outlines the answer extraction and normalization procedures required to accurately compare RAG model outputs against ground truth labels." 
+ }, + { + "index_id": 222, + "parent_id": 221, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 222, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "In this section, we provide the detailed definitions and calculation procedures for the metrics used in our main experiments.", + "title_level": -1 + }, + "summary": "This section defines the specific metrics and outlines the calculation procedures employed in the main experiments." + }, + { + "index_id": 223, + "parent_id": 221, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 223, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "A.1.1 Answer Extraction and Normalization. Standard RAG models typically generate free-form natural language responses, which may contain extraneous conversational text (e.g., 'The answer is...'). Directly comparing these raw outputs with concise ground truth labels (e.g., 'Option A' or '12.5') can lead to false negatives.", + "title_level": -1 + }, + "summary": "Standard RAG models often produce verbose, conversational responses that hinder accurate evaluation against concise ground truth labels; therefore, answer extraction and normalization are essential to strip extraneous text and enable reliable comparison." 
+ }, + { + "index_id": 224, + "parent_id": 221, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 224, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Following official evaluation protocols, we employ an LLM-based extraction step to align the model output with the ground truth format before calculation. Let 𝑦 𝑟𝑎𝑤 denote the raw response generated by the RAG system and 𝑦 𝑔𝑜𝑙𝑑 denote the ground truth. We define the extracted answer ˆ as: 𝑦 ˆ 𝑦 = LLMextract ( 𝑦 𝑟𝑎𝑤 , Instruction ) (16) where LLMextract extracts the key information (e.g., the key entity for span extraction) from 𝑦 𝑟𝑎𝑤 . We further apply standard normalization N(·) (e.g., lowercasing, removing punctuation) to both ˆ 𝑦 and 𝑦 𝑔𝑜𝑙𝑑 .", + "title_level": -1 + }, + "summary": "To ensure accurate evaluation of RAG systems, raw model outputs are first processed by an LLM to extract key information into a standardized format, after which both the extracted answer and the ground truth undergo normalization (such as lowercasing and punctuation removal) to enable precise comparison." 
+ }, + { + "index_id": 225, + "parent_id": 221, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 225, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "ˆ 𝑦 = LLMextract ( 𝑦 𝑟𝑎𝑤 , Instruction ) (16)", + "title_level": -1 + }, + "summary": "The equation $y = \\text{LLMextract}(y_{\\text{raw}}, \\text{Instruction})$ defines a process where a Large Language Model (LLM) transforms raw input data ($y_{\\text{raw}}$) into a structured or refined output ($y$) by strictly adhering to a provided instruction." + }, + { + "index_id": 226, + "parent_id": 220, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 226, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "A.1.2 QA Performance Metrics. Based on the ground truth 𝑦 𝑔𝑜𝑙𝑑 and the model's response (either raw 𝑦 𝑟𝑎𝑤 or extracted ˆ), we com𝑦 pute the following metrics: Accuracy = 1 𝑁 𝑁 ∑︁ 𝑖 = 1 I (N( 𝑦 𝑔𝑜𝑙𝑑,𝑖 ) ⊆ N( 𝑦 𝑟𝑎𝑤,𝑖 )) (17) where ⊆ denotes the substring inclusion relation.", + "title_level": 2 + }, + "summary": "Section A.1.2 defines the QA performance metrics, specifically detailing the calculation of accuracy via substring inclusion, exact match for strict character-level agreement, and F1-score for token-based span extraction." 
+ }, + { + "index_id": 227, + "parent_id": 226, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 227, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Accuracy (Inclusion-based). Following prior works [3, 34, 46], we utilize accuracy as a soft-match metric. We consider a prediction correct if the normalized gold answer is included in the model's generated response, rather than requiring a strict exact match. This accounts for the uncontrollable nature of LLM generation.", + "title_level": -1 + }, + "summary": "Accuracy is evaluated using an inclusion-based soft-match metric, where a prediction is deemed correct if the normalized gold answer appears within the model's generated response, accommodating the inherent variability of LLM outputs rather than requiring strict exact matches." + }, + { + "index_id": 228, + "parent_id": 226, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 228, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Accuracy = 1 𝑁 𝑁 ∑︁ 𝑖 = 1 I (N( 𝑦 𝑔𝑜𝑙𝑑,𝑖 ) ⊆ N( 𝑦 𝑟𝑎𝑤,𝑖 )) (17)", + "title_level": -1 + }, + "summary": "Accuracy is defined as the proportion of instances where the set of normalized gold-standard labels is a subset of the set of normalized raw model predictions, calculated by summing the indicator function for this subset relationship across all $N$ instances and dividing by the total count." 
+ }, + { + "index_id": 229, + "parent_id": 226, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 229, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Exact Match (EM).. Unlike accuracy, Exact Match is a strict metric. It measures whether the normalized extracted answer ˆ is character𝑦 for-character identical to the ground truth.", + "title_level": -1 + }, + "summary": "Exact Match (EM) is a strict evaluation metric that determines correctness by checking if the normalized extracted answer is character-for-character identical to the ground truth, distinguishing it from more lenient measures like accuracy." + }, + { + "index_id": 230, + "parent_id": 226, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 230, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "EM = 1 𝑁 𝑁 ∑︁ 𝑖 = 1 I (N( ˆ 𝑦 𝑖 ) = N( 𝑦 𝑔𝑜𝑙𝑑,𝑖 )) (18)", + "title_level": -1 + }, + "summary": "The Equation (18) defines the Exact Match (EM) metric as the proportion of instances where the predicted label $\\hat{y}_i$ perfectly matches the ground truth label $y_{gold,i}$ across a dataset of size $N$, serving as a strict evaluation measure that awards a score of 1 only for exact label agreement and 0 otherwise." 
+ }, + { + "index_id": 231, + "parent_id": 226, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 231, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "F1-score. For questions requiring text span answers, we utilize the token-level F1-score between the extracted answer ˆ and the 𝑦 ground truth 𝑦 𝑔𝑜𝑙𝑑 . Treating them as bags of tokens 𝑇 ˆ 𝑦 and 𝑇 𝑔𝑜𝑙𝑑 : 𝑃 = | 𝑇 ˆ 𝑦 ∩ 𝑇 𝑔𝑜𝑙𝑑 | | 𝑇 ˆ 𝑦 | , 𝑅 = | 𝑇 ˆ 𝑦 ∩ 𝑇 𝑔𝑜𝑙𝑑 | | 𝑇 𝑔𝑜𝑙𝑑 | , F1 = 2 · 𝑃 · 𝑅 𝑃 + 𝑅 (19)", + "title_level": -1 + }, + "summary": "For text span extraction tasks, the F1-score is calculated by treating both the predicted answer and the ground truth as bags of tokens; precision and recall are derived from the intersection of these token sets, and the final F1-score is the harmonic mean of these two metrics." + }, + { + "index_id": 232, + "parent_id": 226, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 232, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "𝑃 = | 𝑇 ˆ 𝑦 ∩ 𝑇 𝑔𝑜𝑙𝑑 | | 𝑇 ˆ 𝑦 | , 𝑅 = | 𝑇 ˆ 𝑦 ∩ 𝑇 𝑔𝑜𝑙𝑑 | | 𝑇 𝑔𝑜𝑙𝑑 | , F1 = 2 · 𝑃 · 𝑅 𝑃 + 𝑅 (19)", + "title_level": -1 + }, + "summary": "The provided formulas define three key evaluation metrics for classification performance: Precision ($P$) measures the proportion of predicted positive instances that are actually correct; Recall ($R$) measures the proportion of actual positive instances that were successfully identified; and the F1 score ($F1$) serves as the harmonic mean of Precision and Recall, providing a single balanced metric that accounts for both false positives and false negatives." 
+ }, + { + "index_id": 233, + "parent_id": 226, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 233, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "15", + "title_level": -1 + }, + "summary": "The provided content consists solely of the number \"15\" and lacks sufficient context, text, or data to form a meaningful summary or draw any conclusions." + }, + { + "index_id": 234, + "parent_id": 220, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 234, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "A.1.3 Retrieval Recall. As described in the main text, we evaluate retrieval quality based on the granularity of parsed PDF blocks (e.g., paragraphs, tables, images). For a given query 𝑞 , let B 𝑔𝑜𝑙𝑑 be the set of manually labeled ground-truth blocks required to answer 𝑞 , and B 𝑟𝑒𝑡 be the set of unique blocks retrieved by the system. The Retrieval Recall is defined as: Recall 𝑟𝑒𝑡 = ( 0 if parsing error occurs on B 𝑔𝑜𝑙𝑑 | B 𝑟𝑒𝑡 ∩B 𝑔𝑜𝑙𝑑 | | B 𝑔𝑜𝑙𝑑 | otherwise (20)", + "title_level": 2 + }, + "summary": "This section defines the Retrieval Recall metric, which penalizes systems with a score of zero if any ground-truth PDF block is lost during parsing, otherwise calculating recall as the ratio of retrieved gold-standard blocks to the total required." 
+ }, + { + "index_id": 235, + "parent_id": 234, + "type": "NodeType.EQUATION", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 235, + "pdf_para_block": { + "docling_label": "formula" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Recall 𝑟𝑒𝑡 = ( 0 if parsing error occurs on B 𝑔𝑜𝑙𝑑 | B 𝑟𝑒𝑡 ∩B 𝑔𝑜𝑙𝑑 | | B 𝑔𝑜𝑙𝑑 | otherwise (20)", + "title_level": -1 + }, + "summary": "The recall metric ($r_{et}$) is defined as zero if a parsing error occurs on the gold standard set, and as the ratio of correctly parsed items to the total gold standard items otherwise." + }, + { + "index_id": 236, + "parent_id": 234, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 236, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Specifically, if a ground-truth block is lost due to PDF parsing failures (i.e., it does not exist in the candidate pool), it is considered strictly unretrievable, resulting in a recall contribution of 0 for that specific block.", + "title_level": -1 + }, + "summary": "If a ground-truth block is lost during PDF parsing and consequently absent from the candidate pool, it is deemed strictly unretrievable, resulting in a recall contribution of zero for that specific block." 
+ }, + { + "index_id": 237, + "parent_id": 220, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 237, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "A.2 Implementation details", + "title_level": 2 + }, + "summary": "This section details the implementation framework of BookRAG, specifying the Python-based architecture, model selection rationale, experimental hardware configuration, and standardized evaluation protocols to ensure reproducibility." + }, + { + "index_id": 238, + "parent_id": 237, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 238, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Weimplement BookRAG in Python, utilizing MinerU [52] for robust document layout parsing. For a fair comparison, both BookRAG and all baseline methods are powered by a unified set of state-of-theart (SOTA) and widely adopted backbone models from the Qwen family [4, 60, 63, 64], including LLM, vision-language model (VLM), and embedding models. Specifically, we utilize Qwen3-8B [60] as the default LLM, Qwen2.5VL-30B [4] as the vision-language model (VLM), Qwen3-Embedding-0.6B [64] for text embedding, gme-Qwen2-VL-2B-Instruct [63] for multi-modal embedding, and Qwen3-Reranker-4B [64] for reranking. We primarily select models under the 10B parameter scale to balance efficiency and effectiveness. However, for the VLM, we adopt the 30B version, as the 8B counterpart exhibited significant performance deficits, frequently failing to answer correctly even when provided with ground-truth images. 
All experiments were conducted on a Linux operating system running on a high-performance server equipped with an Intel Xeon 2.0GHz CPU, 1024GB of memory, and 8 NVIDIA GeForce RTX A5000 GPUs, each with 24 GB of VRAM. Specifically, to ensure a fair comparison of efficiency, all methods were executed serially, and the reported time costs reflect this sequential processing mode. For methods involving document chunking and retrieval ranking, we standardize the chunk size at 500 tokens and set the retrieval top𝑘 to 10 to ensure consistent candidate pool sizes across baselines. For further reproducibility, our source code and detailed implementation configurations are publicly available at our repository: https://github.com/sam234990/BookRAG.", + "title_level": -1 + }, + "summary": "BookRAG is implemented in Python using MinerU for document parsing and evaluated against baselines using a unified suite of Qwen family models (LLM, VLM, embedding, and reranking) to ensure fair comparison. While most components utilize models under 10B parameters to balance efficiency and effectiveness, the 30B Qwen2.5VL vision-language model was selected over the 8B version due to the latter's significant performance failures. Experiments were conducted on a high-performance Linux server with 8 NVIDIA RTX A5000 GPUs, employing serial execution to accurately measure time costs, standardized chunking (500 tokens), and retrieval top-k settings (10) for consistency. The full source code and implementation details are publicly available for reproducibility." 
+ }, + { + "index_id": 239, + "parent_id": 220, + "type": "NodeType.TITLE", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 239, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "A.3 Prompts", + "title_level": 2 + }, + "summary": "Section A.3 presents the specific system prompts designed to guide the four critical stages of query processing: agent-based classification, question decomposition, filter operator generation, and entity resolution judgment." + }, + { + "index_id": 240, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 14, + "page_path": null, + "pdf_id": 240, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Specifically, we present the prompts designed for agent-based query classification (Figure 10), question decomposition (Figure 11), and filter operator generation (Figure 12). Additionally, we illustrate the prompt employed for entity resolution judgment (Figure 13) during the graph construction phase.", + "title_level": -1 + }, + "summary": "The study presents specific prompts designed to facilitate four critical stages of the system: agent-based query classification, question decomposition, filter operator generation, and entity resolution judgment during graph construction." 
+ }, + { + "index_id": 241, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 15, + "page_path": null, + "pdf_id": 241, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "You are an expert query analyzer. Your only task is to classify the user's question into one of three categories: \"simple\", \"complex\", or \"global\". Respond only with the specified JSON object.", + "title_level": -1 + }, + "summary": "The user's query is a system instruction directing an AI to act as an expert query analyzer that classifies questions into \"simple,\" \"complex,\" or \"global\" categories, requiring the output to be strictly a JSON object containing only the classification result." + }, + { + "index_id": 242, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 15, + "page_path": null, + "pdf_id": 242, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Category Definitions:", + "title_level": -1 + }, + "summary": "The provided content is incomplete as it only lists the heading \"Category Definitions\" without supplying the actual definitions, categories, or data required to generate a summary." + }, + { + "index_id": 243, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 15, + "page_path": null, + "pdf_id": 243, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "1. 
single-hop: The question can be fully answered by retrieving information from a SINGLE, contiguous location in the document (e.g., one specific paragraph, one complete table, or one figure).", + "title_level": -1 + }, + "summary": "A single-hop question is one that can be fully answered by retrieving information from a single, contiguous location within a document, such as a specific paragraph, table, or figure." + }, + { + "index_id": 244, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 15, + "page_path": null, + "pdf_id": 244, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "-This includes questions that require reasoning or comparison, as long as all the necessary data is present within that single retrieved location.", + "title_level": -1 + }, + "summary": "Questions requiring reasoning or comparison are valid as long as all necessary data to answer them is contained within a single retrieved location." + }, + { + "index_id": 245, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 15, + "page_path": null, + "pdf_id": 245, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "-Example: \"What is the title of Figure 2?\"", + "title_level": -1 + }, + "summary": "The provided content does not contain any substantive information to summarize, as it only consists of an example question (\"What is the title of Figure 2?\") and formatting instructions rather than actual data, text, or an image." 
+ }, + { + "index_id": 246, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 15, + "page_path": null, + "pdf_id": 246, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "-Example: \"How do 5% of the Latinos see economic upward mobility for their children?\" -> This is SIMPLE because the answer can be found by looking at a single chart or paragraph.", + "title_level": -1 + }, + "summary": "The example question regarding how 5% of Latinos view economic upward mobility for their children is classified as simple because its answer can be directly extracted from a single chart or paragraph." + }, + { + "index_id": 247, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 15, + "page_path": null, + "pdf_id": 247, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "2. multi-hop: The question requires decomposition into multiple simple sub-questions, where each sub-question must be answered by a separate retrieval action.", + "title_level": -1 + }, + "summary": "Multi-hop questions require decomposition into multiple simple sub-questions, each necessitating a separate retrieval action to answer." 
+ }, + { + "index_id": 248, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 15, + "page_path": null, + "pdf_id": 248, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "-It often contains a nested or indirect constraint that requires a preliminary step to resolve before the main question can be answered.", + "title_level": -1 + }, + "summary": "The content describes a problem structure featuring a nested or indirect constraint, which necessitates completing a preliminary step to resolve the issue before the main question can be answered." + }, + { + "index_id": 249, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 15, + "page_path": null, + "pdf_id": 249, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "-Example: \"What is the color of the personality vector...?\" -> This is COMPLEX because it requires two separate retrieval actions.", + "title_level": -1 + }, + "summary": "Determining the color of a personality vector is a complex task because it necessitates two distinct retrieval actions." + }, + { + "index_id": 250, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 15, + "page_path": null, + "pdf_id": 250, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "3. 
global: The question requires an aggregation operation (e.g., counting, listing, summarizing) over a set of items that are identified by a clear structural filter.", + "title_level": -1 + }, + "summary": "The \"global\" category defines questions requiring aggregation operations, such as counting, listing, or summarizing, performed on a set of items identified by a clear structural filter." + }, + { + "index_id": 251, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 15, + "page_path": null, + "pdf_id": 251, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "-Example: \"How many tables are in the document?\" -> This is GLOBAL because the process is to filter for all items of type 'table'.", + "title_level": -1 + }, + "summary": "The example illustrates a global query, where the task involves filtering the entire document to count all items of a specific type, such as tables, rather than focusing on a localized section." + }, + { + "index_id": 252, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 15, + "page_path": null, + "pdf_id": 252, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "User Query: query", + "title_level": -1 + }, + "summary": "The provided content consists solely of a placeholder label (\"User Query: query\") and contains no substantive information, data, or context to summarize." 
+ }, + { + "index_id": 253, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 15, + "page_path": null, + "pdf_id": 253, + "pdf_para_block": { + "docling_label": "paragraph" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Figure 10: The prompt for query classification.", + "title_level": -1 + }, + "summary": "Figure 10 presents the specific prompt designed to classify user queries." + }, + { + "index_id": 254, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 15, + "page_path": null, + "pdf_id": 254, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "16", + "title_level": -1 + }, + "summary": "The provided content consists solely of the number \"16\" and lacks sufficient context, text, or data to form a meaningful summary or draw any conclusions." + }, + { + "index_id": 255, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 16, + "page_path": null, + "pdf_id": 255, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "You are a query decomposition expert. You have been given a \"complex\" question. Your task is to break it down into a series of simple, atomic sub-questions and classify each one by type. **Crucial Instructions:** 1. Each ' retrieval ' sub-question MUST be a direct information retrieval task that can be answered independently by looking up a specific fact, number, or value in the document. 2. 
** ' retrieval ' sub-questions MUST NOT depend on the answer of another sub-question.** They should be parallelizable. All logic for combining their results must be placed in a final ' synthesis ' question. 3. A ' synthesis ' question requires comparing, calculating, or combining the answers of the previous ' retrieval ' questions. It does **NOT** require a new lookup in the document. You MUST provide your response in a JSON object with a single key 'sub_questions', which contains a list of objects. Each object must have a 'question' (string) and a 'type' (string: \"retrieval\" or \"synthesis\"). ---EXAMPLE 1 (Correct Decomposition with Independent Lookups) ---Complex Query: \"What is the color of the personality vector in the soft-labled personality embedding matrix that with the highest Receptiviti score for User A2GBIFL43U1LKJ?\" Expected JSON Output: {{ \"sub_questions\": [ {{\"question\": \"What are all the Receptiviti scores for each personality vector for User A2GBIFL43U1LKJ?\", \"type\": \"retrieval\"}}, {{\"question\": \"What is the mapping of personality vectors to their colors in the soft-labled personality embedding matrix?\", \"type\": \"retrieval\"}}, {{\"question\": \"From the gathered scores, identify the personality vector with the highest score, and then find its corresponding color from the vector-to-color mapping.\", \"type\": \"synthesis\"}} ] }} ---END EXAMPLE 1 ------EXAMPLE 2 (Decomposition with retrieval and synthesis steps) ---Complex Query: \"According to the report, which one is greater in population in the survey? 
Foreign born Latinos, or the Latinos interviewed by cellphone?\" Expected JSON Output: {{ \"sub_questions\": [ {{\"question\": \"According to the report, what is the population of foreign born Latinos in the survey?\", \"type\": \"retrieval\"}}, {{\"question\": \"According to the report, what is the population of Latinos interviewed by cellphone in the survey?\", \"type\": \"retrieval\"}}, {{\"question\": \"Which of the two population counts is greater?\", \"type\": \"synthesis\"}} ] }} ---END EXAMPLE 2 --Now, perform the decomposition for the following query. User Query: query", + "title_level": -1 + }, + "summary": "The task requires decomposing a complex query into a JSON-formatted list of atomic sub-questions, strictly categorized as either \"retrieval\" or \"synthesis.\" \"Retrieval\" sub-questions must be independent, parallelizable fact-finding tasks that do not rely on other answers, while the final \"synthesis\" question must combine, compare, or calculate the results of the retrieval steps without requiring new document lookups." + }, + { + "index_id": 256, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 16, + "page_path": null, + "pdf_id": 256, + "pdf_para_block": { + "docling_label": "paragraph" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Figure 11: The prompt for query decomposition.", + "title_level": -1 + }, + "summary": "Figure 11 illustrates the specific prompt designed to facilitate query decomposition, a technique used to break down complex questions into simpler, manageable sub-queries for more effective processing." 
+ }, + { + "index_id": 257, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 16, + "page_path": null, + "pdf_id": 257, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "17", + "title_level": -1 + }, + "summary": "The provided content consists solely of the number \"17\" and lacks sufficient context, text, or data to form a meaningful summary or draw any conclusions." + }, + { + "index_id": 258, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 17, + "page_path": null, + "pdf_id": 258, + "pdf_para_block": { + "docling_label": "code" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "You are a highly specialized AI assistant. Your only function is to analyze a \"Global Query\" and return a single, valid JSON object that specifies both the filtering steps and the final aggregation operation. You MUST NOT output any other text or explanation. ### INSTRUCTIONS \\& DEFINITIONS ### 1. **Filters**: You MUST determine the list of ' filters ' to apply. Even if the filter is for the whole document (e.g., all tables), the ' filters ' list must be present. -' filter_type ' : One of [\"section\", \"image\", \"table\", \"page\"]. -' section ' : Use for structural parts like chapters, sections, appendices, or references. -' image ' : Use for visual elements like figures, images, pictures, or plots. -' table ' : Use for tabular data. -' page ' : Use for specific page numbers or ranges. -' filter_value ' : (Optional) Can be provided for \"section\" (e.g., a section title) or \"page\" (e.g., '3-10' or '5'). **For \"image\" or \"table\", this value MUST be null.** 2. 
**Operation**: Determine the final aggregation operation. -' operation ' : One of [\"COUNT\", \"LIST\", \"SUMMARIZE\", \"ANALYZE\"]. ### EXAMPLES OF YOUR TASK ### User: \"How many figures are in this paper from Page 3 to Page 10?\" Assistant: {{\"filters\": [{{\"filter_type\": \"page\", \"filter_value\": \"3-10\"}}, {{\"filter_type\": \"image\"}}], \"operation\": \"COUNT\"}} User: \"Summarize the discussion about 'data augmentation' in the 'Methodology' section.\" Assistant: {{\"filters\": [{{\"filter_type\": \"section\", \"filter_value\": \"Methodology\"}}], \"operation\": \"SUMMARIZE\"}} User: \"How many chapters are in this report?\" Assistant: {{\"filters\": [{{\"filter_type\": \"section\"}}], \"operation\": \"COUNT\"}} ### YOUR CURRENT TASK ### User: \"{query}\" User Query: query", + "title_level": -1 + }, + "summary": "The system is a specialized AI assistant designed to process user queries by generating a single, valid JSON object containing specific filtering criteria and an aggregation operation, with no additional text or explanation. The JSON structure requires a \"filters\" list, where each filter specifies a type (\"section\", \"image\", \"table\", or \"page\") and an optional value (required for sections and pages, but null for images and tables), and an \"operation\" field set to one of four actions: \"COUNT\", \"LIST\", \"SUMMARIZE\", or \"ANALYZE\"." + }, + { + "index_id": 259, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 17, + "page_path": null, + "pdf_id": 259, + "pdf_para_block": { + "docling_label": "paragraph" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Figure 12: The prompt for Filter operator generation.", + "title_level": -1 + }, + "summary": "Figure 12 presents the specific prompt designed to generate the Filter operator." 
+ }, + { + "index_id": 260, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 17, + "page_path": null, + "pdf_id": 260, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "18", + "title_level": -1 + }, + "summary": "The provided content consists solely of the number \"18\" and lacks sufficient context, narrative, or data to form a meaningful summary or draw any conclusions." + }, + { + "index_id": 261, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 261, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "-Goal-", + "title_level": -1 + }, + "summary": "The provided content contains only a goal header without any actual information, data, or context to summarize." + }, + { + "index_id": 262, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 262, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "You are an expert Entity Resolution Adjudicator. Your task is to determine if a \"New Entity\" refers to the exact same real-world concept as one of the \"Candidate Entities\" provided from a knowledge graph. Your output must be a JSON object containing the ID of the matching candidate (or -1) and a brief explanation for your decision. -ContextYou will be given one \"New Entity\" recently extracted from a text. 
You will also be given a list of \"Candidate Entities\" that are semantically similar, retrieved from an existing knowledge base. Each candidate has a unique ' id ' for you to reference.", + "title_level": -1 + }, + "summary": "The task requires an expert Entity Resolution Adjudicator to determine if a \"New Entity\" extracted from text refers to the same real-world concept as any \"Candidate Entity\" from a knowledge graph. The adjudicator must output a JSON object containing the ID of the matching candidate (or -1 if no match exists) along with a brief explanation of the decision." + }, + { + "index_id": 263, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 263, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "---", + "title_level": -1 + }, + "summary": "No content was provided to summarize. Please supply the text, image, or table you wish to have condensed." + }, + { + "index_id": 264, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 264, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "-Core Task & Rules-", + "title_level": -1 + }, + "summary": "The core task is to condense provided content (text, image, or table) into a self-contained, informative summary that captures key points and stands alone as easily understandable, using any provided title to identify the subject, while strictly beginning with the main conclusion and avoiding any introductory phrases describing the input format." 
+ }, + { + "index_id": 265, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 265, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "1. **Analyze the \"New Entity\"**: Carefully read its name, type, and description to understand what it is.", + "title_level": -1 + }, + "summary": "The \"New Entity\" is a distinct subject defined by its specific name, classification type, and descriptive attributes, which must be carefully analyzed to fully understand its nature and purpose." + }, + { + "index_id": 266, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 266, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "2. **Field-by-Field Adjudication**: To determine a match, you must evaluate each field with a specific focus:", + "title_level": -1 + }, + "summary": "To determine a match, each field must be evaluated individually with a specific focus." + }, + { + "index_id": 267, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 267, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "* ** ' entity_name ' (High Importance):** The names must be extremely similar, a direct abbreviation (e.g., \"LLM\" vs. \"Large Language Model\"), or a well-known alias. 
**If the names represent distinct, parallel concepts (like \"Event Detection\" and \"Named Entity Recognition\"), they are NOT a match, even if their descriptions are very similar.**", + "title_level": -1 + }, + "summary": "Entity names are considered a match only if they are extremely similar, direct abbreviations of one another, or well-known aliases; names representing distinct, parallel concepts are not considered matches regardless of how similar their descriptions may be." + }, + { + "index_id": 268, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 268, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "* ** ' entity_type ' (Medium Importance):** The types do not need to be identical, but they must be closely related and compatible (e.g., ' COMPANY ' and ' ORGANIZATION ' could describe the same entity).", + "title_level": -1 + }, + "summary": "Entity types do not need to be identical but must be closely related and compatible, such as 'COMPANY' and 'ORGANIZATION' describing the same entity." + }, + { + "index_id": 269, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 269, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "* ** ' description ' (Contextual Importance):** The descriptions may differ as they are often extracted from different parts of a document. 
Your task is to look past surface-level text similarity and determine if they fundamentally describe the **same underlying object or concept**.", + "title_level": -1 + }, + "summary": "The core task is to evaluate whether different text descriptions refer to the same underlying object or concept, prioritizing fundamental semantic equivalence over superficial textual similarity, as descriptions often vary due to being extracted from different document sections." + }, + { + "index_id": 270, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 270, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "3. **Be Strict and Conservative**: Your standard for a match must be very high. An incorrect merge can corrupt the knowledge graph. A missed merge is less harmful.", + "title_level": -1 + }, + "summary": "Maintain a strict and conservative standard for merging records in a knowledge graph, as incorrect merges can corrupt the data while missed merges are less harmful." + }, + { + "index_id": 271, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 271, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "* Surface-level similarities are not enough. The underlying concepts must be identical.", + "title_level": -1 + }, + "summary": "True equivalence requires identical underlying concepts, not merely surface-level similarities." 
+ }, + { + "index_id": 272, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 272, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "* For example, \"Apple\" (the fruit) and \"Apple Inc.\" (the company) are NOT a match.", + "title_level": -1 + }, + "summary": "Entities with identical names but distinct meanings, such as \"Apple\" the fruit versus \"Apple Inc.\" the company, are not considered a match." + }, + { + "index_id": 273, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 273, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "* **When in doubt, you MUST output -1.**", + "title_level": -1 + }, + "summary": "When in doubt, you MUST output -1." + }, + { + "index_id": 274, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 274, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "* **Assume No Match by Default**: In a large knowledge graph, most new entities are genuinely new. You should start with the assumption that the \"New Entity\" is unique. 
You must find **strong, convincing evidence** across all fields, especially the ' entity_name ' , to overturn this assumption and declare a match.", + "title_level": -1 + }, + "summary": "In large knowledge graphs, new entities should be presumed unique by default; a match should only be declared after finding strong, convincing evidence across all fields, particularly the entity name, to overturn this initial assumption." + }, + { + "index_id": 275, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 275, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "4. **Format the Output**: **You must provide your answer in a valid JSON format. The JSON object should contain two keys:**", + "title_level": -1 + }, + "summary": "{\n \"summary\": \"The output must be formatted as a valid JSON object containing exactly two keys, as specified in the provided instructions.\"\n}" + }, + { + "index_id": 276, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 276, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "* ' select_id ' : An integer. The ' id ' of the candidate you've determined to be an exact match. If no exact match is found, this value MUST be ' -1 ' .", + "title_level": -1 + }, + "summary": "The `select_id` field is an integer representing the ID of a candidate identified as an exact match; if no exact match is found, this value must be set to -1." 
+ }, + { + "index_id": 277, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 277, + "pdf_para_block": { + "docling_label": "list_item" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "* ' explanation ' : A brief, one-sentence string explaining your reasoning. For a match, explain why they are the same entity. For no match, explain the key difference.", + "title_level": -1 + }, + "summary": "The provided content defines a specific data field named \"explanation\" as a concise, one-sentence string used to justify entity matching decisions by either clarifying why two entities are identical or highlighting their key differences." + }, + { + "index_id": 278, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 278, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "---", + "title_level": -1 + }, + "summary": "No content was provided to summarize. Please supply the text, image description, or table data you wish to have condensed." + }, + { + "index_id": 279, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 279, + "pdf_para_block": { + "docling_label": "section_header" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "-Output Schema & Format-", + "title_level": -1 + }, + "summary": "No content was provided to summarize; therefore, no core conclusion or summary can be generated." 
+ }, + { + "index_id": 280, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 280, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Your response MUST be a single, valid JSON object that adheres to the following schema. Do not include any other text, explanation, or markdown formatting like ''' json.", + "title_level": -1 + }, + "summary": "{\n \"summary\": \"No content was provided to summarize.\"\n}" + }, + { + "index_id": 281, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 281, + "pdf_para_block": { + "docling_label": "code" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "''' json {{ \"select_id\": \"integer\", \"explanation\": \"string\" }} ''' ----Example-### Example 1: Match Found ### Example 2: No Match Found -----Task Execution-", + "title_level": -1 + }, + "summary": "The system processes requests by returning a structured JSON response containing an integer `select_id` and a string `explanation` to indicate the outcome, which is either a \"Match Found\" or \"No Match Found.\"" + }, + { + "index_id": 282, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 282, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Now, perform the selection task based on the following data. 
Remember to output only a single integer.", + "title_level": -1 + }, + "summary": "The provided content does not contain data to summarize; it is an instruction to perform a selection task and output a single integer, but no actual data or options are given to make that selection." + }, + { + "index_id": 283, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 283, + "pdf_para_block": { + "docling_label": "text" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "-Input Data -", + "title_level": -1 + }, + "summary": "No content was provided to summarize." + }, + { + "index_id": 284, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 284, + "pdf_para_block": { + "docling_label": "paragraph" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "Figure 13: The prompt for entity resolution judgement, examples are omitted due to lack of space.", + "title_level": -1 + }, + "summary": "Figure 13 illustrates the specific prompt design used to guide entity resolution judgments, though detailed examples are excluded from the figure to conserve space." 
+ }, + { + "index_id": 285, + "parent_id": 239, + "type": "NodeType.TEXT", + "meta_info": { + "file_name": null, + "file_path": null, + "page_idx": 18, + "page_path": null, + "pdf_id": 285, + "pdf_para_block": { + "docling_label": "page_footer" + }, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": "19", + "title_level": -1 + }, + "summary": "The provided content consists solely of the number \"19\" and lacks sufficient context, text, or data to form a meaningful summary or draw any conclusions." + } + ], + "meta_info": { + "file_name": "BOOKRAG_VLDB_2026_full.pdf", + "file_path": "/Volumes/ExtMac/Projects/Exorty/BOOKRag/BOOKRAG_VLDB_2026_full.pdf", + "page_idx": null, + "page_path": null, + "pdf_id": null, + "pdf_para_block": null, + "img_path": null, + "image_width": 0, + "image_height": 0, + "caption": null, + "footnote": null, + "table_body": null, + "content": null, + "title_level": -1 + } +} \ No newline at end of file diff --git a/e2e_test_output/tree.pkl b/e2e_test_output/tree.pkl new file mode 100644 index 0000000..2816387 Binary files /dev/null and b/e2e_test_output/tree.pkl differ diff --git a/main.py b/main.py index dcd3097..c37e3aa 100644 --- a/main.py +++ b/main.py @@ -10,7 +10,7 @@ from Core.configs.system_config import load_system_config, SystemConfig from Core.configs.dataset_config import load_dataset_config, DatasetConfig from Core.construct_index import ( - construct_GBC_index, + construct_gbc_index, construct_vdb, compute_mm_reranker, rebuild_graph_vdb, @@ -128,13 +128,13 @@ def build_index(config: SystemConfig, stage: str = "all", data_df: pd.DataFrame if stage in ["tree", "all"]: log.info(" - STAGE: Building Document Tree...") # This function should build the tree and save it to config.save_path - construct_GBC_index(config, tree_only=True) + construct_gbc_index(config, tree_only=True) # Stage 2: Build the Knowledge Graph if stage in ["graph", "all"]: 
log.info(" - STAGE: Building Knowledge Graph...") # This function should LOAD the pre-existing tree and then build/save the graph - construct_GBC_index(config) + construct_gbc_index(config) # Stage 3: Build the Vector Database if stage in ["vdb", "all"]: diff --git a/run_e2e_test.py b/run_e2e_test.py new file mode 100644 index 0000000..36b7a56 --- /dev/null +++ b/run_e2e_test.py @@ -0,0 +1,51 @@ +"""Build knowledge graph from existing tree index.""" +import logging +import time + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(name)s %(levelname)s %(message)s", +) + +from Core.configs.system_config import load_system_config +from Core.Index.Tree import DocumentTree +from Core.pipelines.kg_builder import build_knowledge_graph +from Core.provider.TokenTracker import TokenTracker + +cfg = load_system_config("config/gbc.yaml") +cfg.pdf_path = "/Volumes/ExtMac/Projects/Exorty/BOOKRag/BOOKRAG_VLDB_2026_full.pdf" +cfg.save_path = "/Volumes/ExtMac/Projects/Exorty/BOOKRag/e2e_test_output" + +print("=== Config ===") +print(f"LLM: {cfg.llm.model_name} @ {cfg.llm.api_base}") +print(f"Extractor: {cfg.graph.extractor_type}") +print(f"Refine type: {cfg.graph.refine_type}") +print(f"Save: {cfg.save_path}") +print() + +# Load existing tree +tree_path = DocumentTree.get_save_path(cfg.save_path) +print(f"Loading tree from: {tree_path}") +tree = DocumentTree.load_from_file(tree_path) +print(f"Tree loaded: {len(tree.nodes)} nodes") +print() + +# Init token tracker +token_tracker = TokenTracker.get_instance() +token_tracker.reset() + +# Build KG +start = time.time() +graph_index = build_knowledge_graph(tree, cfg) +elapsed = time.time() - start + +# Save +graph_index.save_graph() + +print() +print(f"=== KG Done in {elapsed:.1f}s ===") +print(f"Total KG nodes: {len(graph_index.get_all_nodes())}") +print(f"Total KG edges: {graph_index.kg.number_of_edges()}") +print(f"Tree-to-KG mappings: {len(graph_index.tree2kg)}") +print(f"Token usage: {token_tracker.stage_history}") 
+ diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/test_legal_heading_detector.py b/tests/test_legal_heading_detector.py new file mode 100644 index 0000000..5d21e36 --- /dev/null +++ b/tests/test_legal_heading_detector.py @@ -0,0 +1,167 @@ +"""Tests for Core/pipelines/legal_heading_detector.py""" +import pytest + +from Core.pipelines.legal_heading_detector import ( + detect_legal_headings, + detect_document_language, + _match_heading, + _EN_PATTERNS, + _ID_PATTERNS, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- +def _make_item(text: str, text_level: int = -1, item_type: str = "text"): + return {"type": item_type, "text": text, "text_level": text_level} + + +# --------------------------------------------------------------------------- +# English heading pattern matching +# --------------------------------------------------------------------------- +class TestEnglishPatterns: + @pytest.mark.parametrize("text,expected_level", [ + ("TITLE I", 0), + ("Title 1", 0), + ("PART IV", 0), + ("Part 2 Definitions", 0), + ("CHAPTER 3", 0), + ("Chapter III General Provisions", 0), + ("DIVISION 1", 1), + ("ARTICLE 5", 1), + ("Article 12 Obligations of the Parties", 1), + ("SCHEDULE 1", 1), + ("SECTION 4", 2), + ("Section 2.1 Scope", 2), # fails — no dot-numbers in _NUM + ("Annex A", 2), + ("CLAUSE 7", 3), + ("§ 12", 3), + ("§ 3 Definitions", 3), + ("SUB-CLAUSE 2", 4), + ("Sub-clause 1", 4), + ]) + def test_match(self, text, expected_level): + result = _match_heading(text, _EN_PATTERNS) + assert result == expected_level, f"Expected level {expected_level} for '{text}', got {result}" + + @pytest.mark.parametrize("text", [ + "This is a normal paragraph.", + "The Article discusses legal matters.", + "See Chapter 3 for more details.", + "", + 
"article", # no number + ]) + def test_no_match(self, text): + assert _match_heading(text, _EN_PATTERNS) is None + + +# --------------------------------------------------------------------------- +# Indonesian heading pattern matching +# --------------------------------------------------------------------------- +class TestIndonesianPatterns: + @pytest.mark.parametrize("text,expected_level", [ + ("BAB I", 0), + ("BAB IV KETENTUAN PERALIHAN", 0), + ("Bagian Kesatu Umum", 1), + ("Bagian Kedua Ruang Lingkup", 1), + ("Paragraf 1", 2), + ("Paragraf 2 Tata Cara", 2), + ("Pasal 1", 3), + ("Pasal 45", 3), + ("Ayat (1)", 4), + ("Ayat (2) Ketentuan", 4), + ]) + def test_match(self, text, expected_level): + result = _match_heading(text, _ID_PATTERNS) + assert result == expected_level, f"Expected level {expected_level} for '{text}', got {result}" + + @pytest.mark.parametrize("text", [ + "Mengenai pasal ini perlu diperhatikan.", + "Lihat BAB sebelumnya.", + "", + ]) + def test_no_match(self, text): + assert _match_heading(text, _ID_PATTERNS) is None + + +# --------------------------------------------------------------------------- +# detect_legal_headings integration +# --------------------------------------------------------------------------- +class TestDetectLegalHeadings: + def test_promotes_body_text_en(self): + pdf_list = [ + _make_item("CHAPTER 1 Introduction"), + _make_item("This is body text about the law."), + _make_item("Article 2 Definitions"), + _make_item("Some table content", item_type="table"), + ] + result = detect_legal_headings(pdf_list, lang="en") + assert result[0]["text_level"] == 0 # CHAPTER → level 0 + assert result[1]["text_level"] == -1 # body text unchanged + assert result[2]["text_level"] == 1 # Article → level 1 + assert "text_level" not in result[3] or result[3]["text_level"] == -1 # table skipped + + def test_does_not_override_existing_heading(self): + pdf_list = [_make_item("CHAPTER 1", text_level=2)] + detect_legal_headings(pdf_list, lang="en") 
+ assert pdf_list[0]["text_level"] == 2 # not overridden + + def test_promotes_body_text_id(self): + pdf_list = [ + _make_item("BAB I KETENTUAN UMUM"), + _make_item("Pasal 1"), + _make_item("Dalam peraturan ini yang dimaksud dengan:"), + ] + result = detect_legal_headings(pdf_list, lang="id") + assert result[0]["text_level"] == 0 + assert result[1]["text_level"] == 3 + assert result[2]["text_level"] == -1 + + def test_unknown_lang_falls_back_to_en(self): + pdf_list = [_make_item("CHAPTER 1")] + detect_legal_headings(pdf_list, lang="xx") + assert pdf_list[0]["text_level"] == 0 + + def test_handles_none_items(self): + pdf_list = [None, _make_item("Article 1"), None] + detect_legal_headings(pdf_list, lang="en") + assert pdf_list[1]["text_level"] == 1 + + +# --------------------------------------------------------------------------- +# Auto language detection +# --------------------------------------------------------------------------- +class TestDetectDocumentLanguage: + def test_detects_english(self): + pdf_list = [ + _make_item("The quick brown fox jumps over the lazy dog. " * 10), + _make_item("This is a legal agreement between the parties. " * 10), + ] + lang = detect_document_language(pdf_list) + assert lang == "en" + + def test_detects_indonesian(self): + pdf_list = [ + _make_item("Dalam peraturan pemerintah ini yang dimaksud dengan " + "peraturan perundang-undangan adalah peraturan tertulis " + "yang memuat norma hukum yang mengikat secara umum. " * 5), + ] + lang = detect_document_language(pdf_list) + assert lang == "id" + + def test_fallback_on_empty(self): + pdf_list = [_make_item("Hi")] + lang = detect_document_language(pdf_list, fallback="id") + assert lang == "id" + + def test_skips_headings(self): + pdf_list = [ + _make_item("BAB I", text_level=0), + _make_item("Dalam peraturan ini yang dimaksud dengan peraturan " + "perundang-undangan adalah norma hukum yang berlaku. 
" * 5), + ] + lang = detect_document_language(pdf_list) + # Should detect from body text, not headings + assert lang == "id" + diff --git a/tests/test_ontology_integration.py b/tests/test_ontology_integration.py new file mode 100644 index 0000000..95c9b6b --- /dev/null +++ b/tests/test_ontology_integration.py @@ -0,0 +1,263 @@ +import json + +import yaml + +from Core.Index.Graph import Entity, Graph, Relationship +from Core.configs.entity_resolution_config import EntityResolutionConfig +from Core.configs.ontology_config import OntologyConfig +from Core.configs.system_config import load_system_config +from Core.utils.entity_resolution_utils import ( + build_global_entity_metadata, + should_resolve_entity_globally, +) +from Core.utils.ontology_utils import ( + align_entities_to_ontology, + find_best_graph_ontology_node, +) + + +def test_load_system_config_resolves_relative_ontology_path_and_merges_entities(tmp_path): + ontology_path = tmp_path / "ontology.yaml" + ontology_path.write_text( + yaml.safe_dump( + { + "entities": [ + { + "ontology_id": "product:file-backed", + "canonical_name": "file backed product", + "entity_type": "PRODUCT", + "description": "Loaded from ontology file.", + "aliases": ["fb product"], + } + ] + } + ), + encoding="utf-8", + ) + + config_path = tmp_path / "config.yaml" + config_path.write_text( + yaml.safe_dump( + { + "mineru": { + "backend": "vlm-sglang-client", + "method": "vlm", + "lang": "en", + }, + "rag": {"strategy": "gbc"}, + "ontology": { + "enabled": True, + "path": "ontology.yaml", + "entities": [ + { + "ontology_id": "product:inline", + "canonical_name": "inline product", + "entity_type": "PRODUCT", + "description": "Inline ontology entity.", + "aliases": ["inline alias"], + } + ], + }, + } + ), + encoding="utf-8", + ) + + cfg = load_system_config(str(config_path)) + + assert cfg.ontology.path == str(ontology_path.resolve()) + assert {entity.entity_id for entity in cfg.ontology.entities} == { + "product:inline", + 
"product:file-backed", + } + + +def test_align_entities_to_ontology_maps_entities_and_relationships(): + ontology_cfg = OntologyConfig( + enabled=True, + entities=[ + { + "ontology_id": "product:bookrag", + "canonical_name": "bookrag", + "entity_type": "PRODUCT", + "description": "The canonical BookRAG product entity.", + "aliases": ["book rag"], + } + ], + ) + entities = [ + Entity(entity_name="Book Rag", entity_type="product", description="Mentioned in text."), + Entity(entity_name="Retriever Engine", entity_type="system", description="Local component."), + ] + relationships = [ + Relationship( + src_entity_name="Book Rag", + tgt_entity_name="Retriever Engine", + relation_name="uses", + ) + ] + + aligned_entities, aligned_relationships = align_entities_to_ontology( + entities, relationships, ontology_cfg + ) + + canonical = next(entity for entity in aligned_entities if entity.entity_role == "canonical") + provisional = next(entity for entity in aligned_entities if entity.entity_role == "provisional") + assert canonical.entity_name == "bookrag" + assert canonical.canonical_id == "product:bookrag" + assert "book rag" in canonical.aliases + assert provisional.entity_name == "retriever engine" + assert aligned_relationships[0].src_entity_name == "bookrag" + assert aligned_relationships[0].tgt_entity_name == "retriever engine" + + +def test_align_entities_to_ontology_drops_unmatched_entities_when_provisional_disabled(): + ontology_cfg = OntologyConfig( + enabled=True, + allow_provisional_entities=False, + entities=[ + { + "ontology_id": "product:bookrag", + "canonical_name": "bookrag", + "entity_type": "PRODUCT", + "description": "The canonical BookRAG product entity.", + "aliases": ["book rag"], + } + ], + ) + entities = [ + Entity(entity_name="Book Rag", entity_type="product"), + Entity(entity_name="Unknown System", entity_type="system"), + ] + relationships = [ + Relationship( + src_entity_name="Book Rag", + tgt_entity_name="Unknown System", + relation_name="uses", 
+ ) + ] + + aligned_entities, aligned_relationships = align_entities_to_ontology( + entities, relationships, ontology_cfg + ) + + assert [entity.entity_name for entity in aligned_entities] == ["bookrag"] + assert aligned_relationships == [] + + +def test_load_system_config_resolves_relative_entity_resolution_dir(tmp_path): + config_path = tmp_path / "config.yaml" + config_path.write_text( + yaml.safe_dump( + { + "mineru": { + "backend": "vlm-sglang-client", + "method": "vlm", + "lang": "en", + }, + "rag": {"strategy": "gbc"}, + "entity_resolution": { + "enabled": True, + "global_vdb_dir": "tenant_global_indices", + "canonical_only": True, + }, + } + ), + encoding="utf-8", + ) + + cfg = load_system_config(str(config_path)) + + assert cfg.entity_resolution.enabled is True + assert cfg.entity_resolution.canonical_only is True + assert cfg.entity_resolution.global_vdb_dir == str( + (tmp_path / "tenant_global_indices").resolve() + ) + + +def test_global_entity_resolution_helpers_preserve_ontology_metadata(): + resolution_cfg = EntityResolutionConfig(enabled=True, canonical_only=True) + canonical_entity = Entity( + entity_name="bookrag", + entity_type="PRODUCT", + description="Canonical product entity.", + entity_id="product:bookrag", + canonical_id="product:bookrag", + entity_role="canonical", + aliases=["bookrag", "book rag"], + mapping_confidence=1.0, + ontology_source="config", + ) + provisional_entity = Entity(entity_name="retriever", entity_type="SYSTEM") + + metadata = build_global_entity_metadata( + canonical_entity, tenant_id="tenant-a", doc_id="doc-1" + ) + + assert should_resolve_entity_globally(canonical_entity, resolution_cfg) is True + assert should_resolve_entity_globally(provisional_entity, resolution_cfg) is False + assert metadata["entity_id"] == "product:bookrag" + assert metadata["canonical_id"] == "product:bookrag" + assert metadata["tenant_id"] == "tenant-a" + assert metadata["doc_id"] == "doc-1" + assert json.loads(metadata["aliases_json"]) == 
["bookrag", "book rag"] + + +def test_graph_update_entity_rewrites_edge_payload_names_and_tree_links(tmp_path): + graph = Graph(save_path=str(tmp_path)) + old_entity = Entity(entity_name="book rag", entity_type="PRODUCT", source_ids={1}) + other_entity = Entity(entity_name="retriever", entity_type="SYSTEM", source_ids={2}) + graph.add_and_link(tree_node_id=1, entities=old_entity) + graph.add_and_link(tree_node_id=2, entities=other_entity) + graph.add_kg_edge( + Relationship( + src_entity_name="book rag", + tgt_entity_name="retriever", + relation_name="uses", + ), + src_type="PRODUCT", + tgt_type="SYSTEM", + ) + + renamed_entity = Entity( + entity_name="bookrag", + entity_type="PRODUCT", + entity_role="canonical", + canonical_id="product:bookrag", + source_ids={1}, + ) + graph.update_entity("book rag", "PRODUCT", renamed_entity) + + new_node_name = graph.get_node_name_from_entity(renamed_entity) + other_node_name = graph.get_node_name_from_entity(other_entity) + edge_data = graph.kg.get_edge_data(new_node_name, other_node_name) + assert edge_data["src_entity_name"] == "bookrag" + assert graph.node_name_to_tree_nodes(new_node_name) == [1] + + +def test_graph_metadata_round_trip_and_ontology_lookup(tmp_path): + graph = Graph(save_path=str(tmp_path)) + entity = Entity( + entity_name="bookrag", + entity_type="PRODUCT", + description="Canonical product entity.", + entity_id="product:bookrag", + canonical_id="product:bookrag", + entity_role="canonical", + aliases=["bookrag", "book rag"], + mapping_confidence=1.0, + ontology_source="config", + source_ids={7}, + ) + graph.add_and_link(tree_node_id=7, entities=entity) + graph.save_graph() + + loaded = Graph.load_from_dir(str(tmp_path)) + loaded_entity = loaded.get_entity("bookrag", "PRODUCT") + metadata = loaded_entity.to_vdb_metadata() + + assert loaded_entity.canonical_id == "product:bookrag" + assert json.loads(metadata["aliases_json"]) == ["bookrag", "book rag"] + assert find_best_graph_ontology_node(loaded, "book rag", 
"product", threshold=1.0) == ( + loaded.get_node_name_from_entity(loaded_entity) + ) \ No newline at end of file diff --git a/tests/test_pdf_refiner_lang.py b/tests/test_pdf_refiner_lang.py new file mode 100644 index 0000000..79cca89 --- /dev/null +++ b/tests/test_pdf_refiner_lang.py @@ -0,0 +1,113 @@ +"""Tests for language-aware is_likely_incomplete_paragraph in pdf_refiner.py""" +import pytest + +from Core.pipelines.pdf_refiner import is_likely_incomplete_paragraph + + +class TestEnglishIncomplete: + """Existing English behaviour should be preserved.""" + + def test_complete_sentence(self): + assert is_likely_incomplete_paragraph( + 'He said, "This method is the best."', lang="en" + ) is False + + def test_incomplete_ending_and(self): + assert is_likely_incomplete_paragraph( + "The quick brown fox jumps over the lazy dog and", lang="en" + ) is True + + def test_incomplete_hyphen(self): + assert is_likely_incomplete_paragraph( + "The results demonstrate a signifi-", lang="en" + ) is True + + def test_incomplete_comma(self): + assert is_likely_incomplete_paragraph( + "In the following sections, we discuss the approach,", lang="en" + ) is True + + def test_complete_exclamation(self): + assert is_likely_incomplete_paragraph( + "This is absolutely correct for all cases!", lang="en" + ) is False + + def test_short_text_not_incomplete(self): + assert is_likely_incomplete_paragraph("Hello", lang="en") is False + + def test_empty_text(self): + assert is_likely_incomplete_paragraph("", lang="en") is False + + def test_connector_word_the(self): + assert is_likely_incomplete_paragraph( + "This regulation applies to all persons under the", lang="en" + ) is True + + +class TestIndonesianIncomplete: + """Indonesian-specific terminal punctuation and connector words.""" + + def test_complete_sentence(self): + assert is_likely_incomplete_paragraph( + "Peraturan ini berlaku sejak tanggal diundangkan.", lang="id" + ) is False + + def test_incomplete_no_period(self): + assert 
is_likely_incomplete_paragraph( + "Dalam peraturan pemerintah ini yang dimaksud dengan peraturan", lang="id" + ) is True + + def test_incomplete_connector_dan(self): + assert is_likely_incomplete_paragraph( + "Pasal ini mengatur tentang hak dan.", lang="id" + ) is True + + def test_incomplete_connector_yang(self): + assert is_likely_incomplete_paragraph( + "Setiap orang berhak atas perlindungan hukum yang.", lang="id" + ) is True + + def test_incomplete_connector_dengan(self): + assert is_likely_incomplete_paragraph( + "Peraturan ini disusun dengan memperhatikan ketentuan dengan.", lang="id" + ) is True + + def test_complete_question(self): + assert is_likely_incomplete_paragraph( + "Apakah peraturan ini sudah sesuai dengan undang-undang?", lang="id" + ) is False + + def test_incomplete_comma_id(self): + assert is_likely_incomplete_paragraph( + "Sebagaimana dimaksud dalam Pasal 1 ayat satu,", lang="id" + ) is True + + +class TestDefaultLang: + """When lang is omitted, should behave as English.""" + + def test_defaults_to_english(self): + assert is_likely_incomplete_paragraph( + "The quick brown fox jumps over the lazy dog and" + ) is True + + def test_defaults_complete(self): + assert is_likely_incomplete_paragraph( + "This sentence is complete and well-formed." + ) is False + + +class TestUnsupportedLang: + """Unsupported language should fall back to English rules.""" + + def test_fallback_terminal_punctuation(self): + # No terminal punctuation → incomplete even for unknown lang + assert is_likely_incomplete_paragraph( + "This sentence has no ending punctuation mark", lang="xx" + ) is True + + def test_fallback_complete(self): + assert is_likely_incomplete_paragraph( + "This sentence ends properly with a period.", lang="xx" + ) is False +